Re: [PATCH] drm/amdgpu: Update the impelmentation of AMDGPU_PTE_MTYPE_GFX12

2024-05-21 Thread Felix Kuehling


On 2024-05-20 5:14, Shane Xiao wrote:
> This patch changes the implementation of AMDGPU_PTE_MTYPE_GFX12,
> clear the bits before setting the new one.
> This fixed the potential issue that GFX12 setting memory to NC.
> 
> v2: Clear mtype field before setting the new one (Alex)
> 
> Signed-off-by: longlyao 
> Signed-off-by: Shane Xiao 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  7 +--
>  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 23 +++
>  2 files changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index bc71b44387b2..99b246e82ed6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -116,8 +116,11 @@ struct amdgpu_mem_stats;
>  #define AMDGPU_PTE_PRT_FLAG(adev)\
>   ((amdgpu_ip_version((adev), GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) ? 
> AMDGPU_PTE_PRT_GFX12 : AMDGPU_PTE_PRT)
>  
> -#define AMDGPU_PTE_MTYPE_GFX12(a)((uint64_t)(a) << 54)
> -#define AMDGPU_PTE_MTYPE_GFX12_MASK  AMDGPU_PTE_MTYPE_GFX12(3ULL)
> +#define AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype)  ((uint64_t)(mytype) << 54)

You have a typo here: mytype -> mtype.

Regards,
  Felix


> +#define AMDGPU_PTE_MTYPE_GFX12_MASK  AMDGPU_PTE_MTYPE_GFX12_SHIFT(3ULL)
> +#define AMDGPU_PTE_MTYPE_GFX12(flags, mtype) \
> + ((flags) & ((~AMDGPU_PTE_MTYPE_GFX12_MASK)) |   \
> +   AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype))
>  
>  #define AMDGPU_PTE_IS_PTE(1ULL << 63)
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> index e2c6ec3cc4f3..f2d331d0181f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> @@ -461,17 +461,17 @@ static uint64_t gmc_v12_0_map_mtype(struct 
> amdgpu_device *adev, uint32_t flags)
>  {
>   switch (flags) {
>   case AMDGPU_VM_MTYPE_DEFAULT:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   case AMDGPU_VM_MTYPE_NC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   case AMDGPU_VM_MTYPE_WC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_WC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_WC);
>   case AMDGPU_VM_MTYPE_CC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_CC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_CC);
>   case AMDGPU_VM_MTYPE_UC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_UC);
>   default:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   }
>  }
>  
> @@ -509,8 +509,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>   *flags &= ~AMDGPU_PTE_EXECUTABLE;
>   *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
>  
> - *flags &= ~AMDGPU_PTE_MTYPE_GFX12_MASK;
> - *flags |= (mapping->flags & AMDGPU_PTE_MTYPE_GFX12_MASK);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, (mapping->flags &   \
> +  AMDGPU_PTE_MTYPE_GFX12_MASK) >> 
> AMDGPU_PTE_MTYPE_GFX12_SHIFT);
>  
>   if (mapping->flags & AMDGPU_PTE_PRT_GFX12) {
>   *flags |= AMDGPU_PTE_PRT_GFX12;
> @@ -524,8 +524,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>  
>   if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
>  AMDGPU_GEM_CREATE_UNCACHED))
> - *flags = (*flags & ~AMDGPU_PTE_MTYPE_GFX12_MASK) |
> -  AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC);
>  
>   bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
>   coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
> @@ -534,7 +533,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>  
>   /* WA for HW bug */
>   if (is_system || ((bo_adev != adev) && coherent))
> - *flags |= AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC);
>  
>  }
>  
> @@ -707,7 +706,7 @@ static int gmc_v12_0_gart_init(struct amdgpu_device *adev)
>   return r;
>  
>   adev->gart.table_size = adev->gart.num_gpu_pages * 8;
> - adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC) |
> + adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_UC) |
>   AMDGPU_PTE_EXECUTABLE |
>   AMDGPU_PTE_IS_PTE;
>  


Re: [PATCH] drm/kfd: Correct pined buffer handling at kfd restore and validate process

2024-05-13 Thread Felix Kuehling



On 2024-05-13 11:18, Xiaogang.Chen wrote:
> From: Xiaogang Chen 
> 
> This reverts 8a774fe912ff09e39c2d3a3589c729330113f388 "drm/amdgpu: avoid 
> restore
> process run into dead loop" since buffer got pined is not related whether it

Spelling: pined -> pinned

Same in the commit headline.


> needs mapping. And skip buffer validation at kfd driver if the buffer has been
> pinned.
> 
> Signed-off-by: Xiaogang Chen 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 3314821e4cf3..80018738bd1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -415,6 +415,10 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo 
> *bo, uint32_t domain,
>"Called with userptr BO"))
>   return -EINVAL;
>  
> + /* bo has been pined, not need validate it */

pined -> pinned

With those typos fixed, the patch is

Reviewed-by: Felix Kuehling 


> + if (bo->tbo.pin_count)
> + return 0;
> +
>   amdgpu_bo_placement_from_domain(bo, domain);
>  
>   ret = ttm_bo_validate(>tbo, >placement, );
> @@ -2736,7 +2740,7 @@ static int confirm_valid_user_pages_locked(struct 
> amdkfd_process_info *process_i
>  
>   /* keep mem without hmm range at userptr_inval_list */
>   if (!mem->range)
> -  continue;
> + continue;
>  
>   /* Only check mem with hmm range associated */
>   valid = amdgpu_ttm_tt_get_user_pages_done(
> @@ -2981,9 +2985,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
> struct dma_fence __rcu *
>   if (!attachment->is_mapped)
>   continue;
>  
> - if (attachment->bo_va->base.bo->tbo.pin_count)
> - continue;
> -
>   kfd_mem_dmaunmap_attachment(mem, attachment);
>   ret = update_gpuvm_pte(mem, attachment, _obj);
>   if (ret) {


Re: [PATCH v2] drm/amdkfd: Check correct memory types for is_system variable

2024-05-10 Thread Felix Kuehling



On 2024-05-10 10:06, Sreekant Somasekharan wrote:

To catch GPU mapping of system memory, TTM_PL_TT and AMDGPU_PL_PREEMPT
must be checked.

'Fixes: 3b01ca1b860d ("drm/amdkfd: mark GFX12 system and peer
GPU memory mappings as MTYPE_NC")'


I don't think that's a valid format for the Fixes tag. It should be a 
single line, with no single quotes. Other than that, the patch is


Reviewed-by: Felix Kuehling 



Signed-off-by: Sreekant Somasekharan 
---
  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index df0363ad1a51..6eb370609d01 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -495,7 +495,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev,
struct amdgpu_bo *bo = mapping->bo_va->base.bo;
struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
-   bool is_system = bo->tbo.resource->mem_type == TTM_PL_SYSTEM;
+   bool is_system = (bo->tbo.resource->mem_type == TTM_PL_TT) ||
+   (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT);
  
  
  	*flags &= ~AMDGPU_PTE_EXECUTABLE;


Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-10 Thread Felix Kuehling



On 2024-05-09 16:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via KFD
interface. In the current implementation there is a very small
probability of having non unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
 Changed commit header to reflect the above
v3: Use crc16 as suggested-by: Lijo Lazar 
 Ensure that gpu_id != 0

Signed-off-by: Harish Kasiviswanathan 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 40 +++
  1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 219dcf504f24..4954a3021f70 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -31,6 +31,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include "kfd_priv.h"

  #include "kfd_crat.h"
@@ -1091,14 +1092,17 @@ void kfd_topology_shutdown(void)
  
  static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)

  {
-   uint32_t hashout;
+   uint32_t gpu_id;
uint32_t buf[8];
uint64_t local_mem_size;
-   int i;
+   struct kfd_topology_device *dev;
+   bool is_unique;
+   uint8_t *crc_buf;
  
  	if (!gpu)

return 0;
  
+	crc_buf = (uint8_t*)

local_mem_size = gpu->local_mem_info.local_mem_size_private +
gpu->local_mem_info.local_mem_size_public;
buf[0] = gpu->adev->pdev->devfn;
@@ -,10 +1115,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
*gpu)
buf[6] = upper_32_bits(local_mem_size);
buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16);
  
-	for (i = 0, hashout = 0; i < 8; i++)

-   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
+   gpu_id = crc16(0, crc_buf, sizeof(buf)) &
+((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
  
-	return hashout;

+   /* There is a very small possibility when generating a
+* 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer
+* that the value could be 0 or non-unique. So, check if
+* it is unique and non-zero. If not unique increment till
+* unique one is found. In case of overflow, restart from 1
+*/
+
+   down_read(_lock);
+   do {
+   is_unique = true;
+   if (!gpu_id)
+   gpu_id = 1;
+   list_for_each_entry(dev, _device_list, list) {
+   if (dev->gpu && dev->gpu_id == gpu_id) {
+   is_unique = false;
+   break;
+   }
+   }
+   if (unlikely(!is_unique))
+   gpu_id = (gpu_id + 1) &
+ ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+   } while (!is_unique);
+   up_read(_lock);
+
+   return gpu_id;
  }
  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
   *the GPU device is not already present in the topology device
@@ -1945,7 +1973,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
struct amdgpu_gfx_config *gfx_info = >adev->gfx.config;
struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info;
  
-	gpu_id = kfd_generate_gpu_id(gpu);

if (gpu->xcp && !gpu->xcp->ddev) {
dev_warn(gpu->adev->dev,
 "Won't add GPU to topology since it has no drm node 
assigned.");
@@ -1968,6 +1995,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (res)
return res;
  
+	gpu_id = kfd_generate_gpu_id(gpu);

dev->gpu_id = gpu_id;
gpu->id = gpu_id;
  


Re: [PATCH 11/11] drm/tegra: Use fbdev client helpers

2024-05-07 Thread Felix Kuehling



On 2024-05-07 07:58, Thomas Zimmermann wrote:

Implement struct drm_client_funcs with the respective helpers and
remove the custom code from the emulation. The generic helpers are
equivalent in functionality.

Signed-off-by: Thomas Zimmermann 
---
  drivers/gpu/drm/radeon/radeon_fbdev.c | 66 ++-


Was radeon meant to be a separate patch?

Regards,
  Felix



  drivers/gpu/drm/tegra/fbdev.c | 58 ++-
  2 files changed, 6 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c 
b/drivers/gpu/drm/radeon/radeon_fbdev.c
index 02bf25759059a..cf790922174ea 100644
--- a/drivers/gpu/drm/radeon/radeon_fbdev.c
+++ b/drivers/gpu/drm/radeon/radeon_fbdev.c
@@ -29,7 +29,6 @@
  #include 
  #include 
  
-#include 

  #include 
  #include 
  #include 
@@ -293,71 +292,12 @@ static const struct drm_fb_helper_funcs 
radeon_fbdev_fb_helper_funcs = {
  };
  
  /*

- * Fbdev client and struct drm_client_funcs
+ * struct drm_client_funcs
   */
  
-static void radeon_fbdev_client_unregister(struct drm_client_dev *client)

-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = fb_helper->dev;
-   struct radeon_device *rdev = dev->dev_private;
-
-   if (fb_helper->info) {
-   vga_switcheroo_client_fb_set(rdev->pdev, NULL);
-   drm_helper_force_disable_all(dev);
-   drm_fb_helper_unregister_info(fb_helper);
-   } else {
-   drm_client_release(_helper->client);
-   drm_fb_helper_unprepare(fb_helper);
-   kfree(fb_helper);
-   }
-}
-
-static int radeon_fbdev_client_restore(struct drm_client_dev *client)
-{
-   drm_fb_helper_lastclose(client->dev);
-   vga_switcheroo_process_delayed_switch();
-
-   return 0;
-}
-
-static int radeon_fbdev_client_hotplug(struct drm_client_dev *client)
-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = client->dev;
-   struct radeon_device *rdev = dev->dev_private;
-   int ret;
-
-   if (dev->fb_helper)
-   return drm_fb_helper_hotplug_event(dev->fb_helper);
-
-   ret = drm_fb_helper_init(dev, fb_helper);
-   if (ret)
-   goto err_drm_err;
-
-   if (!drm_drv_uses_atomic_modeset(dev))
-   drm_helper_disable_unused_functions(dev);
-
-   ret = drm_fb_helper_initial_config(fb_helper);
-   if (ret)
-   goto err_drm_fb_helper_fini;
-
-   vga_switcheroo_client_fb_set(rdev->pdev, fb_helper->info);
-
-   return 0;
-
-err_drm_fb_helper_fini:
-   drm_fb_helper_fini(fb_helper);
-err_drm_err:
-   drm_err(dev, "Failed to setup radeon fbdev emulation (ret=%d)\n", ret);
-   return ret;
-}
-
  static const struct drm_client_funcs radeon_fbdev_client_funcs = {
-   .owner  = THIS_MODULE,
-   .unregister = radeon_fbdev_client_unregister,
-   .restore= radeon_fbdev_client_restore,
-   .hotplug= radeon_fbdev_client_hotplug,
+   .owner = THIS_MODULE,
+   DRM_FBDEV_HELPER_CLIENT_FUNCS,
  };
  
  void radeon_fbdev_setup(struct radeon_device *rdev)

diff --git a/drivers/gpu/drm/tegra/fbdev.c b/drivers/gpu/drm/tegra/fbdev.c
index db6eaac3d30e6..f9cc365cfed94 100644
--- a/drivers/gpu/drm/tegra/fbdev.c
+++ b/drivers/gpu/drm/tegra/fbdev.c
@@ -12,7 +12,6 @@
  #include 
  
  #include 

-#include 
  #include 
  #include 
  #include 
@@ -150,63 +149,12 @@ static const struct drm_fb_helper_funcs 
tegra_fb_helper_funcs = {
  };
  
  /*

- * struct drm_client
+ * struct drm_client_funcs
   */
  
-static void tegra_fbdev_client_unregister(struct drm_client_dev *client)

-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-
-   if (fb_helper->info) {
-   drm_fb_helper_unregister_info(fb_helper);
-   } else {
-   drm_client_release(_helper->client);
-   drm_fb_helper_unprepare(fb_helper);
-   kfree(fb_helper);
-   }
-}
-
-static int tegra_fbdev_client_restore(struct drm_client_dev *client)
-{
-   drm_fb_helper_lastclose(client->dev);
-
-   return 0;
-}
-
-static int tegra_fbdev_client_hotplug(struct drm_client_dev *client)
-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = client->dev;
-   int ret;
-
-   if (dev->fb_helper)
-   return drm_fb_helper_hotplug_event(dev->fb_helper);
-
-   ret = drm_fb_helper_init(dev, fb_helper);
-   if (ret)
-   goto err_drm_err;
-
-   if (!drm_drv_uses_atomic_modeset(dev))
-   drm_helper_disable_unused_functions(dev);
-
-   ret = drm_fb_helper_initial_config(fb_helper);
-   if (ret)
-   goto err_drm_fb_helper_fini;
-
-   return 0;
-
-err_drm_fb_helper_fini:
-   drm_fb_helper_fini(fb_helper);
-err_drm_err:
-   drm_err(dev, 

Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-06 Thread Felix Kuehling

On 2024-05-06 17:10, Harish Kasiviswanathan wrote:

On 2024-05-06 16:30, Felix Kuehling wrote:

On 2024-05-03 18:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via KFD
interface. In the current implementation there is a very small
probability of having non unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
  Changed commit header to reflect the above

Signed-off-by: Harish Kasiviswanathan 
---
   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++-
   1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index b93913934b03..01d4c2e10c6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
   uint32_t hashout;
   uint32_t buf[8];
   uint64_t local_mem_size;
+    struct kfd_topology_device *dev;
+    bool is_unique;
   int i;
     if (!gpu)
@@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
   for (i = 0, hashout = 0; i < 8; i++)
   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
   +    /* hash generated could be non-unique. Check if it is unique.
+ * If not unique increment till unique one is found. In case
+ * of overflow, restart from 1
+    */
+    down_read(_lock);
+    do {
+    is_unique = true;
+    list_for_each_entry(dev, _device_list, list) {
+    if (dev->gpu && dev->gpu_id == hashout) {
+    is_unique = false;
+    break;
+    }
+    }
+    if (unlikely(!is_unique)) {
+    hashout = (hashout + 1) &
+  ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+    if (!hashout)
+    hashout = 1;

This doesn't catch the case where hashout was 0 before being incremented and 
the incremented value was then found to be unique.

I didn't actively think about this case when I sent the patch out. However, we 
don't have gpu_id to be 0. There are places where gpu_id=0 means it is CPU node


I think we make that assumption in a few places, both in kernel mode and 
user mode, e.g.:


struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, 
uint32_t gpu_id)
{
int i;

if (gpu_id) {
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];

if (pdd->user_gpu_id == gpu_id)
return pdd;
}
}
return NULL;
}

Or in the Thunk in hsaKmtGetNodeProperties:

/* For CPU only node don't add any additional GPU memory banks. */
if (gpu_id) {
uint64_t base, limit;
if (is_dgpu)
NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
else
NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, ,
) == HSAKMT_STATUS_SUCCESS)
NodeProperties->NumMemoryBanks += 1;
}

Regards,
  Felix





Regards,
   Felix



+    }
+    } while (!is_unique);
+    up_read(_lock);
+
   return hashout;
   }
   /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
@@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
   struct amdgpu_gfx_config *gfx_info = >adev->gfx.config;
   struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info;
   -    gpu_id = kfd_generate_gpu_id(gpu);
   if (gpu->xcp && !gpu->xcp->ddev) {
   dev_warn(gpu->adev->dev,
    "Won't add GPU to topology since it has no drm node assigned.");
@@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
   if (res)
   return res;
   +    gpu_id = kfd_generate_gpu_id(gpu);
   dev->gpu_id = gpu_id;
   gpu->id = gpu_id;
   


Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-06 Thread Felix Kuehling



On 2024-05-03 18:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via KFD
interface. In the current implementation there is a very small
probability of having non unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
 Changed commit header to reflect the above

Signed-off-by: Harish Kasiviswanathan 
---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++-
  1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index b93913934b03..01d4c2e10c6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
uint32_t hashout;
uint32_t buf[8];
uint64_t local_mem_size;
+   struct kfd_topology_device *dev;
+   bool is_unique;
int i;
  
  	if (!gpu)

@@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
for (i = 0, hashout = 0; i < 8; i++)
hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
  
+	/* hash generated could be non-unique. Check if it is unique.

+* If not unique increment till unique one is found. In case
+* of overflow, restart from 1
+   */
+   down_read(_lock);
+   do {
+   is_unique = true;
+   list_for_each_entry(dev, _device_list, list) {
+   if (dev->gpu && dev->gpu_id == hashout) {
+   is_unique = false;
+   break;
+   }
+   }
+   if (unlikely(!is_unique)) {
+   hashout = (hashout + 1) &
+ ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+   if (!hashout)
+   hashout = 1;


This doesn't catch the case where hashout was 0 before being incremented and 
the incremented value was then found to be unique.


Regards,
  Felix



+   }
+   } while (!is_unique);
+   up_read(_lock);
+
return hashout;
  }
  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
@@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
struct amdgpu_gfx_config *gfx_info = >adev->gfx.config;
struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info;
  
-	gpu_id = kfd_generate_gpu_id(gpu);

if (gpu->xcp && !gpu->xcp->ddev) {
dev_warn(gpu->adev->dev,
 "Won't add GPU to topology since it has no drm node 
assigned.");
@@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (res)
return res;
  
+	gpu_id = kfd_generate_gpu_id(gpu);

dev->gpu_id = gpu_id;
gpu->id = gpu_id;
  


Re: [PATCH] drm/amdkfd: Refactor kfd CRIU into its own file

2024-05-06 Thread Felix Kuehling



On 2024-05-06 15:20, David Francis wrote:

The kfd CRIU code takes up about a thousand lines
in the kfd_chardev file; move it to its own file.

No functional change intended.

Signed-off-by: David Francis 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +-
  drivers/gpu/drm/amd/amdkfd/kfd_criu.c| 989 +++
  drivers/gpu/drm/amd/amdkfd/kfd_criu.h|  50 ++
  4 files changed, 1046 insertions(+), 966 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 0d3d8972240d..e06af4073ac5 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -32,6 +32,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_flat_memory.o \
$(AMDKFD_PATH)/kfd_process.o \
$(AMDKFD_PATH)/kfd_queue.o \
+   $(AMDKFD_PATH)/kfd_criu.o \


Any particular reason for adding this in the middle and not the end?



$(AMDKFD_PATH)/kfd_mqd_manager.o \
$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6b713fb0b818..e6e44a199a93 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -45,6 +45,7 @@


Can you remove #include  and "amdgpu_dma_buf.h" here? 
Or is it still needed by something else left in kfd_chardev.c?


Other than that, this patch is

Reviewed-by: Felix Kuehling 



  #include "kfd_smi_events.h"
  #include "amdgpu_dma_buf.h"
  #include "kfd_debug.h"
+#include "kfd_criu.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct 
kfd_process *p, void *data)
  }
  #endif
  
-static int criu_checkpoint_process(struct kfd_process *p,

-uint8_t __user *user_priv_data,
-uint64_t *priv_offset)
-{
-   struct kfd_criu_process_priv_data process_priv;
-   int ret;
-
-   memset(_priv, 0, sizeof(process_priv));
-
-   process_priv.version = KFD_CRIU_PRIV_VERSION;
-   /* For CR, we don't consider negative xnack mode which is used for
-* querying without changing it, here 0 simply means disabled and 1
-* means enabled so retry for finding a valid PTE.
-*/
-   process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
-
-   ret = copy_to_user(user_priv_data + *priv_offset,
-   _priv, sizeof(process_priv));
-
-   if (ret) {
-   pr_err("Failed to copy process information to user\n");
-   ret = -EFAULT;
-   }
-
-   *priv_offset += sizeof(process_priv);
-   return ret;
-}
-
-static int criu_checkpoint_devices(struct kfd_process *p,
-uint32_t num_devices,
-uint8_t __user *user_addr,
-uint8_t __user *user_priv_data,
-uint64_t *priv_offset)
-{
-   struct kfd_criu_device_priv_data *device_priv = NULL;
-   struct kfd_criu_device_bucket *device_buckets = NULL;
-   int ret = 0, i;
-
-   device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), 
GFP_KERNEL);
-   if (!device_buckets) {
-   ret = -ENOMEM;
-   goto exit;
-   }
-
-   device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
-   if (!device_priv) {
-   ret = -ENOMEM;
-   goto exit;
-   }
-
-   for (i = 0; i < num_devices; i++) {
-   struct kfd_process_device *pdd = p->pdds[i];
-
-   device_buckets[i].user_gpu_id = pdd->user_gpu_id;
-   device_buckets[i].actual_gpu_id = pdd->dev->id;
-
-   /*
-* priv_data does not contain useful information for now and is 
reserved for
-* future use, so we do not set its contents.
-*/
-   }
-
-   ret = copy_to_user(user_addr, device_buckets, num_devices * 
sizeof(*device_buckets));
-   if (ret) {
-   pr_err("Failed to copy device information to user\n");
-   ret = -EFAULT;
-   goto exit;
-   }
-
-   ret = copy_to_user(user_priv_data + *priv_offset,
-  device_priv,
-  num_devices * sizeof(*device_priv));
-   if (ret) {
-   pr_err("Failed to copy device information to user\n");
-   ret = -EFAULT;
-   }

Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-06 Thread Felix Kuehling



On 2024-05-01 18:56, Philip Yang wrote:

On system with khugepaged enabled and user cases with THP buffer, the
hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary
timeout value is not accurate, cause memory allocation failure.

Remove the arbitrary timeout value, return EAGAIN to application if
hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call
ioctl again.

Change EAGAIN to debug message as this is not error.

Signed-off-by: Philip Yang 


Assuming this passes your stress testing without CPU stall warnings, 
this patch is


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c  | 12 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  5 +
  3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54198c3928c7..02696c2102f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t 
user_addr,
  
  	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, );

if (ret) {
-   pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+   if (ret == -EAGAIN)
+   pr_debug("Failed to get user pages, try again\n");
+   else
+   pr_err("%s: Failed to get user pages: %d\n", __func__, 
ret);
goto unregister_out;
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c

index 431ec72655ec..e36fede7f74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
  
-		/* Assuming 64MB takes maximum 1 second to fault page address */

-   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
-   timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
-   timeout = jiffies + msecs_to_jiffies(timeout);
+   timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
  
  retry:

hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
-   schedule();
-   /*
-* FIXME: This timeout should encompass the retry from
-* mmu_interval_read_retry() as well.
-*/
if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
  out_free_range:
kfree(hmm_range);
  
+	if (r == -EBUSY)

+   r = -EAGAIN;
return r;
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 94f83be2232d..e7040f809f33 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
   readonly, owner, NULL,
   _range);
WRITE_ONCE(p->svms.faulting_task, NULL);
-   if (r) {
+   if (r)
pr_debug("failed %d to get svm range pages\n", 
r);
-   if (r == -EBUSY)
-   r = -EAGAIN;
-   }
} else {
r = -EFAULT;
}


Re: Proposal to add CRIU support to DRM render nodes

2024-05-03 Thread Felix Kuehling



On 2024-04-16 10:04, Tvrtko Ursulin wrote:
> 
> On 01/04/2024 18:58, Felix Kuehling wrote:
>>
>> On 2024-04-01 12:56, Tvrtko Ursulin wrote:
>>>
>>> On 01/04/2024 17:37, Felix Kuehling wrote:
>>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 28/03/2024 20:42, Felix Kuehling wrote:
>>>>>>
>>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> Hi Felix,
>>>>>>>
>>>>>>> I had one more thought while browsing around the amdgpu CRIU plugin. It 
>>>>>>> appears it relies on the KFD support being compiled in and /dev/kfd 
>>>>>>> present, correct? AFAICT at least, it relies on that to figure out the 
>>>>>>> amdgpu DRM node.
>>>>>>>
>>>>>>> In would be probably good to consider designing things without that 
>>>>>>> dependency. So that checkpointing an application which does not use 
>>>>>>> /dev/kfd is possible. Or if the kernel does not even have the KFD 
>>>>>>> support compiled in.
>>>>>>
>>>>>> Yeah, if we want to support graphics apps that don't use KFD, we should 
>>>>>> definitely do that. Currently we get a lot of topology information from 
>>>>>> KFD, not even from the /dev/kfd device but from the sysfs nodes exposed 
>>>>>> by KFD. We'd need to get GPU device info from the render nodes instead. 
>>>>>> And if KFD is available, we may need to integrate both sources of 
>>>>>> information.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> It could perhaps mean no more than adding some GPU discovery code into 
>>>>>>> CRIU. Which shuold be flexible enough to account for things like 
>>>>>>> re-assigned minor numbers due driver reload.
>>>>>>
>>>>>> Do you mean adding GPU discovery to the core CRIU, or to the plugin. I 
>>>>>> was thinking this is still part of the plugin.
>>>>>
>>>>> Yes I agree. I was only thinking about adding some DRM device discovery 
>>>>> code in a more decoupled fashion from the current plugin, for both the 
>>>>> reason discussed above (decoupling a bit from reliance on kfd sysfs), and 
>>>>> then also if/when a new DRM driver might want to implement this the code 
>>>>> could be move to some common plugin area.
>>>>>
>>>>> I am not sure how feasible that would be though. The "gpu id" concept and 
>>>>> it's matching in the current kernel code and CRIU plugin - is that value 
>>>>> tied to the physical GPU instance or how it works?
>>>>
>>>> The concept of the GPU ID is that it's stable while the system is up, even 
>>>> when devices get added and removed dynamically. It was baked into the API 
>>>> early on, but I don't think we ever fully validated device hot plug. I 
>>>> think the closest we're getting is with our latest MI GPUs and dynamic 
>>>> partition mode change.
>>>
>>> Doesn't it read the saved gpu id from the image file while doing restore 
>>> and tries to open the render node to match it? Maybe I am misreading the 
>>> code.. But if it does, does it imply that in practice it could be stable 
>>> across reboots? Or that it is not possible to restore to a different 
>>> instance of maybe the same GPU model installed in a system?
>>
>> Ah, the idea is, that when you restore on a different system, you may get 
>> different GPU IDs. Or you may checkpoint an app running on GPU 1 but restore 
>> it on GPU 2 on the same system. That's why we need to translate GPU IDs in 
>> restored applications. User mode still uses the old GPU IDs, but the kernel 
>> mode driver translates them to the actual GPU IDs of the GPUs that the 
>> process was restored on.
> 
> I see.. I think. Normal flow is ppd->user_gpu_id set during client init, but 
> for restored clients it gets overriden during restore so that any further 
> ioctls can actually not instantly fail.
> 
> And then in amdgpu_plugin_restore_file, when it is opening the render node, 
> it relies on the kfd topology to have filled in (more or less) the 
> target_gpu_id corresponding to the render node gpu id of the target GPU - the 
> one associated with the new kfd gpu_id?

Yes.

> 
> I am digging into this be

Re: [PATCH v3 2/3] drm/amdgpu: Reduce mem_type to domain double indirection

2024-05-02 Thread Felix Kuehling



On 2024-04-30 13:16, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

All apart from AMDGPU_GEM_DOMAIN_GTT memory domains map 1:1 to TTM
placements. And the former be either AMDGPU_PL_PREEMPT or TTM_PL_TT,
depending on AMDGPU_GEM_CREATE_PREEMPTIBLE.

Simplify a few places in the code which convert the TTM placement into
a domain by checking against the current placement directly.

In the conversion AMDGPU_PL_PREEMPT either does not have to be handled
because amdgpu_mem_type_to_domain() cannot return that value anyway.

v2:
  * Remove AMDGPU_PL_PREEMPT handling.

v3:
  * Rebase.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Christian König  # v1
Reviewed-by: Felix Kuehling  # v2


I'm waiting for Christian to review patches 1 and 3. Then I can apply 
the whole series.


Regards,
  Felix



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  3 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 29 +
  2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..0b3b10d21952 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
if (r)
return ERR_PTR(r);
  
-	} else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) &

-AMDGPU_GEM_DOMAIN_GTT)) {
+   } else if (bo->tbo.resource->mem_type != TTM_PL_TT) {
return ERR_PTR(-EBUSY);
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index b2a83c802bbd..c581e4952cbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -983,12 +983,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
  
  	ttm_bo_pin(>tbo);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
+   if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
atomic64_add(amdgpu_bo_size(bo), >vram_pin_size);
atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
 >visible_pin_size);
-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);
}
  
@@ -1289,7 +1288,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

struct ttm_resource *res = bo->tbo.resource;
uint64_t size = amdgpu_bo_size(bo);
struct drm_gem_object *obj;
-   unsigned int domain;
bool shared;
  
  	/* Abort if the BO doesn't currently have a backing store */

@@ -1299,21 +1297,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
obj = >tbo.base;
shared = drm_gem_object_is_shared_for_memory_stats(obj);
  
-	domain = amdgpu_mem_type_to_domain(res->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (res->mem_type) {
+   case TTM_PL_VRAM:
stats->vram += size;
-   if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+   if (amdgpu_res_cpu_visible(adev, res))
stats->visible_vram += size;
if (shared)
stats->vram_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_GTT:
+   case TTM_PL_TT:
stats->gtt += size;
if (shared)
stats->gtt_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_CPU:
+   case TTM_PL_SYSTEM:
default:
stats->cpu += size;
if (shared)
@@ -1326,7 +1323,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->requested_visible_vram += size;
  
-		if (domain != AMDGPU_GEM_DOMAIN_VRAM) {

+   if (res->mem_type != TTM_PL_VRAM) {
stats->evicted_vram += size;
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->evicted_visible_vram += size;
@@ -1600,20 +1597,18 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, 
struct seq_file *m)
u64 size;
  
  	if (dma_resv_trylock(bo->tbo.base.resv)) {

-   unsigned int domain;
  
-		domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (bo->tbo.resource->mem_type) {
+   case TTM_PL_VRAM:
if (amdgpu_res_cpu_v

Re: [PATCH 1/2] drm/amdkfd: Use dev_error intead of pr_error

2024-05-01 Thread Felix Kuehling


On 2024-05-01 21:08, Harish Kasiviswanathan wrote:
> No functional change. This will help in moving gpu_id creation to next
> step while still being able to identify the correct GPU
> 
> Signed-off-by: Harish Kasiviswanathan 

Reviewed-by: Felix Kuehling 

> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index ba326b43bec5..b93913934b03 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1773,7 +1773,7 @@ static void kfd_fill_cache_non_crat_info(struct 
> kfd_topology_device *dev, struct
>   pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
>  }
>  
> -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t 
> gpu_id,
> +static int kfd_topology_add_device_locked(struct kfd_node *gpu,
> struct kfd_topology_device **dev)
>  {
>   int proximity_domain = ++topology_crat_proximity_domain;
> @@ -1786,8 +1786,7 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>   COMPUTE_UNIT_GPU, gpu,
>   proximity_domain);
>   if (res) {
> - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
> -gpu_id);
> + dev_err(gpu->adev->dev, "Error creating VCRAT\n");
>   topology_crat_proximity_domain--;
>   goto err;
>   }
> @@ -1798,8 +1797,7 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>  _topology_device_list,
>  proximity_domain);
>   if (res) {
> - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
> -gpu_id);
> + dev_err(gpu->adev->dev, "Error parsing VCRAT\n");
>   topology_crat_proximity_domain--;
>   goto err;
>   }
> @@ -1825,8 +1823,8 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>   if (!res)
>   sys_props.generation_count++;
>   else
> - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
> res=%d\n",
> -gpu_id, res);
> + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs 
> topology. res=%d\n",
> + res);
>  
>  err:
>   kfd_destroy_crat_image(crat_image);
> @@ -1951,11 +1949,10 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   gpu_id = kfd_generate_gpu_id(gpu);
>   if (gpu->xcp && !gpu->xcp->ddev) {
>   dev_warn(gpu->adev->dev,
> - "Won't add GPU (ID: 0x%x) to topology since it has no drm node 
> assigned.",
> - gpu_id);
> +  "Won't add GPU to topology since it has no drm node 
> assigned.");
>   return 0;
>   } else {
> - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
> + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n");
>   }
>  
>   /* Check to see if this gpu device exists in the topology_device_list.
> @@ -1967,7 +1964,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   down_write(_lock);
>   dev = kfd_assign_gpu(gpu);
>   if (!dev)
> - res = kfd_topology_add_device_locked(gpu, gpu_id, );
> + res = kfd_topology_add_device_locked(gpu, );
>   up_write(_lock);
>   if (res)
>   return res;


Re: [PATCH 2/2] drm/amdkfd: Improve chances of unique gpu_id

2024-05-01 Thread Felix Kuehling



On 2024-05-01 21:08, Harish Kasiviswanathan wrote:
> gpu_id needs to be unique for user space to identify GPUs via KFD
> interface. Do a single pass search to detect collision. If
> detected, increment gpu_id by one.
> 
> Probability of collisions is very rare. Hence, no more complexity is
> added to ensure uniqueness.
> 
> Signed-off-by: Harish Kasiviswanathan 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index b93913934b03..f2d1e82e7bed 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
> *gpu)
>   uint32_t hashout;
>   uint32_t buf[8];
>   uint64_t local_mem_size;
> + struct kfd_topology_device *dev;
> + bool is_unique = true;
>   int i;
>  
>   if (!gpu)
> @@ -1115,7 +1117,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
> *gpu)
>   for (i = 0, hashout = 0; i < 8; i++)
>   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
>  
> - return hashout;
> + down_read(_lock);
> + list_for_each_entry(dev, _device_list, list) {
> + if (dev->gpu && dev->gpu_id == hashout)
> + is_unique = false;

You can break early here.

> + }
> + up_read(_lock);
> + return is_unique ? hashout : ++hashout;

We should make sure that hashout stays within the KFD_GPU_ID_HASH_WIDTH. And if 
we're already adding a collision check, we may as well make it air-tight. It 
should be easy enough by wrapping it in a do-while loop. While we're at it, can 
we also check that the hash is not 0, because that value is used for non-GPU 
nodes? I think this would satisfy all my requests:

do {
if (!hashout)
hashout++;
is_unique = true;
list_for_each_entry(dev, _device_list, list) {
if (dev->gpu && dev->gpu_id == hashout) {
is_unique = false;
hashout = (hashout + 1) &
  ((1U << KFD_GPU_ID_HASH_WIDTH) - 1);
break;
}
}
} while (!is_unique);

Regards,
  Felix


>  }
>  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
>   *   the GPU device is not already present in the topology device
> @@ -1946,7 +1954,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   struct amdgpu_gfx_config *gfx_info = >adev->gfx.config;
>   struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info;
>  
> - gpu_id = kfd_generate_gpu_id(gpu);
>   if (gpu->xcp && !gpu->xcp->ddev) {
>   dev_warn(gpu->adev->dev,
>"Won't add GPU to topology since it has no drm node 
> assigned.");
> @@ -1969,6 +1976,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   if (res)
>   return res;
>  
> + gpu_id = kfd_generate_gpu_id(gpu);
>   dev->gpu_id = gpu_id;
>   gpu->id = gpu_id;
>  


Re: [PATCH v2] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling



On 2024-05-01 16:38, Ramesh Errabolu wrote:

Analysis of code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. This leak occurs when one of
the prior steps before it is released encounters an error.

Signed-off-by: Ramesh Errabolu 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 386875e6eb96..481cb958e165 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
-   struct hmm_range *hmm_range;
+   struct hmm_range *hmm_range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1696,7 +1696,12 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
}
  
  		svm_range_lock(prange);

-   if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+   /* Free backing memory of hmm_range if it was initialized
+* Overrride return value to TRY AGAIN only if prior returns
+* were successful
+*/
+   if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && 
!r) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;
}


Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling



On 2024-05-01 14:34, Felix Kuehling wrote:



On 2024-04-30 19:29, Ramesh Errabolu wrote:

Analysis of code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. This leak occurs when one of
the prior steps before it is released encounters an error.

Signed-off-by: Ramesh Errabolu 
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 386875e6eb96..dcb1d5d3f860 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct 
mm_struct *mm,

  start = map_start << PAGE_SHIFT;
  end = (map_last + 1) << PAGE_SHIFT;
  for (addr = start; !r && addr < end; ) {
-    struct hmm_range *hmm_range;
+    struct hmm_range *hmm_range = NULL;
  unsigned long map_start_vma;
  unsigned long map_last_vma;
  struct vm_area_struct *vma;
@@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct 
mm_struct *mm,

  }
  svm_range_lock(prange);
-    if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+    // Free backing memory of hmm_range if it was initialized
+    if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
  pr_debug("hmm update the range, need validate again\n");
  r = -EAGAIN;


Nack! This can now override other error codes that aren't meant to be 
overridden with -EAGAIN.


I think a better solution would be to just reverse this condition to 
ensure that amdgpu_hmm_range_get_pages_done is always called:


     if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {


Correction: You still need the NULL check:

if (hmm_range &&
amdgpu_hmm_range_get_pages_done(hmm_range) &&
!r) {
...
}

Regards,
  Felix


     ...
     r = -EAGAIN;
     }

Regards,
   Felix


  }


Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling




On 2024-04-30 19:29, Ramesh Errabolu wrote:

Analysis of code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. This leak occurs when one of
the prior steps before it is released encounters an error.

Signed-off-by: Ramesh Errabolu 
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 386875e6eb96..dcb1d5d3f860 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
-   struct hmm_range *hmm_range;
+   struct hmm_range *hmm_range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
}
  
  		svm_range_lock(prange);

-   if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+   // Free backing memory of hmm_range if it was initialized
+   if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;


Nack! This can now override other error codes that aren't meant to be 
overridden with -EAGAIN.


I think a better solution would be to just reverse this condition to 
ensure that amdgpu_hmm_range_get_pages_done is always called:


if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {
...
r = -EAGAIN;
}

Regards,
  Felix


}


Re: [PATCH v2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-04-30 Thread Felix Kuehling



On 2024-04-30 6:08, Lang Yu wrote:

Small APUs(i.e., consumer, embedded products) usually have a small
carveout device memory which can't satisfy most compute workloads
memory allocation requirements.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist.
Error Log when running mnist:
"torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate
84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes
is free. Of the allocated memory 103.83 MiB is allocated by PyTorch,
and 22.17 MiB is reserved by PyTorch but unallocated"

Though we can change BIOS settings to enlarge carveout size,
which is inflexible and may bring complaint. On the other hand,
the memory resource can't be effectively used between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
Then device and host can effectively share system memory.

v2: Report local_mem_size_private as 0. (Felix)

Signed-off-by: Lang Yu 


Reviewed-by: Felix Kuehling 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  5 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
  5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7ba05f030dd1..e3738d417245 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
else
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
+   } else if (adev->flags & AMD_IS_APU) {
+   mem_info->local_mem_size_public = (ttm_tt_pages_limit() << 
PAGE_SHIFT);
+   mem_info->local_mem_size_private = 0;
} else {
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 4bdf59213384..5843c3d35cb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
  
  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);

-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
  
  		if (adev) {

adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+   if (

Re: [PATCH 2/3] drm/amdgpu: Reduce mem_type to domain double indirection

2024-04-29 Thread Felix Kuehling



On 2024-04-29 12:47, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

All apart from AMDGPU_GEM_DOMAIN_GTT memory domains map 1:1 to TTM
placements. And the former be either AMDGPU_PL_PREEMPT or TTM_PL_TT,
depending on AMDGPU_GEM_CREATE_PREEMPTIBLE.

Simplify a few places in the code which convert the TTM placement into
a domain by checking against the current placement directly.

In the conversion AMDGPU_PL_PREEMPT either does not have to be handled
because amdgpu_mem_type_to_domain() cannot return that value anyway.

v2:
  * Remove AMDGPU_PL_PREEMPT handling.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Christian König  # v1

Reviewed-by: Felix Kuehling 

I also ran kfdtest on a multi-GPU system just to make sure this didn't 
break our multi-GPU support. BTW, I had to fix up some things when I 
tried to apply your patch to the current amd-staging-drm-next branch. 
That branch was just rebased on Linux 6.8, so maybe that's part of the 
reason.




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  3 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 27 +
  2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..0b3b10d21952 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
if (r)
return ERR_PTR(r);
  
-	} else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) &

-AMDGPU_GEM_DOMAIN_GTT)) {
+   } else if (bo->tbo.resource->mem_type != TTM_PL_TT) {
return ERR_PTR(-EBUSY);
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..eb5bd6962560 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -976,12 +976,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
  
  	ttm_bo_pin(>tbo);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
+   if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
atomic64_add(amdgpu_bo_size(bo), >vram_pin_size);
atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
 >visible_pin_size);
-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);
}
  
@@ -1280,7 +1279,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

  {
uint64_t size = amdgpu_bo_size(bo);
struct drm_gem_object *obj;
-   unsigned int domain;
bool shared;
  
  	/* Abort if the BO doesn't currently have a backing store */

@@ -1290,21 +1288,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
obj = >tbo.base;
shared = drm_gem_object_is_shared_for_memory_stats(obj);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (bo->tbo.resource->mem_type) {
+   case TTM_PL_VRAM:
stats->vram += size;
if (amdgpu_bo_in_cpu_visible_vram(bo))
stats->visible_vram += size;
if (shared)
stats->vram_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_GTT:
+   case TTM_PL_TT:
stats->gtt += size;
if (shared)
stats->gtt_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_CPU:
+   case TTM_PL_SYSTEM:
default:
stats->cpu += size;
if (shared)
@@ -1317,7 +1314,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->requested_visible_vram += size;
  
-		if (domain != AMDGPU_GEM_DOMAIN_VRAM) {

+   if (bo->tbo.resource->mem_type != TTM_PL_VRAM) {
stats->evicted_vram += size;
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->evicted_visible_vram += size;
@@ -1592,19 +1589,17 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, 
struct seq_file *m)
u64 size;
  
  	if (dma_resv_trylock(bo->tbo.base.resv)) {

-   unsigned int domain;
-   domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (bo->

Re: [PATCH] drm/amdkfd: update buffer_{store,load}_* modifiers for gfx940

2024-04-29 Thread Felix Kuehling

On 2024-04-29 17:50, Jay Cornwall wrote:

On 4/29/2024 06:06, Lancelot SIX wrote:

Instruction modifiers of the untyped vector memory buffer instructions
(MUBUF encoded) changed in gfx940.  The slc, scc and glc modifiers have
been replaced with sc0, sc1 and nt.

The current CWSR trap handler is written using pre-gfx940 modifier
names, making the source incompatible with a strict gfx940 assembler.

This patch updates the cwsr_trap_handler_gfx9.s source file to be
compatible with all gfx9 variants of the ISA.  The binary assembled code
is unchanged (so the behaviour is unchanged as well), only the source
representation is updated.

Signed-off-by: Lancelot SIX 
---
  .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 ---
  1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm

index bb26338204f4..a2d597d7fb57 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -48,6 +48,12 @@ var ACK_SQC_STORE    = 1    
//workaround for suspected SQC store bug causing
  var SAVE_AFTER_XNACK_ERROR    =    1 //workaround for TCP store 
failure after XNACK error when ALLOW_REPLAY=0, for debugger
  var SINGLE_STEP_MISSED_WORKAROUND   =    (ASIC_FAMILY <= 
CHIP_ALDEBARAN)    //workaround for lost MODE.DEBUG_EN exception when 
SAVECTX raised

  +#if ASIC_FAMILY < CHIP_GC_9_4_3
+#define VMEM_MODIFIERS slc:1 glc:1
+#else
+#define VMEM_MODIFIERS sc0:1 nt:1
+#endif
+
/**/
  /*    variables  */
/**/
@@ -581,7 +587,7 @@ end
  L_SAVE_LDS_LOOP_VECTOR:
    ds_read_b64 v[0:1], v2    //x =LDS[a], byte address
    s_waitcnt lgkmcnt(0)
-  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, 
s_save_mem_offset offen:1  glc:1  slc:1
+  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, 
s_save_mem_offset VMEM_MODIFIERS offen:1

  //    s_waitcnt vmcnt(0)
  //    v_add_u32 v2, vcc[0:1], v2, v3
    v_add_u32 v2, v2, v3
@@ -979,17 +985,17 @@ L_TCP_STORE_CHECK_DONE:
  end
    function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
-    buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-    buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256
-    buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*2
-    buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*3

+    buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+    buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256
+    buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+    buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3

  end
    function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
-    buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-    buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256
-    buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256*2
-    buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256*3

+    buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+    buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256
+    buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+    buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3

  s_waitcnt vmcnt(0)
  end

base-commit: cf743996352e327f483dc7d66606c90276f57380


Reviewed-by: Jay Cornwall 


Acked-by: Felix Kuehling 

Do you need me to submit the patch to amd-staging-drm-next?

Thanks,
  Felix




Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

2024-04-29 Thread Felix Kuehling

On 2024-04-29 06:38, Yu, Lang wrote:

[Public]


-Original Message-
From: Kuehling, Felix 
Sent: Saturday, April 27, 2024 6:45 AM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Yang, Philip ; Koenig, Christian
; Zhang, Yifan ; Liu,
Aaron 
Subject: Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on
small APUs

On 2024-04-26 04:37, Lang Yu wrote:

The default ttm_tt_pages_limit is 1/2 of system memory.
It is prone to out of memory with such a configuration.

Indiscriminately allowing the violation of all memory limits is not a good
solution. It will lead to poor performance once you actually reach
ttm_pages_limit and TTM starts swapping out BOs.

Hi Felix,

I just feel it's like a bug that 1/2 of system memory is free, the driver tells 
users out of memory.
On the other hand, if memory is available, why not use it.


TTM does not allow us to use more than 1/2 system memory. I believe 
that's because TTM needs additional memory to swap out BOs. Any GTT 
allocation through the render node APIs is subject to the same limitations.


Render node APIs can handle memory overcommitment more gracefully 
because the kernel mode driver is in the loop for command submissions 
and fences. That doesn't work for KFD with user mode queues. The memory 
limits in KFD are there to prevent overcommitting memory because we need 
all of our memory (per process) to be resident at the same time. If we 
let KFD exceed the TTM limits, we get into situations where we're 
thrashing (processes evicting each other constantly) or even worse, 
where we're just not able to make all memory resident. So we end up with 
suspended user mode queues and extremely poor performance or soft hangs.





By the way, can we use USERPTR for VRAM allocations?
Then we don't have ttm_tt_pages_limit limitations. Thanks.


No. There is an expectation that VRAM BOs can be shared between 
processes through DMABufs (for HIP IPC APIs). You can't export userptrs 
as DMABufs.


You can try to raise the TTM pages limit using a TTM module parameter. 
But this is taking a risk for system stability when TTM gets into a 
situation where it needs to swap out a large BO.


Regards,
  Felix




I actually did some tests on Strix (12 CU@2100 MHz, 29412M 128bits 
LPDDR5@937MHz) with
https://github.com/ROCm/pytorch-micro-benchmarking.

Command: python micro_benchmarking_pytorch.py --network resnet50 
--batch-size=64 --iterations=20

1, Run 1 resnet50 (FP32, batch size 64)
Memory usage:
 System mem used 6748M out of 29412M
 TTM mem used 6658M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] : 49.04

2,  Run 2 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
 System mem used 13496M out of 29412M
 TTM mem used 13316M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] (respectively) : 25.27 / 26.70

3, Run 3 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
 System mem used 20245M out of 29412M
 TTM mem used 19974M out of 15719M
Memory oversubscription percentage:  ~27%

Throughput [img/sec](respectively) : 10.62 / 7.47 / 6.90 (In theory: 16 / 16 / 
16)

 From my observations,

1, GPU is underutilized a lot, sometimes its loading is less than 50% and even 
0, when running 3 resnet50 simultaneously with ~27% memory oversubscription.
The driver is busying evicting and restoring process. It takes ~2-5 seconds to 
restore all the BOs for one process (swap in and out BOs, actually allocate and 
copy pages),
even though the process doesn't need all the allocated BOs to be resident.

2, Sometimes, the fairness can't be guaranteed between process when memory is 
oversubscribed.
They can't share the GPU equally when created with default priority.

3, The less GPU underutilization time during evicting and restoring, the less 
performance degradation under memory oversubscription.

Regards,
Lang


Regards,
   Felix



Signed-off-by: Lang Yu 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   |  4 ++--
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12

+---

   3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3295838e9a1d..c01c6f3ab562 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct

amdgpu_device *adev)

  int i;
  int last_valid_bit;

-amdgpu_amdkfd_gpuvm_init_mem_limits();
+amdgpu_amdkfd_gpuvm_init_mem_limits(adev);

  if (adev->kfd.dev) {
  struct kgd2kfd_shared_resources gpu_resources = { diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..13284dbd8c58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ 

Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting

2024-04-29 Thread Felix Kuehling

On 2024-04-29 5:43, Tvrtko Ursulin wrote:


On 26/04/2024 23:24, Felix Kuehling wrote:


On 2024-04-26 12:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible
SG BOs") added a new TTM region it missed to notice the conceptual
imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin.

That imbalance leads to such objects getting accounted against the
resource, but are not un-accounted when unpinned.


AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be 
pinned. In any case you should make sure that the accounting is 
consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This 
patch breaks that consistency.


You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run for 
such objects, or something else?


Right. amdgpu_bo_pin_restricted will return an error for userptr BOs:

if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
return -EPERM;




If they run, then at the end of pin there is:

 domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
...
 } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
     atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);


You changed that in your patch 2:

-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT ||
+  bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);
}

I was suggesting you just change this in patch 2 like this, so it 
matches what's done on unpin:


-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);
}




And unpin has no handling for AMDGPU_PL_PREEMPT.

Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for 
AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as 
checking the domain as stored in the bo at creation time.


Although I am still confused by the statement userptr BOs are not 
pinned. It is not needed to map them via GART on AMD hardware for GPU to 
be able to access them?

Fix by extending the accounting criteria in amdgpu_bo_unpin.

What also appears to need fixing is not reporting their size from
amdgpu_bo_get_memory, which is used to implement fdinfo stats, so 
they are

not mixed with the regular userspace created and driver owned objects.


I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT 
does use system memory and it is GPU accessible, just like GTT. The 
only difference is, that it's not subject to the GTT limits because 
their eviction is handled by callbacks other than TTM evictions and 
doesn't need to wait for fences.


As in you think those two hunks of the patch are correct?


Yes. It seems, Christian agrees but wants to show preemptible memory 
separately in debugfs instead of not showing it at all.


Regards,
  Felix




Regards,

Tvrtko



Regards,
   Felix




And also amdgpu_bo_print_info for debugfs reporting.

Note that the patch depends on the previous one which broke down the
relevant checks from the domain based to placement based.

Signed-off-by: Tvrtko Ursulin 
Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible 
SG BOs")

Cc: Felix Kuehling 
Cc: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index fb984669fc3a..5a2bbc793953 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,7 +1032,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size);
  atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo),
   >visible_pin_size);
-    } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
+    } else if (bo->tbo.resource->mem_type == TTM_PL_TT ||
+   bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
  atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size);
  }
@@ -1298,7 +1299,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
  stats->vram_shared += size;
  break;
  case TTM_PL_TT:
-    case AMDGPU_PL_PREEMPT:
  stats->gtt += size;
  if (shared)
  stats->gtt_shared += size;
@@ -1599,7 +1599,6 @@ u64 amdgpu_bo_print_info(int id, struct 
amdgpu_bo *bo, struct seq_file *m)

  placement = "VRAM";
  break;
  case TTM_PL_TT:
-    case AMDGPU_PL_PREEMPT:
  placement = "GTT";
  break;
  case TTM_PL_SYSTEM:


Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting

2024-04-29 Thread Felix Kuehling




On 2024-04-29 9:45, Tvrtko Ursulin wrote:


On 29/04/2024 12:11, Christian König wrote:

Am 29.04.24 um 11:43 schrieb Tvrtko Ursulin:


On 26/04/2024 23:24, Felix Kuehling wrote:


On 2024-04-26 12:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

When commit b453e42a6e8b ("drm/amdgpu: Add new placement for 
preemptible

SG BOs") added a new TTM region it missed to notice the conceptual
imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin.

That imbalance leads to such objects getting accounted against the
resource, but are not un-accounted when unpinned.


AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be 
pinned. In any case you should make sure that the accounting is 
consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. 
This patch breaks that consistency.


You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run 
for such objects, or something else?


If they run, then at the end of pin there is:

domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
...
} else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
    atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);

And unpin has no handling for AMDGPU_PL_PREEMPT.

Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for 
AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as 
checking the domain as stored in the bo at creation time.


Although I am still confused by the statement userptr BOs are not 
pinned. It is not needed to map them via GART on AMD hardware for GPU 
to be able to access them?


No, a GART mapping is only needed if you want to scanout from them or 
otherwise use them from the kernel on the GPU.


Background is that the kernel doesn't have a VM with page tables..

Got it, thanks!

Presumably somewhere else in the code then it is prevented to call 
pin/unpin on those?


I was referring to this condition in amdgpu_bo_pin_restricted:

if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
return -EPERM;

However, when I look into it more, I see that AMDGPU_PL_PREEMPT is used 
for other SG BOs that actually are pinned, specifically BOs created by 
KFD with KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL or 
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP. These are very small BOs (one or two 
pages), and only one per process, per GPU, so I'm not sure it's worth 
adding special handling for them in the BO pin accounting.


Regards,
  Felix




What to do, if anything, with the attempt to address the asymmetry in 
the accounting criteria between the pin and unpin?


I mean domain based on pin:

 domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
 if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
     atomic64_add(amdgpu_bo_size(bo), >vram_pin_size);
     atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
  >visible_pin_size);
 } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
     atomic64_add(amdgpu_bo_size(bo), >gart_pin_size);
 }

Versus placement based on unpin:

 if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
     atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size);
     atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo),
  >visible_pin_size);
 } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
     atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size);
 }

The fact amdgpu_mem_type_to_domain never translates back to 
AMDGPU_PL_PREEMPT means there is indeed currently no bug.


Is 2/3 still desirable to convert the check in pin to be mem_type based?

Fix by extending the accounting criteria in amdgpu_bo_unpin.

What also appears to need fixing is not reporting their size from
amdgpu_bo_get_memory, which is used to implement fdinfo stats, so 
they are

not mixed with the regular userspace created and driver owned objects.


I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT 
does use system memory and it is GPU accessible, just like GTT. The 
only difference is, that it's not subject to the GTT limits because 
their eviction is handled by callbacks other than TTM evictions and 
doesn't need to wait for fences.


As in you think those two hunks of the patch are correct?


I think so as well, yes. But we still need a name for preemptible BOs 
while printing them in debugfs.


Currently it looks the name is 'CPU':

amdgpu_bo_print_info()
...
     case AMDGPU_GEM_DOMAIN_CPU:
     default:
     placement = "CPU";
     break;


Also, where to account them in struct amdgpu_mem_stats?

Regards,

Tvrtko



Regards,
Christian.



Regards,

Tvrtko



Regards,
   Felix




And also amdgpu_bo_print_info for debugfs reporting.

Note that the patch depends on the previous one which broke down the
relevant checks from the domain based to placement based.

Signed-off-by: Tvrtko Ursulin 
Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible 
SG BOs")

Cc: Felix Kuehl

Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-04-26 Thread Felix Kuehling



On 2024-04-26 04:37, Lang Yu wrote:

Small APUs(i.e., consumer, embedded products) usually have a small
carveout device memory which can't satisfy most compute workloads
memory allocation requirements.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist.

Though we can change BIOS settings to enlarge carveout size,
which is inflexible and may bring complaint. On the other hand,
the memory resource can't be effectively used between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.

Signed-off-by: Lang Yu 


Two nit-picks inline. Other than that, this patch looks reasonable to me.



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  6 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +++
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
  5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7ba05f030dd1..3295838e9a1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
} else {
-   mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
+   mem_info->local_mem_size_public = adev->flags & AMD_IS_APU ?
+ (ttm_tt_pages_limit() << 
PAGE_SHIFT) :
+ adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
adev->gmc.visible_vram_size;


On an APU the private size should be reported as 0.



}
@@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c4f9960dafbb..7eb5afcc4895 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
  
  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);

-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
  
  		if (adev) {

adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+   if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) 
&&
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device 

Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

2024-04-26 Thread Felix Kuehling

On 2024-04-26 04:37, Lang Yu wrote:

The default ttm_tt_pages_limit is 1/2 of system memory.
It is prone to out of memory with such a configuration.
Indiscriminately allowing the violation of all memory limits is not a 
good solution. It will lead to poor performance once you actually reach 
ttm_pages_limit and TTM starts swapping out BOs.


Regards,
  Felix




Signed-off-by: Lang Yu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +---
  3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3295838e9a1d..c01c6f3ab562 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
int i;
int last_valid_bit;
  
-	amdgpu_amdkfd_gpuvm_init_mem_limits();

+   amdgpu_amdkfd_gpuvm_init_mem_limits(adev);
  
  	if (adev->kfd.dev) {

struct kgd2kfd_shared_resources gpu_resources = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..13284dbd8c58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id);
  
  
  #if IS_ENABLED(CONFIG_HSA_AMD)

-void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev);
  void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
struct amdgpu_vm *vm);
  
@@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);

  void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
  #else
  static inline
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
  {
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 7eb5afcc4895..a3e623a320b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -60,6 +60,7 @@ static struct {
int64_t system_mem_used;
int64_t ttm_mem_used;
spinlock_t mem_limit_lock;
+   bool alow_oversubscribe;
  } kfd_mem_limit;
  
  static const char * const domain_bit_to_string[] = {

@@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct 
amdgpu_device *bo_ad
   *  System (TTM + userptr) memory - 15/16th System RAM
   *  TTM memory - 3/8th System RAM
   */
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
  {
struct sysinfo si;
uint64_t mem;
@@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;
  
  	kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;

+   kfd_mem_limit.alow_oversubscribe = !!(adev->flags & AMD_IS_APU);
pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
(kfd_mem_limit.max_system_mem_limit >> 20),
(kfd_mem_limit.max_ttm_mem_limit >> 20));
@@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
 vram_size - reserved_for_pt - 
atomic64_read(>vram_pin_size))) {
-   ret = -ENOMEM;
-   goto release;
+   if (kfd_mem_limit.alow_oversubscribe) {
+   pr_warn_ratelimited("Memory is getting 
oversubscried.\n");
+   } else {
+   ret = -ENOMEM;
+   goto release;
+   }
}
  
  	/* Update memory accounting by decreasing available system


Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting

2024-04-26 Thread Felix Kuehling



On 2024-04-26 12:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible
SG BOs") added a new TTM region it missed to notice the conceptual
imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin.

That imbalance leads to such objects getting accounted against the
resource, but are not un-accounted when unpinned.


AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be 
pinned. In any case you should make sure that the accounting is 
consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This 
patch breaks that consistency.





Fix by extending the accounting criteria in amdgpu_bo_unpin.

What also appears to need fixing is not reporting their size from
amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are
not mixed with the regular userspace created and driver owned objects.


I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT 
does use system memory and it is GPU accessible, just like GTT. The only 
difference is, that it's not subject to the GTT limits because their 
eviction is handled by callbacks other than TTM evictions and doesn't 
need to wait for fences.


Regards,
  Felix




And also amdgpu_bo_print_info for debugfs reporting.

Note that the patch depends on the previous one which broke down the
relevant checks from the domain based to placement based.

Signed-off-by: Tvrtko Ursulin 
Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs")
Cc: Felix Kuehling 
Cc: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index fb984669fc3a..5a2bbc793953 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,7 +1032,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size);
atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo),
 >visible_pin_size);
-   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT ||
+  bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size);
}
  
@@ -1298,7 +1299,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

stats->vram_shared += size;
break;
case TTM_PL_TT:
-   case AMDGPU_PL_PREEMPT:
stats->gtt += size;
if (shared)
stats->gtt_shared += size;
@@ -1599,7 +1599,6 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, 
struct seq_file *m)
placement = "VRAM";
break;
case TTM_PL_TT:
-   case AMDGPU_PL_PREEMPT:
placement = "GTT";
break;
case TTM_PL_SYSTEM:


Re: [PATCH] drm/amdkfd: Flush the process wq before creating a kfd_process

2024-04-26 Thread Felix Kuehling



On 2024-04-26 14:55, Lancelot SIX wrote:

There is a race condition when re-creating a kfd_process for a process.
This has been observed when a process under the debugger executes
exec(3).  In this scenario:
- The process executes exec.
  - This will eventually release the process's mm, which will cause the
kfd_process object associated with the process to be freed
(kfd_process_free_notifier decrements the reference count to the
kfd_process to 0).  This causes kfd_process_ref_release to enqueue
kfd_process_wq_release to the kfd_process_wq.
- The debugger receives the PTRACE_EVENT_EXEC notification, and tries to
   re-enable AMDGPU traps (KFD_IOC_DBG_TRAP_ENABLE).
  - When handling this request, KFD tries to re-create a kfd_process.
This eventually calls kfd_create_process and kobject_init_and_add.

At this point the call to kobject_init_and_add can fail because the
old kfd_process.kobj has not been freed yet by kfd_process_wq_release.

This patch proposes to avoid this race by making sure to drain
kfd_process_wq before creating a new kfd_process object.  This way, we
know that any cleanup task is done executing when we reach
kobject_init_and_add.

Signed-off-by: Lancelot SIX 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 
  1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 58c1fe542193..451bb058cc62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -829,6 +829,14 @@ struct kfd_process *kfd_create_process(struct task_struct 
*thread)
if (process) {
pr_debug("Process already found\n");
} else {
+   /* If the process just called exec(3), it is possible that the
+* cleanup of the kfd_process (following the release of the mm
+* of the old process image) is still in the cleanup work queue.
+* Make sure to drain any job before trying to recreate any
+* resource for this process.
+*/
+   flush_workqueue(kfd_process_wq);
+
process = create_process(thread);
if (IS_ERR(process))
goto out;

base-commit: cf743996352e327f483dc7d66606c90276f57380


Re: [PATCH] drm/amdkfd: Enforce queue BO's adev

2024-04-24 Thread Felix Kuehling

On 2024-04-24 13:40, Harish Kasiviswanathan wrote:

Queue buffer, though it is in system memory, has to be created using the
correct amdgpu device. Enforce this as the BO needs to mapped to the
GART for MES Hardware scheduler to access it.

Signed-off-by: Harish Kasiviswanathan 


I guess this doesn't break existing user mode. It only makes it fail in 
a more obvious way. If that's the case, the patch is


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 8fd5e0da628c..963cf6d657cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -373,6 +373,11 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
err = -EINVAL;
goto err_wptr_map_gart;
}
+   if (dev->adev != amdgpu_ttm_adev(wptr_bo->tbo.bdev)) {
+   pr_err("Queue memory allocated to wrong device\n");
+   err = -EINVAL;
+   goto err_wptr_map_gart;
+   }
  
  		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);

if (err) {


Re: [PATCH v6 0/5] Best effort contiguous VRAM allocation

2024-04-24 Thread Felix Kuehling

The series is

Reviewed-by: Felix Kuehling 

On 2024-04-24 11:27, Philip Yang wrote:

This patch series implement new KFD memory alloc flag for best effort contiguous
VRAM allocation, to support peer direct access RDMA device with limited 
scatter-gather
dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
 to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table creation 
already
 remove the limit, and resource uses u64 to handle block start, size 
(Christian)

v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro 
name.

v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6 (Felix)

Philip Yang (5):
   drm/amdgpu: Support contiguous VRAM allocation
   drm/amdgpu: Handle sg size limit for contiguous allocation
   drm/amdgpu: Evict BOs from same process for contiguous allocation
   drm/amdkfd: Evict BO itself for contiguous allocation
   drm/amdkfd: Bump kfd version for contiguous VRAM allocation

  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 12 +-
  include/uapi/linux/kfd_ioctl.h|  4 +++-
  4 files changed, 33 insertions(+), 9 deletions(-)



Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-23 Thread Felix Kuehling



On 2024-04-23 11:28, Philip Yang wrote:

RDMA device with limited scatter-gather ability requires contiguous VRAM
buffer allocation for RDMA peer direct support.

Add a new KFD alloc memory flag and store as bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA
peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask
VRAM buddy allocator to get contiguous VRAM.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
  include/uapi/linux/kfd_ioctl.h   | 1 +
  2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & 
KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
  #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT  (1 << 26)
  #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED  (1 << 25)
  #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT  (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)


If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was 
redefined to mean "best effort". Maybe we can drop the explicit 
"BEST_EFFORT" from this flag as well to keep the name to a reasonable 
length.


Regards,
  Felix


  
  /* Allocate memory for later SVM (shared virtual memory) mapping.

   *


Re: [PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-23 Thread Felix Kuehling

On 2024-04-23 11:28, Philip Yang wrote:

When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation; this skips the KFD BOs from the same
process because KFD requires all BOs to be resident for user queues.

If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow
TTM evict KFD BOs from the same process, this will evict the user queues
first, and restore the queues later after contiguous VRAM allocation.

Signed-off-by: Philip Yang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
dma_resv_for_each_fence(_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
  


Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Felix Kuehling

On 2024-04-23 11:28, Philip Yang wrote:

If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to
system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
  
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {

+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);


amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It always 
runs uninterruptible. I believe pin_bo runs in the context of ioctls 
from user mode. So it should be interruptible.


Regards,
  Felix



+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
>tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
  
  	amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);

+out:
amdgpu_bo_unreserve(bo);
-
return ret;
  }
  


Re: [PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time

2024-04-23 Thread Felix Kuehling

On 2024-04-23 11:28, Philip Yang wrote:

TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger-size RDMA buffer. Because the KFD restore BO worker reserves all
KFD BOs, TTM cannot hold the remaining KFD BOs' locks to evict them,
which causes TTM to fail to alloc contiguous VRAM.

Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA
pin BO to alloc the contiguous VRAM.


Two seconds is a very long time that the GPU will be idle whenever 
memory gets evicted. Maybe we need to look for a solution where the 
restore gets scheduled in response to a fence when the migration completes.


With my most recent changes I made to the eviction fence handling, I 
think we can decouple the scheduling of the restore work from the evict 
work. So we could schedule the delayed restore worker in a fence 
callback set up in amdgpu_bo_move or somewhere around there, and keep a 
short delay that starts counting at the end of the eviction move blit.


Regards,
  Felix




Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
  /* KFD Memory Eviction */
  
  /* Approx. wait time before attempting to restore evicted BOs */

-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
  /* Approx. back off time if restore fails due to lack of memory */
  #define PROCESS_BACK_OFF_TIME_MS 100
  /* Approx. time before evicting the process again */


Re: [PATCH] drm/amdkfd: handle duplicate BOs in reserve_bo_and_cond_vms

2024-04-23 Thread Felix Kuehling



On 2024-04-22 05:10, Lang Yu wrote:

Observed on gfx8 ASIC when KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM is used.
Two attachments use the same VM, root PD would be locked twice.

[   57.910418] Call Trace:
[   57.793726]  ? reserve_bo_and_cond_vms+0x111/0x1c0 [amdgpu]
[   57.793820]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x6c/0x1c0 [amdgpu]
[   57.793923]  ? idr_get_next_ul+0xbe/0x100
[   57.793933]  kfd_process_device_free_bos+0x7e/0xf0 [amdgpu]
[   57.794041]  kfd_process_wq_release+0x2ae/0x3c0 [amdgpu]
[   57.794141]  ? process_scheduled_works+0x29c/0x580
[   57.794147]  process_scheduled_works+0x303/0x580
[   57.794157]  ? __pfx_worker_thread+0x10/0x10
[   57.794160]  worker_thread+0x1a2/0x370
[   57.794165]  ? __pfx_worker_thread+0x10/0x10
[   57.794167]  kthread+0x11b/0x150
[   57.794172]  ? __pfx_kthread+0x10/0x10
[   57.794177]  ret_from_fork+0x3d/0x60
[   57.794181]  ? __pfx_kthread+0x10/0x10
[   57.794184]  ret_from_fork_asm+0x1b/0x30

Signed-off-by: Lang Yu 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 101a2836480d..c4aaf9c394e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1188,7 +1188,8 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
int ret;
  
  	ctx->sync = >sync;

-   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
+   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
+ DRM_EXEC_IGNORE_DUPLICATES);
drm_exec_until_all_locked(>exec) {
ctx->n_vms = 0;
list_for_each_entry(entry, >attachments, list) {


Re: [PATCH] drm/amdgpu: Fix VRAM memory accounting

2024-04-23 Thread Felix Kuehling

On 2024-04-23 14:56, Mukul Joshi wrote:

Subtract the VRAM pinned memory when checking for available memory
in amdgpu_amdkfd_reserve_mem_limit function since that memory is not
available for use.

Signed-off-by: Mukul Joshi 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 101a2836480d..f672205243e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -220,7 +220,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
-vram_size - reserved_for_pt)) {
+vram_size - reserved_for_pt - 
atomic64_read(>vram_pin_size))) {
ret = -ENOMEM;
goto release;
}


Re: [PATCH] drm/amdgpu: Fix two reset triggered in a row

2024-04-23 Thread Felix Kuehling



On 2024-04-23 01:50, Christian König wrote:

Am 22.04.24 um 21:45 schrieb Yunxiang Li:

Reset request from KFD is missing a check for if a reset is already in
progress, this causes a second reset to be triggered right after the
previous one finishes. Add the check to align with the other reset 
sources.


NAK, that isn't how this should be handled.

Instead all reset source which are handled by a previous reset should 
be canceled.


In other words there should be a cancel_work(>kfd.reset_work); 
somewhere in the KFD code. When this doesn't work correctly then that 
is somehow missing.


If you see the use of amdgpu_in_reset() outside of the low level 
functions than that is clearly a bug.
Do we need to do that for all reset workers in the driver separately? I 
don't see where this is done for other reset workers.


Regards,
  Felix




Regards,
Christian.



Signed-off-by: Yunxiang Li 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

index 3b4591f554f1..ce3dbb1cc2da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -283,7 +283,7 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device 
*adev)

    void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
  {
-    if (amdgpu_device_should_recover_gpu(adev))
+    if (amdgpu_device_should_recover_gpu(adev) && 
!amdgpu_in_reset(adev))

  amdgpu_reset_domain_schedule(adev->reset_domain,
   >kfd.reset_work);
  }




Re: [PATCH] drm/amdgpu: Fix two reset triggered in a row

2024-04-22 Thread Felix Kuehling

On 2024-04-22 16:14, Alex Deucher wrote:

On Mon, Apr 22, 2024 at 3:52 PM Yunxiang Li  wrote:

Reset request from KFD is missing a check for if a reset is already in
progress, this causes a second reset to be triggered right after the
previous one finishes. Add the check to align with the other reset sources.

Acked-by: Alex Deucher 


Reviewed-by: Felix Kuehling 





Signed-off-by: Yunxiang Li 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3b4591f554f1..ce3dbb1cc2da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -283,7 +283,7 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev)

  void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
  {
-   if (amdgpu_device_should_recover_gpu(adev))
+   if (amdgpu_device_should_recover_gpu(adev) && !amdgpu_in_reset(adev))
 amdgpu_reset_domain_schedule(adev->reset_domain,
  >kfd.reset_work);
  }
--
2.34.1



Re: [PATCH] drm/amdkfd: Add VRAM accounting for SVM migration

2024-04-19 Thread Felix Kuehling

On 2024-04-19 12:23, Mukul Joshi wrote:

Do VRAM accounting when doing migrations to vram to make sure
there is enough available VRAM and migrating to VRAM doesn't evict
other possible non-unified memory BOs. If migrating to VRAM fails,
driver can fall back to using system memory seamlessly.

Signed-off-by: Mukul Joshi 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 16 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  2 +-
  2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index bdc01ca9609a..a6bfc00c0310 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -509,10 +509,19 @@ svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc,
start = start_mgr << PAGE_SHIFT;
end = (last_mgr + 1) << PAGE_SHIFT;
  
+	r = amdgpu_amdkfd_reserve_mem_limit(node->adev,

+   prange->npages * PAGE_SIZE,
+   KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
+   node->xcp ? node->xcp->id : 0);
+   if (r) {
+   dev_dbg(node->adev->dev, "failed to allocate VRAM, size exceeds VRAM 
limit\n", r);
+   return -ENOSPC;
+   }
+
r = svm_range_vram_node_new(node, prange, true);
if (r) {
dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r);
-   return r;
+   goto out;
}
ttm_res_offset = (start_mgr - prange->start + prange->offset) << 
PAGE_SHIFT;
  
@@ -545,6 +554,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,

svm_range_vram_node_free(prange);
}
  
+out:

+   amdgpu_amdkfd_unreserve_mem_limit(node->adev,
+   prange->npages * PAGE_SIZE,
+   KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
+   node->xcp ? node->xcp->id : 0);
return r < 0 ? r : 0;
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index f7d75b432cc6..bfab16b43fec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3426,7 +3426,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct 
svm_range *prange,
mm, KFD_MIGRATE_TRIGGER_PREFETCH);
*migrated = !r;
  
-	return r;

+   return 0;
  }
  
  int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)


[PATCH] drm/amdkfd: Fix rescheduling of restore worker

2024-04-19 Thread Felix Kuehling
Handle the case that the restore worker was already scheduled by another
eviction while the restore was in progress.

Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs")
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index aafdf064651f..58c1fe542193 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2012,9 +2012,9 @@ static void restore_process_worker(struct work_struct 
*work)
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d 
ms\n",
 p->pasid, PROCESS_BACK_OFF_TIME_MS);
-   ret = queue_delayed_work(kfd_restore_wq, >restore_work,
-   msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
-   WARN(!ret, "reschedule restore work failed\n");
+   if (mod_delayed_work(kfd_restore_wq, >restore_work,
+msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
+   kfd_process_restore_queues(p);
}
 }
 
-- 
2.34.1



Re: [PATCH v2] drm/amdkfd: make sure VM is ready for updating operations

2024-04-18 Thread Felix Kuehling

On 2024-04-11 4:11, Lang Yu wrote:

When page table BOs were evicted but not validated before
updating page tables, VM is still in evicting state,
amdgpu_vm_update_range returns -EBUSY and
restore_process_worker runs into a dead loop.

v2: Split the BO validation and page table update into
two separate loops in amdgpu_amdkfd_restore_process_bos. (Felix)
   1.Validate BOs
   2.Validate VM (and DMABuf attachments)
   3.Update page tables for the BOs validated above

Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in compute VMs")

Signed-off-by: Lang Yu 


Reviewed-by: Felix Kuehling 



---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 34 +++
  1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..e2c9e6ddb1d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2900,13 +2900,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence __rcu *
  
  	amdgpu_sync_create(_obj);
  
-	/* Validate BOs and map them to GPUVM (update VM page tables). */

+   /* Validate BOs managed by KFD */
list_for_each_entry(mem, _info->kfd_bo_list,
validate_list) {
  
  		struct amdgpu_bo *bo = mem->bo;

uint32_t domain = mem->domain;
-   struct kfd_mem_attachment *attachment;
struct dma_resv_iter cursor;
struct dma_fence *fence;
  
@@ -2931,6 +2930,25 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *

goto validate_map_fail;
}
}
+   }
+
+   if (failed_size)
+   pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
+
+   /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
+* validations above would invalidate DMABuf imports again.
+*/
+   ret = process_validate_vms(process_info, );
+   if (ret) {
+   pr_debug("Validating VMs failed, ret: %d\n", ret);
+   goto validate_map_fail;
+   }
+
+   /* Update mappings managed by KFD. */
+   list_for_each_entry(mem, _info->kfd_bo_list,
+   validate_list) {
+   struct kfd_mem_attachment *attachment;
+
list_for_each_entry(attachment, >attachments, list) {
if (!attachment->is_mapped)
continue;
@@ -2947,18 +2965,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence __rcu *
}
}
  
-	if (failed_size)

-   pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
-
-   /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
-* validations above would invalidate DMABuf imports again.
-*/
-   ret = process_validate_vms(process_info, );
-   if (ret) {
-   pr_debug("Validating VMs failed, ret: %d\n", ret);
-   goto validate_map_fail;
-   }
-
/* Update mappings not managed by KFD */
list_for_each_entry(peer_vm, _info->vm_list_head,
vm_list_node) {


[PATCH] drm/amdgpu: Update BO eviction priorities

2024-04-18 Thread Felix Kuehling
Make SVM BOs more likely to get evicted than other BOs. These BOs
opportunistically use available VRAM, but can fall back relatively
seamlessly to system memory. It also avoids SVM migrations evicting
other, more important BOs as they will evict other SVM allocations
first.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index cd2dd3ed7153..d80671535ab3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -608,6 +608,8 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
else
amdgpu_bo_placement_from_domain(bo, bp->domain);
if (bp->type == ttm_bo_type_kernel)
+   bo->tbo.priority = 2;
+   else if (!(bp->flags & AMDGPU_GEM_CREATE_DISCARDABLE))
bo->tbo.priority = 1;
 
if (!bp->destroy)
-- 
2.34.1



Re: [PATCH] drm/amdgpu/mes11: print MES opcodes rather than numbers

2024-04-18 Thread Felix Kuehling



On 2024-04-17 15:53, Alex Deucher wrote:

Makes it easier to review the logs when there are MES
errors.

v2: use dbg for emitted, add helpers for fetching strings
v3: fix missing commas (Harish)

Reviewed by Shaoyun.liu  (v2)
Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 78 --
  1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 81833395324a0..414b7beff397f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -100,18 +100,72 @@ static const struct amdgpu_ring_funcs 
mes_v11_0_ring_funcs = {
.insert_nop = amdgpu_ring_insert_nop,
  };
  
+static const char *mes_v11_0_opcodes[] = {

+   "MES_SCH_API_SET_HW_RSRC",
+   "MES_SCH_API_SET_SCHEDULING_CONFIG",
+   "MES_SCH_API_ADD_QUEUE",
+   "MES_SCH_API_REMOVE_QUEUE",
+   "MES_SCH_API_PERFORM_YIELD",
+   "MES_SCH_API_SET_GANG_PRIORITY_LEVEL",
+   "MES_SCH_API_SUSPEND",
+   "MES_SCH_API_RESUME",
+   "MES_SCH_API_RESET",
+   "MES_SCH_API_SET_LOG_BUFFER",
+   "MES_SCH_API_CHANGE_GANG_PRORITY",
+   "MES_SCH_API_QUERY_SCHEDULER_STATUS",
+   "MES_SCH_API_PROGRAM_GDS",
+   "MES_SCH_API_SET_DEBUG_VMID",
+   "MES_SCH_API_MISC",
+   "MES_SCH_API_UPDATE_ROOT_PAGE_TABLE",
+   "MES_SCH_API_AMD_LOG",


Maybe drop the prefixes. They don't add any information value and only 
bloat the log messages and module binary size. Other than that, the patch is


Acked-by: Felix Kuehling 



+};
+
+static const char *mes_v11_0_misc_opcodes[] = {
+   "MESAPI_MISC__WRITE_REG",
+   "MESAPI_MISC__INV_GART",
+   "MESAPI_MISC__QUERY_STATUS",
+   "MESAPI_MISC__READ_REG",
+   "MESAPI_MISC__WAIT_REG_MEM",
+   "MESAPI_MISC__SET_SHADER_DEBUGGER",
+};
+
+static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt)
+{
+   const char *op_str = NULL;
+
+   if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
+   op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
+
+   return op_str;
+}
+
+static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
+{
+   const char *op_str = NULL;
+
+   if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
+   (x_pkt->opcode <= ARRAY_SIZE(mes_v11_0_misc_opcodes)))
+   op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
+
+   return op_str;
+}
+
  static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
void *pkt, int size,
int api_status_off)
  {
int ndw = size / 4;
signed long r;
-   union MESAPI__ADD_QUEUE *x_pkt = pkt;
+   union MESAPI__MISC *x_pkt = pkt;
struct MES_API_STATUS *api_status;
struct amdgpu_device *adev = mes->adev;
struct amdgpu_ring *ring = >ring;
unsigned long flags;
signed long timeout = 300; /* 3000 ms */
+   const char *op_str, *misc_op_str;
+
+   if (x_pkt->header.opcode >= MES_SCH_API_MAX)
+   return -EINVAL;
  
  	if (amdgpu_emu_mode) {

timeout *= 100;
@@ -135,13 +189,29 @@ static int 
mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
-	DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);

+   op_str = mes_v11_0_get_op_string(x_pkt);
+   misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
+
+   if (misc_op_str)
+   dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, 
misc_op_str);
+   else if (op_str)
+   dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
+   else
+   dev_dbg(adev->dev, "MES msg=%d was emitted\n", 
x_pkt->header.opcode);
  
  	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,

  timeout);
if (r < 1) {
-   DRM_ERROR("MES failed to response msg=%d\n",
- x_pkt->header.opcode);
+
+   if (misc_op_str)
+   dev_err(adev->dev, "MES failed to respond to msg=%s 
(%s)\n",
+   op_str, misc_op_str);
+   else if (op_str)
+   dev_err(adev->dev, "MES failed to respond to msg=%s\n",
+   op_str);
+   else
+   dev_err(adev->dev, "MES failed to respond to msg=%d\n",
+   x_pkt->header.opcode);
  
  		while (halt_if_hws_hang)

schedule();


[PATCH] drm/amdkfd: Fix eviction fence handling

2024-04-17 Thread Felix Kuehling
Handle case that dma_fence_get_rcu_safe returns NULL.

If restore work is already scheduled, only update its timer. The same
work item cannot be queued twice, so undo the extra queue eviction.

Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs")
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index b79986412cd8..aafdf064651f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1922,6 +1922,8 @@ static int signal_eviction_fence(struct kfd_process *p)
rcu_read_lock();
ef = dma_fence_get_rcu_safe(>ef);
rcu_read_unlock();
+   if (!ef)
+   return -EINVAL;
 
ret = dma_fence_signal(ef);
dma_fence_put(ef);
@@ -1949,10 +1951,9 @@ static void evict_process_worker(struct work_struct 
*work)
 * they are responsible stopping the queues and scheduling
 * the restore work.
 */
-   if (!signal_eviction_fence(p))
-   queue_delayed_work(kfd_restore_wq, >restore_work,
-   msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
-   else
+   if (signal_eviction_fence(p) ||
+   mod_delayed_work(kfd_restore_wq, >restore_work,
+msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
 
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
-- 
2.34.1



Re: [PATCH] rock-dgb_defconfig: Update for Linux 6.7 with UBSAN

2024-04-16 Thread Felix Kuehling

On 2024-04-16 13:02, Chen, Xiaogang wrote:


On 4/15/2024 2:49 PM, Felix Kuehling wrote:
Caution: This message originated from an External Source. Use proper 
caution when opening attachments, clicking links, or responding.



make rock-dbg_defconfig
make savedefconfig
cp defconfig arch/x86/config/rock-dbg_defconfig

This also enables UBSAN, which can help catch some types of bugs at
compile time.


Enabling UBSAN cause compiler insert code to perform certain kinds of 
check before operations that may cause undefined behaviour. I think it 
catches errors at run time, not compile time, and increases kernel size.


You're right. I saw it supports some range checking only on arrays where 
the size is known at compile time. But the range checking itself needs 
to happen at runtime.


Regards,
  Felix




Regards

Xiaogang



Signed-off-by: Felix Kuehling 
---
  arch/x86/configs/rock-dbg_defconfig | 46 +
  1 file changed, 14 insertions(+), 32 deletions(-)

diff --git a/arch/x86/configs/rock-dbg_defconfig 
b/arch/x86/configs/rock-dbg_defconfig

index 0ad80a8c8eab..80129ca354b4 100644
--- a/arch/x86/configs/rock-dbg_defconfig
+++ b/arch/x86/configs/rock-dbg_defconfig
@@ -34,11 +34,12 @@ CONFIG_CHECKPOINT_RESTORE=y
  CONFIG_SCHED_AUTOGROUP=y
  CONFIG_BLK_DEV_INITRD=y
  CONFIG_EXPERT=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
  CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
+CONFIG_KEXEC_JUMP=y
+CONFIG_CRASH_DUMP=y
  CONFIG_SMP=y
-# CONFIG_RETPOLINE is not set
  CONFIG_X86_INTEL_LPSS=y
  CONFIG_IOSF_MBI_DEBUG=y
  CONFIG_HYPERVISOR_GUEST=y
@@ -48,9 +49,6 @@ CONFIG_PROCESSOR_SELECT=y
  CONFIG_GART_IOMMU=y
  CONFIG_NR_CPUS=256
  CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_I8K=m
-CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
  CONFIG_X86_MSR=m
  CONFIG_X86_CPUID=m
  # CONFIG_X86_5LEVEL is not set
@@ -61,12 +59,8 @@ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1
  CONFIG_EFI=y
  CONFIG_EFI_STUB=y
  CONFIG_EFI_MIXED=y
-CONFIG_KEXEC=y
-CONFIG_KEXEC_FILE=y
-CONFIG_CRASH_DUMP=y
-CONFIG_KEXEC_JUMP=y
  CONFIG_PHYSICAL_ALIGN=0x100
-CONFIG_LEGACY_VSYSCALL_EMULATE=y
+# CONFIG_RETPOLINE is not set
  CONFIG_HIBERNATION=y
  CONFIG_PM_WAKELOCKS=y
  CONFIG_PM_DEBUG=y
@@ -74,7 +68,6 @@ CONFIG_PM_ADVANCED_DEBUG=y
  CONFIG_PM_TRACE_RTC=y
  CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
  CONFIG_ACPI_EC_DEBUGFS=m
-CONFIG_ACPI_VIDEO=m
  CONFIG_ACPI_DOCK=y
  CONFIG_ACPI_PROCESSOR_AGGREGATOR=m
  CONFIG_ACPI_PCI_SLOT=y
@@ -108,6 +101,8 @@ CONFIG_BLK_DEV_INTEGRITY=y
  CONFIG_BLK_DEV_THROTTLING=y
  CONFIG_PARTITION_ADVANCED=y
  CONFIG_BINFMT_MISC=y
+CONFIG_ZSWAP=y
+# CONFIG_COMPAT_BRK is not set
  CONFIG_MEMORY_HOTPLUG=y
  CONFIG_MEMORY_HOTREMOVE=y
  CONFIG_KSM=y
@@ -115,15 +110,12 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536
  CONFIG_MEMORY_FAILURE=y
  CONFIG_HWPOISON_INJECT=m
  CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
  CONFIG_CMA=y
  CONFIG_CMA_AREAS=7
  CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
  CONFIG_ZONE_DEVICE=y
  CONFIG_DEVICE_PRIVATE=y
+CONFIG_USERFAULTFD=y
  CONFIG_NET=y
  CONFIG_PACKET=y
  CONFIG_PACKET_DIAG=y
@@ -167,7 +159,6 @@ CONFIG_BRIDGE_NETFILTER=m
  CONFIG_NF_CONNTRACK=m
  CONFIG_NF_CONNTRACK_SECMARK=y
  CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
  CONFIG_NF_CONNTRACK_EVENTS=y
  CONFIG_NF_CONNTRACK_TIMEOUT=y
  CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -178,7 +169,6 @@ CONFIG_NETFILTER_NETLINK_GLUE_CT=y
  CONFIG_NF_TABLES=m
  CONFIG_NF_TABLES_NETDEV=y
  CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
  CONFIG_NFT_CONNLIMIT=m
  CONFIG_NFT_LOG=m
  CONFIG_NFT_LIMIT=m
@@ -270,7 +260,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m
  CONFIG_IP_NF_TARGET_NETMAP=m
  CONFIG_IP_NF_TARGET_REDIRECT=m
  CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
  CONFIG_IP_NF_TARGET_ECN=m
  CONFIG_IP_NF_TARGET_TTL=m
  CONFIG_IP_NF_RAW=m
@@ -312,7 +301,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
  CONFIG_DEVTMPFS=y
  CONFIG_DEVTMPFS_MOUNT=y
  # CONFIG_PREVENT_FIRMWARE_BUILD is not set
-CONFIG_EFI_VARS=y
  CONFIG_PARPORT=y
  CONFIG_PARPORT_PC=y
  CONFIG_PARPORT_SERIAL=y
@@ -363,7 +351,6 @@ CONFIG_E1000=y
  CONFIG_E1000E=y
  CONFIG_IGB=y
  CONFIG_IGBVF=y
-CONFIG_IXGB=y
  CONFIG_IXGBE=y
  CONFIG_I40E=y
  CONFIG_SKY2=y
@@ -401,14 +388,14 @@ CONFIG_SENSORS_K10TEMP=m
  CONFIG_WATCHDOG=y
  CONFIG_RC_CORE=y
  CONFIG_RC_DECODERS=y
+CONFIG_IR_JVC_DECODER=y
+CONFIG_IR_MCE_KBD_DECODER=y
  CONFIG_IR_NEC_DECODER=y
  CONFIG_IR_RC5_DECODER=y
  CONFIG_IR_RC6_DECODER=y
-CONFIG_IR_JVC_DECODER=y
-CONFIG_IR_SONY_DECODER=y
  CONFIG_IR_SANYO_DECODER=y
  CONFIG_IR_SHARP_DECODER=y
-CONFIG_IR_MCE_KBD_DECODER=y
+CONFIG_IR_SONY_DECODER=y
  CONFIG_IR_XMP_DECODER=y
  CONFIG_AGP=y
  CONFIG_AGP_AMD64=y
@@ -422,7 +409,6 @@ CONFIG_HSA_AMD_P2P=y
  CONFIG_DRM_AST=m
  CONFIG_FB=y
  CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
  CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
  CONFIG_HID_BATTERY_STRENGTH=y
  CONFIG_HIDRAW=y
@@ -456,7 +442,6 @@ CONFIG_R

Re: [PATCH] drm/amdkfd: fix NULL pointer dereference

2024-04-15 Thread Felix Kuehling
This patch does not apply to amd-staging-drm-next. This is against a 
DKMS branch and should be reviewed on our internal mailing list.


However, I suspect that part of the problem is that the DKMS branch has 
diverged quite a bit in this area, and is missing at least one patch 
from me that was reverted, probably because of an improper port. The 
proper solution should involve getting the DKMS branch back in sync with 
upstream. I'll look into that.


Regards,
  Felix

On 2024-04-13 14:07, vitaly.pros...@amd.com wrote:

From: Vitaly Prosyak 

[  +0.006038] BUG: kernel NULL pointer dereference, address: 0028
[  +0.006969] #PF: supervisor read access in kernel mode
[  +0.005139] #PF: error_code(0x) - not-present page
[  +0.005139] PGD 0 P4D 0
[  +0.002530] Oops:  [#1] PREEMPT SMP NOPTI
[  +0.004356] CPU: 11 PID: 12625 Comm: kworker/11:0 Tainted: GW 
 6.7.0+ #2
[  +0.008097] Hardware name: ASUS System Product Name/Pro WS WRX80E-SAGE SE 
WIFI II, BIOS 1302 12/08/2023
[  +0.009398] Workqueue: events evict_process_worker [amdgpu]
[  +0.005750] RIP: 0010:evict_process_worker+0x2f/0x460 [amdgpu]
[  +0.005991] Code: 55 48 89 e5 41 57 41 56 4c 8d b7 a8 fc ff ff 41 55 41 54 53 48 89 
fb 48 83 ec 10 0f 1f 44 00 00 48 8b 43 f8 8b 93 b0 00 00 00 <48> 3b 50 28 0f 85 
50 03 00 00 48 8d 7b 58 e8 ee be cb bf 48 8b 05
[  +0.018791] RSP: 0018:c90009a2be10 EFLAGS: 00010282
[  +0.005226] RAX:  RBX: 888197ffc358 RCX: 
[  +0.007140] RDX: 0a1b RSI:  RDI: 888197ffc358
[  +0.007139] RBP: c90009a2be48 R08:  R09: 
[  +0.007139] R10:  R11:  R12: 888197ffc358
[  +0.007139] R13: 888100153a00 R14: 888197ffc000 R15: 888100153a05
[  +0.007137] FS:  () GS:889facac() 
knlGS:
[  +0.008094] CS:  0010 DS:  ES:  CR0: 80050033
[  +0.005747] CR2: 0028 CR3: 00010d1fc001 CR4: 00770ef0
[  +0.007138] PKRU: 5554
[  +0.002702] Call Trace:
[  +0.002443]  
[  +0.002096]  ? show_regs+0x72/0x90
[  +0.003402]  ? __die+0x25/0x80
[  +0.003052]  ? page_fault_oops+0x154/0x4c0
[  +0.004099]  ? do_user_addr_fault+0x30e/0x6e0
[  +0.004357]  ? psi_group_change+0x237/0x520
[  +0.004185]  ? exc_page_fault+0x84/0x1b0
[  +0.003926]  ? asm_exc_page_fault+0x27/0x30
[  +0.004187]  ? evict_process_worker+0x2f/0x460 [amdgpu]
[  +0.005377]  process_one_work+0x17b/0x360
[  +0.004011]  ? __pfx_worker_thread+0x10/0x10
[  +0.004269]  worker_thread+0x307/0x430
[  +0.003748]  ? __pfx_worker_thread+0x10/0x10
[  +0.004268]  kthread+0xf7/0x130
[  +0.003142]  ? __pfx_kthread+0x10/0x10
[  +0.003749]  ret_from_fork+0x46/0x70
[  +0.003573]  ? __pfx_kthread+0x10/0x10
[  +0.003747]  ret_from_fork_asm+0x1b/0x30
[  +0.003924]  

When we run stressful tests, the eviction fence could be zero and not match
to last_eviction_seqno.

Avoid calling dma_fence_signal and dma_fence_put with zero fences to rely
on checking parameters in DMA API.

Cc: Alex Deucher 
Cc: Christian Koenig 
Cc: Xiaogang Chen 
Cc: Felix Kuehling 
Signed-off-by: Vitaly Prosyak 
---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index eb380296017d..a15fae1c398a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2118,7 +2118,7 @@ static void evict_process_worker(struct work_struct *work)
 */
p = container_of(dwork, struct kfd_process, eviction_work);
trace_kfd_evict_process_worker_start(p);
-   WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
+   WARN_ONCE(p->ef && p->last_eviction_seqno != p->ef->seqno,
  "Eviction fence mismatch\n");
  
  	/* Narrow window of overlap between restore and evict work

@@ -2134,9 +2134,11 @@ static void evict_process_worker(struct work_struct 
*work)
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p, false, 
KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
-   dma_fence_signal(p->ef);
-   dma_fence_put(p->ef);
-   p->ef = NULL;
+   if (p->ef) {
+   dma_fence_signal(p->ef);
+   dma_fence_put(p->ef);
+   p->ef = NULL;
+   }
  
  		if (!kfd_process_unmap_doorbells_if_idle(p))

kfd_process_schedule_restore(p);


[PATCH] rock-dgb_defconfig: Update for Linux 6.7 with UBSAN

2024-04-15 Thread Felix Kuehling
make rock-dbg_defconfig
make savedefconfig
cp defconfig arch/x86/config/rock-dbg_defconfig

This also enables UBSAN, which can help catch some types of bugs at
compile time.

Signed-off-by: Felix Kuehling 
---
 arch/x86/configs/rock-dbg_defconfig | 46 +
 1 file changed, 14 insertions(+), 32 deletions(-)

diff --git a/arch/x86/configs/rock-dbg_defconfig 
b/arch/x86/configs/rock-dbg_defconfig
index 0ad80a8c8eab..80129ca354b4 100644
--- a/arch/x86/configs/rock-dbg_defconfig
+++ b/arch/x86/configs/rock-dbg_defconfig
@@ -34,11 +34,12 @@ CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EXPERT=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
+CONFIG_KEXEC_JUMP=y
+CONFIG_CRASH_DUMP=y
 CONFIG_SMP=y
-# CONFIG_RETPOLINE is not set
 CONFIG_X86_INTEL_LPSS=y
 CONFIG_IOSF_MBI_DEBUG=y
 CONFIG_HYPERVISOR_GUEST=y
@@ -48,9 +49,6 @@ CONFIG_PROCESSOR_SELECT=y
 CONFIG_GART_IOMMU=y
 CONFIG_NR_CPUS=256
 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_I8K=m
-CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=m
 CONFIG_X86_CPUID=m
 # CONFIG_X86_5LEVEL is not set
@@ -61,12 +59,8 @@ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1
 CONFIG_EFI=y
 CONFIG_EFI_STUB=y
 CONFIG_EFI_MIXED=y
-CONFIG_KEXEC=y
-CONFIG_KEXEC_FILE=y
-CONFIG_CRASH_DUMP=y
-CONFIG_KEXEC_JUMP=y
 CONFIG_PHYSICAL_ALIGN=0x100
-CONFIG_LEGACY_VSYSCALL_EMULATE=y
+# CONFIG_RETPOLINE is not set
 CONFIG_HIBERNATION=y
 CONFIG_PM_WAKELOCKS=y
 CONFIG_PM_DEBUG=y
@@ -74,7 +68,6 @@ CONFIG_PM_ADVANCED_DEBUG=y
 CONFIG_PM_TRACE_RTC=y
 CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
 CONFIG_ACPI_EC_DEBUGFS=m
-CONFIG_ACPI_VIDEO=m
 CONFIG_ACPI_DOCK=y
 CONFIG_ACPI_PROCESSOR_AGGREGATOR=m
 CONFIG_ACPI_PCI_SLOT=y
@@ -108,6 +101,8 @@ CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_BINFMT_MISC=y
+CONFIG_ZSWAP=y
+# CONFIG_COMPAT_BRK is not set
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
@@ -115,15 +110,12 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536
 CONFIG_MEMORY_FAILURE=y
 CONFIG_HWPOISON_INJECT=m
 CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
 CONFIG_CMA=y
 CONFIG_CMA_AREAS=7
 CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
 CONFIG_ZONE_DEVICE=y
 CONFIG_DEVICE_PRIVATE=y
+CONFIG_USERFAULTFD=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=y
@@ -167,7 +159,6 @@ CONFIG_BRIDGE_NETFILTER=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_SECMARK=y
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 CONFIG_NF_CONNTRACK_EVENTS=y
 CONFIG_NF_CONNTRACK_TIMEOUT=y
 CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -178,7 +169,6 @@ CONFIG_NETFILTER_NETLINK_GLUE_CT=y
 CONFIG_NF_TABLES=m
 CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
 CONFIG_NFT_CONNLIMIT=m
 CONFIG_NFT_LOG=m
 CONFIG_NFT_LIMIT=m
@@ -270,7 +260,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m
 CONFIG_IP_NF_TARGET_NETMAP=m
 CONFIG_IP_NF_TARGET_REDIRECT=m
 CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
 CONFIG_IP_NF_TARGET_ECN=m
 CONFIG_IP_NF_TARGET_TTL=m
 CONFIG_IP_NF_RAW=m
@@ -312,7 +301,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
-CONFIG_EFI_VARS=y
 CONFIG_PARPORT=y
 CONFIG_PARPORT_PC=y
 CONFIG_PARPORT_SERIAL=y
@@ -363,7 +351,6 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 CONFIG_IGB=y
 CONFIG_IGBVF=y
-CONFIG_IXGB=y
 CONFIG_IXGBE=y
 CONFIG_I40E=y
 CONFIG_SKY2=y
@@ -401,14 +388,14 @@ CONFIG_SENSORS_K10TEMP=m
 CONFIG_WATCHDOG=y
 CONFIG_RC_CORE=y
 CONFIG_RC_DECODERS=y
+CONFIG_IR_JVC_DECODER=y
+CONFIG_IR_MCE_KBD_DECODER=y
 CONFIG_IR_NEC_DECODER=y
 CONFIG_IR_RC5_DECODER=y
 CONFIG_IR_RC6_DECODER=y
-CONFIG_IR_JVC_DECODER=y
-CONFIG_IR_SONY_DECODER=y
 CONFIG_IR_SANYO_DECODER=y
 CONFIG_IR_SHARP_DECODER=y
-CONFIG_IR_MCE_KBD_DECODER=y
+CONFIG_IR_SONY_DECODER=y
 CONFIG_IR_XMP_DECODER=y
 CONFIG_AGP=y
 CONFIG_AGP_AMD64=y
@@ -422,7 +409,6 @@ CONFIG_HSA_AMD_P2P=y
 CONFIG_DRM_AST=m
 CONFIG_FB=y
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
 CONFIG_HID_BATTERY_STRENGTH=y
 CONFIG_HIDRAW=y
@@ -456,7 +442,6 @@ CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
 CONFIG_DMADEVICES=y
 CONFIG_DMABUF_MOVE_NOTIFY=y
-# CONFIG_X86_PLATFORM_DEVICES is not set
 CONFIG_AMD_IOMMU=y
 CONFIG_INTEL_IOMMU=y
 # CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set
@@ -473,9 +458,7 @@ CONFIG_XFS_WARN=y
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
 CONFIG_QFMT_V2=y
-CONFIG_AUTOFS4_FS=y
 CONFIG_FUSE_FS=m
 CONFIG_CUSE=m
 CONFIG_OVERLAY_FS=y
@@ -509,22 +492,21 @@ CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,

[PATCH] drm/amdkfd: Fix memory leak in create_process failure

2024-04-10 Thread Felix Kuehling
Fix memory leak due to a leaked mmget reference on an error handling
code path that is triggered when attempting to create KFD processes
while a GPU reset is in progress.

Fixes: 0ab2d7532b05 ("drm/amdkfd: prepare per-process debug enable and disable")
CC: Xiaogang Chen 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 717a60d7a4ea..b79986412cd8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -819,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct 
*thread)
mutex_lock(_processes_mutex);
 
if (kfd_is_locked()) {
-   mutex_unlock(_processes_mutex);
pr_debug("KFD is locked! Cannot create process");
-   return ERR_PTR(-EINVAL);
+   process = ERR_PTR(-EINVAL);
+   goto out;
}
 
/* A prior open of /dev/kfd could have already created the process. */
-- 
2.34.1



Re: [PATCH] drm/amdkfd: make sure VM is ready for updating operations

2024-04-09 Thread Felix Kuehling


On 2024-04-08 3:55, Christian König wrote:

Am 07.04.24 um 06:52 schrieb Lang Yu:
When VM is in evicting state, amdgpu_vm_update_range would return 
-EBUSY.

Then restore_process_worker runs into an infinite loop.

Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in 
compute VMs")


Mhm, while it would be good to have this case handled as error it 
should never occur in practice since we should have validated the VM 
before validating the DMA-bufs.


@Felix isn't that something we have taken care of?


The problem I saw when I implemented Auto-validate was, that migration 
of a BO invalidates its DMABuf attachments. So I need to validate the 
DMABuf attachments after validating the BOs they attach to. This 
auto-validation happens in amdgpu_vm_validate. So I needed to do the VM 
validation after the BO validation. The problem now seems to be that the 
BO validation happens in the same loop as the page table update. And the 
page table update fails if the VM is not valid.


I never saw this problem in my testing, probably because I never got my 
page tables evicted?


Anyway, I think the solution is to split the BO validation and page 
table update into two separate loops in amdgpu_amdkfd_gpuvm_restore_process_bos:


1. Validate BOs
2. Validate VM (and DMABuf attachments)
3. Update page tables for the BOs validated above

Regards,
  Felix




Regards,
Christian.




Signed-off-by: Lang Yu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 0ae9fd844623..8c71fe07807a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2900,6 +2900,12 @@ int 
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence 
__rcu *

    amdgpu_sync_create(_obj);
  +    ret = process_validate_vms(process_info, NULL);
+    if (ret) {
+    pr_debug("Validating VMs failed, ret: %d\n", ret);
+    goto validate_map_fail;
+    }
+
  /* Validate BOs and map them to GPUVM (update VM page tables). */
  list_for_each_entry(mem, _info->kfd_bo_list,
  validate_list) {


Re: [PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted

2024-04-03 Thread Felix Kuehling



On 2024-04-03 14:12, Zhigang Luo wrote:

If there are more than one device doing reset in parallel, the first
device will call kfd_suspend_all_processes() to evict all processes
on all devices, this call takes time to finish. other device will
start reset and recover without waiting. if the process has not been
evicted before the recovery is done, it will be restored, which then causes
a page fault.

Signed-off-by: Zhigang Luo 


This patch is

Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++---
  1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 041ec3de55e7..719d6d365e15 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
  {
struct kfd_node *node;
int i;
-   int count;
  
  	if (!kfd->init_complete)

return;
@@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
/* for runtime suspend, skip locking kfd */
if (!run_pm) {
mutex_lock(_processes_mutex);
-   count = ++kfd_locked;
-   mutex_unlock(_processes_mutex);
-
/* For first KFD device suspend all the KFD processes */
-   if (count == 1)
+   if (++kfd_locked == 1)
kfd_suspend_all_processes();
+   mutex_unlock(_processes_mutex);
}
  
  	for (i = 0; i < kfd->num_nodes; i++) {

@@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
  
  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)

  {
-   int ret, count, i;
+   int ret, i;
  
  	if (!kfd->init_complete)

return 0;
@@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
/* for runtime resume, skip unlocking kfd */
if (!run_pm) {
mutex_lock(_processes_mutex);
-   count = --kfd_locked;
-   mutex_unlock(_processes_mutex);
-
-   WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
-   if (count == 0)
+   if (--kfd_locked == 0)
ret = kfd_resume_all_processes();
+   WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
+   mutex_unlock(_processes_mutex);
}
  
  	return ret;


Re: [PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted

2024-04-02 Thread Felix Kuehling

On 2024-04-01 17:53, Zhigang Luo wrote:

If there are more than one device doing reset in parallel, the first
device will call kfd_suspend_all_processes() to evict all processes
on all devices, this call takes time to finish. other device will
start reset and recover without waiting. if the process has not been
evicted before the recovery is done, it will be restored, which then causes
a page fault.

Signed-off-by: Zhigang Luo
Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd


Please remove the Change-Id: before you push. Other than that, this patch is



---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 041ec3de55e7..55f89c858c7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -969,11 +969,11 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
if (!run_pm) {
mutex_lock(_processes_mutex);
count = ++kfd_locked;
-   mutex_unlock(_processes_mutex);
  
  		/* For first KFD device suspend all the KFD processes */

if (count == 1)
kfd_suspend_all_processes();


This could be simplified now. The variable "count" was only needed for 
the broken attempt to do call suspend outside the lock. Now you can just do:


mutex_lock(_processes_mutex);
if (++kfd_locked == 1)
kfd_suspend_all_processes();
mutex_unlock(_processes_mutex);

To be consistent, we probably need to make a similar change in 
kgd2kfd_resume and run kfd_resume_all_processes under the lock as well. 
Otherwise there could be a race condition between suspend and resume.


Regards,
  Felix



+   mutex_unlock(_processes_mutex);
}
  
  	for (i = 0; i < kfd->num_nodes; i++) {

Re: Proposal to add CRIU support to DRM render nodes

2024-04-01 Thread Felix Kuehling



On 2024-04-01 12:56, Tvrtko Ursulin wrote:


On 01/04/2024 17:37, Felix Kuehling wrote:

On 2024-04-01 11:09, Tvrtko Ursulin wrote:


On 28/03/2024 20:42, Felix Kuehling wrote:


On 2024-03-28 12:03, Tvrtko Ursulin wrote:


Hi Felix,

I had one more thought while browsing around the amdgpu CRIU 
plugin. It appears it relies on the KFD support being compiled in 
and /dev/kfd present, correct? AFAICT at least, it relies on that 
to figure out the amdgpu DRM node.


In would be probably good to consider designing things without 
that dependency. So that checkpointing an application which does 
not use /dev/kfd is possible. Or if the kernel does not even have 
the KFD support compiled in.


Yeah, if we want to support graphics apps that don't use KFD, we 
should definitely do that. Currently we get a lot of topology 
information from KFD, not even from the /dev/kfd device but from 
the sysfs nodes exposed by KFD. We'd need to get GPU device info 
from the render nodes instead. And if KFD is available, we may need 
to integrate both sources of information.





It could perhaps mean no more than adding some GPU discovery code 
into CRIU. Which should be flexible enough to account for things 
like re-assigned minor numbers due to driver reload.


Do you mean adding GPU discovery to the core CRIU, or to the 
plugin. I was thinking this is still part of the plugin.


Yes I agree. I was only thinking about adding some DRM device 
discovery code in a more decoupled fashion from the current plugin, 
for both the reason discussed above (decoupling a bit from reliance 
on kfd sysfs), and then also if/when a new DRM driver might want to 
implement this the code could be move to some common plugin area.


I am not sure how feasible that would be though. The "gpu id" 
concept and it's matching in the current kernel code and CRIU plugin 
- is that value tied to the physical GPU instance or how it works?


The concept of the GPU ID is that it's stable while the system is up, 
even when devices get added and removed dynamically. It was baked 
into the API early on, but I don't think we ever fully validated 
device hot plug. I think the closest we're getting is with our latest 
MI GPUs and dynamic partition mode change.


Doesn't it read the saved gpu id from the image file while doing 
restore and tries to open the render node to match it? Maybe I am 
misreading the code.. But if it does, does it imply that in practice 
it could be stable across reboots? Or that it is not possible to 
restore to a different instance of maybe the same GPU model installed 
in a system?


Ah, the idea is, that when you restore on a different system, you may 
get different GPU IDs. Or you may checkpoint an app running on GPU 1 but 
restore it on GPU 2 on the same system. That's why we need to translate 
GPU IDs in restored applications. User mode still uses the old GPU IDs, 
but the kernel mode driver translates them to the actual GPU IDs of the 
GPUs that the process was restored on.





This also highlights another aspect on those spatially partitioned 
GPUs. GPU IDs identify device partitions, not devices. Similarly, 
each partition has its own render node, and the KFD topology info in 
sysfs points to the render-minor number corresponding to each GPU ID.


I am not familiar with this. This is not SR-IOV but some other kind of 
partitioning? Would you have any links where I could read more?


Right, the bare-metal driver can partition a PF spatially without SRIOV. 
SRIOV can also use spatial partitioning and expose each partition 
through its own VF, but that's not useful for bare metal. Spatial 
partitioning is new in MI300. There is some high-level info in this 
whitepaper: 
https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf.


Regards,
  Felix




Regards,

Tvrtko

Otherwise I am eagerly awaiting to hear more about the design 
specifics around dma-buf handling. And also seeing how to extend 
to other DRM related anonymous fds.


I've been pretty far under-water lately. I hope I'll find time to 
work on this more, but it's probably going to be at least a few weeks.


Got it.

Regards,

Tvrtko



Regards,
   Felix




Regards,

Tvrtko

On 15/03/2024 18:36, Tvrtko Ursulin wrote:


On 15/03/2024 02:33, Felix Kuehling wrote:


On 2024-03-12 5:45, Tvrtko Ursulin wrote:


On 11/03/2024 14:48, Tvrtko Ursulin wrote:


Hi Felix,

On 06/12/2023 21:23, Felix Kuehling wrote:
Executive Summary: We need to add CRIU support to DRM render 
nodes in order to maintain CRIU support for ROCm application 
once they start relying on render nodes for more GPU memory 
management. In this email I'm providing some background why 
we are doing this, and outlining some of the problems we need 
to solve to checkpoint and restore render node state and 
shared memory (DMABuf) state. I have some thoughts on the API 
design, leaning on what we did for KFD, but would like to get 
feedback from the DRI

Re: Proposal to add CRIU support to DRM render nodes

2024-04-01 Thread Felix Kuehling

On 2024-04-01 11:09, Tvrtko Ursulin wrote:


On 28/03/2024 20:42, Felix Kuehling wrote:


On 2024-03-28 12:03, Tvrtko Ursulin wrote:


Hi Felix,

I had one more thought while browsing around the amdgpu CRIU plugin. 
It appears it relies on the KFD support being compiled in and 
/dev/kfd present, correct? AFAICT at least, it relies on that to 
figure out the amdgpu DRM node.


In would be probably good to consider designing things without that 
dependency. So that checkpointing an application which does not use 
/dev/kfd is possible. Or if the kernel does not even have the KFD 
support compiled in.


Yeah, if we want to support graphics apps that don't use KFD, we 
should definitely do that. Currently we get a lot of topology 
information from KFD, not even from the /dev/kfd device but from the 
sysfs nodes exposed by KFD. We'd need to get GPU device info from the 
render nodes instead. And if KFD is available, we may need to 
integrate both sources of information.





It could perhaps mean no more than adding some GPU discovery code 
into CRIU. Which should be flexible enough to account for things 
like re-assigned minor numbers due to driver reload.


Do you mean adding GPU discovery to the core CRIU, or to the plugin. 
I was thinking this is still part of the plugin.


Yes I agree. I was only thinking about adding some DRM device 
discovery code in a more decoupled fashion from the current plugin, 
for both the reason discussed above (decoupling a bit from reliance on 
kfd sysfs), and then also if/when a new DRM driver might want to 
implement this the code could be move to some common plugin area.


I am not sure how feasible that would be though. The "gpu id" concept 
and it's matching in the current kernel code and CRIU plugin - is that 
value tied to the physical GPU instance or how it works?


The concept of the GPU ID is that it's stable while the system is up, 
even when devices get added and removed dynamically. It was baked into 
the API early on, but I don't think we ever fully validated device hot 
plug. I think the closest we're getting is with our latest MI GPUs and 
dynamic partition mode change.


This also highlights another aspect on those spatially partitioned GPUs. 
GPU IDs identify device partitions, not devices. Similarly, each 
partition has its own render node, and the KFD topology info in sysfs 
points to the render-minor number corresponding to each GPU ID.


Regards,
  Felix




Otherwise I am eagerly awaiting to hear more about the design 
specifics around dma-buf handling. And also seeing how to extend to 
other DRM related anonymous fds.


I've been pretty far under-water lately. I hope I'll find time to 
work on this more, but it's probably going to be at least a few weeks.


Got it.

Regards,

Tvrtko



Regards,
   Felix




Regards,

Tvrtko

On 15/03/2024 18:36, Tvrtko Ursulin wrote:


On 15/03/2024 02:33, Felix Kuehling wrote:


On 2024-03-12 5:45, Tvrtko Ursulin wrote:


On 11/03/2024 14:48, Tvrtko Ursulin wrote:


Hi Felix,

On 06/12/2023 21:23, Felix Kuehling wrote:
Executive Summary: We need to add CRIU support to DRM render 
nodes in order to maintain CRIU support for ROCm application 
once they start relying on render nodes for more GPU memory 
management. In this email I'm providing some background why we 
are doing this, and outlining some of the problems we need to 
solve to checkpoint and restore render node state and shared 
memory (DMABuf) state. I have some thoughts on the API design, 
leaning on what we did for KFD, but would like to get feedback 
from the DRI community regarding that API and to what extent 
there is interest in making that generic.


We are working on using DRM render nodes for virtual address 
mappings in ROCm applications to implement the CUDA11-style VM 
API and improve interoperability between graphics and compute. 
This uses DMABufs for sharing buffer objects between KFD and 
multiple render node devices, as well as between processes. In 
the long run this also provides a path to moving all or most 
memory management from the KFD ioctl API to libdrm.


Once ROCm user mode starts using render nodes for virtual 
address management, that creates a problem for checkpointing 
and restoring ROCm applications with CRIU. Currently there is 
no support for checkpointing and restoring render node state, 
other than CPU virtual address mappings. Support will be needed 
for checkpointing GEM buffer objects and handles, their GPU 
virtual address mappings and memory sharing relationships 
between devices and processes.


Eventually, if full CRIU support for graphics applications is 
desired, more state would need to be captured, including 
scheduler contexts and BO lists. Most of this state is 
driver-specific.


After some internal discussions we decided to take our design 
process public as this potentially touches DRM GEM and DMABuf 
APIs and may have implications for other drivers in the future.


One basic question before going in

Re: Proposal to add CRIU support to DRM render nodes

2024-03-28 Thread Felix Kuehling



On 2024-03-28 12:03, Tvrtko Ursulin wrote:


Hi Felix,

I had one more thought while browsing around the amdgpu CRIU plugin. 
It appears it relies on the KFD support being compiled in and /dev/kfd 
present, correct? AFAICT at least, it relies on that to figure out the 
amdgpu DRM node.


In would be probably good to consider designing things without that 
dependency. So that checkpointing an application which does not use 
/dev/kfd is possible. Or if the kernel does not even have the KFD 
support compiled in.


Yeah, if we want to support graphics apps that don't use KFD, we should 
definitely do that. Currently we get a lot of topology information from 
KFD, not even from the /dev/kfd device but from the sysfs nodes exposed 
by KFD. We'd need to get GPU device info from the render nodes instead. 
And if KFD is available, we may need to integrate both sources of 
information.





It could perhaps mean no more than adding some GPU discovery code into 
CRIU. Which should be flexible enough to account for things like 
re-assigned minor numbers due to driver reload.


Do you mean adding GPU discovery to the core CRIU, or to the plugin. I 
was thinking this is still part of the plugin.





Otherwise I am eagerly awaiting to hear more about the design 
specifics around dma-buf handling. And also seeing how to extend to 
other DRM related anonymous fds.


I've been pretty far under-water lately. I hope I'll find time to work 
on this more, but it's probably going to be at least a few weeks.


Regards,
  Felix




Regards,

Tvrtko

On 15/03/2024 18:36, Tvrtko Ursulin wrote:


On 15/03/2024 02:33, Felix Kuehling wrote:


On 2024-03-12 5:45, Tvrtko Ursulin wrote:


On 11/03/2024 14:48, Tvrtko Ursulin wrote:


Hi Felix,

On 06/12/2023 21:23, Felix Kuehling wrote:
Executive Summary: We need to add CRIU support to DRM render 
nodes in order to maintain CRIU support for ROCm application once 
they start relying on render nodes for more GPU memory 
management. In this email I'm providing some background why we 
are doing this, and outlining some of the problems we need to 
solve to checkpoint and restore render node state and shared 
memory (DMABuf) state. I have some thoughts on the API design, 
leaning on what we did for KFD, but would like to get feedback 
from the DRI community regarding that API and to what extent 
there is interest in making that generic.


We are working on using DRM render nodes for virtual address 
mappings in ROCm applications to implement the CUDA11-style VM 
API and improve interoperability between graphics and compute. 
This uses DMABufs for sharing buffer objects between KFD and 
multiple render node devices, as well as between processes. In 
the long run this also provides a path to moving all or most 
memory management from the KFD ioctl API to libdrm.


Once ROCm user mode starts using render nodes for virtual address 
management, that creates a problem for checkpointing and 
restoring ROCm applications with CRIU. Currently there is no 
support for checkpointing and restoring render node state, other 
than CPU virtual address mappings. Support will be needed for 
checkpointing GEM buffer objects and handles, their GPU virtual 
address mappings and memory sharing relationships between devices 
and processes.


Eventually, if full CRIU support for graphics applications is 
desired, more state would need to be captured, including 
scheduler contexts and BO lists. Most of this state is 
driver-specific.


After some internal discussions we decided to take our design 
process public as this potentially touches DRM GEM and DMABuf 
APIs and may have implications for other drivers in the future.


One basic question before going into any API details: Is there a 
desire to have CRIU support for other DRM drivers?


This sounds like a very interesting feature on the overall, 
although I cannot answer on the last question here.


I forgot to finish this thought. I cannot answer / don't know of 
any concrete plans, but I think feature is pretty cool and if 
amdgpu gets it working I wouldn't be surprised if other drivers 
would get interested.


Thanks, that's good to hear!




Funnily enough, it has a tiny relation to an i915 feature I 
recently implemented on Mesa's request, which is to be able to 
"upload" the GPU context from the GPU hang error state and replay 
the hanging request. It is kind of (at a stretch) a very special 
tiny subset of checkout and restore so I am not mentioning it as a 
curiosity.


And there is also another partical conceptual intersect with the 
(at the moment not yet upstream) i915 online debugger. This part 
being in the area of discovering and enumerating GPU resources 
beloning to the client.


I don't see an immediate design or code sharing opportunities 
though but just mentioning.


I did spend some time reading your plugin and kernel 
implementation out of curiousity and have some comments and 
questions.


With that out of the way, some cons

Re: [PATCH] drm/amdgpu: use vm_update_mode=0 as default in sriov for gfx10.3 onwards

2024-03-28 Thread Felix Kuehling



On 2024-03-28 13:59, Danijel Slivka wrote:

Apply this rule to all newer asics in sriov case.
For asic with VF MMIO access protection avoid using CPU for VM table updates.
CPU pagetable updates have issues with HDP flush as VF MMIO access protection
blocks write to BIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL register
during sriov runtime.


Please mention that you moved the check to amdgpu_device_init to ensure 
that it runs after amdgpu_device_ip_early_init where the IP versions are 
discovered.





Signed-off-by: Danijel Slivka 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 6 --
  2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 12dc71a6b5db..59ee902a1eaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4072,6 +4072,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
  
+	if (amdgpu_sriov_vf(adev) &&

+   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)))


Please fix the indentation. The second line should be aligned with the 
open parenthesis from the previous line. You could also remove the extra 
parentheses around the comparison. They're not needed, and IMO they make 
the code less readable.


With that fixed, the patch is

Reviewed-by: Felix Kuehling 



+   /* VF MMIO access (except mailbox range) from CPU
+* will be blocked during sriov runtime
+*/
+   adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
+
amdgpu_gmc_noretry_set(adev);
/* Need to get xgmi info early to decide the reset behavior*/
if (adev->gmc.xgmi.supported) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index aed60aaf1a55..6f01de220c44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -724,12 +724,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device 
*adev)
adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;
}
  
-	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)

-   /* VF MMIO access (except mailbox range) from CPU
-* will be blocked during sriov runtime
-*/
-   adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
-
/* we have the ability to check now */
if (amdgpu_sriov_vf(adev)) {
switch (adev->asic_type) {


Re: [PATCH 1/2] drm/amdgpu: always allocate cleared VRAM for KFD allocations

2024-03-26 Thread Felix Kuehling



On 2024-03-26 11:52, Alex Deucher wrote:

This adds allocation latency, but aligns better with user
expectations.  The latency should improve with the drm buddy
clearing patches that Arun has been working on.


If we submit this before the clear-page-tracking patches are in, this 
will cause unacceptable performance regressions for ROCm applications.


Regards,
  Felix




Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..f9a4ea082821 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1709,7 +1709,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
} else {
-   alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE |
+   AMDGPU_GEM_CREATE_VRAM_CLEARED;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
}


Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new MES FW version

2024-03-26 Thread Felix Kuehling



On 2024-03-25 19:33, Liu, Shaoyun wrote:

[AMD Official Use Only - General]

It can  cause page fault  when the  log size exceed the  page size .


I'd consider that a breaking change in the firmware that should be 
avoided. Is there a way the updated driver can tell the FW the log size 
that it allocated, so that old drivers continue to work with new firmware?


Regards,
  Felix




-Original Message-
From: Kuehling, Felix 
Sent: Monday, March 25, 2024 2:58 PM
To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new 
MES FW version


On 2024-03-22 12:49, shaoyunl wrote:

  From MES version 0x54, the log entry increased and require the log
buffer size to be increased. The 16k is maximum size agreed

What happens when you run the new firmware on an old kernel that only allocates 
4KB?

Regards,
Felix



Signed-off-by: shaoyunl 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++---
   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
   2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ace848e174c..78e4f88f5134 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device 
*adev)
   if (!amdgpu_mes_log_enable)
   return 0;

- r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
+ r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE,
+PAGE_SIZE,
   AMDGPU_GEM_DOMAIN_GTT,
   >mes.event_log_gpu_obj,
   >mes.event_log_gpu_addr, @@ -1548,12 
+1548,11 @@
static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
   uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);

   seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
-  mem, PAGE_SIZE, false);
+  mem, AMDGPU_MES_LOG_BUFFER_SIZE, false);

   return 0;
   }

-
   DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);

   #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 7d4f93fea937..4c8fc3117ef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level {

   #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */
   #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */
+#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size
+for MES */

   struct amdgpu_mes_funcs;



Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read

2024-03-26 Thread Felix Kuehling

On 2024-03-26 12:04, Alam, Dewan wrote:

[AMD Official Use Only - General]

Looping in +@Zhang, Zhaochen

CAM control register can only be written by PF. VF can only read the register. 
In SRIOV VF, the write won't work.
In SRIOV case, CAM's enablement is controlled by the host. Hence, we think the 
enablement status should be decided by the register reading.


Thank you for clarifying that. With that in mind, I would suggest 
changes to the commit headline and description to avoid confusion:


drm/amdgpu: Confirm IH retry CAM enablement by reading the register

Under SRIOV, the IH CAM cannot be enabled by the guest. The host controls
this register. In the guest driver, read the register to confirm whether
the CAM was enabled.

Regards,
  Felix




Thanks,
Dewan

-Original Message-
From: Kuehling, Felix
Sent: Wednesday, March 13, 2024 3:46 PM
To: Alam, Dewan;amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking
Subject: Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read

On 2024-03-13 13:43, Dewan Alam wrote:

IH Retry CAM should be enabled by register reads instead of always being set to 
true.

This explanation sounds odd. Your code is still writing the register first. 
What's the reason for reading back the register? I assume it's not needed for 
enabling the CAM, but to check whether it was enabled successfully. What are 
the configurations where it cannot be enabled successfully?

Two more nit-picks inline ...



Signed-off-by: Dewan Alam
---
   drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 15 +++
   1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index b9e785846637..c330f5a88a06 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -337,13 +337,20 @@ static int vega20_ih_irq_init(struct
amdgpu_device *adev)

   /* Enable IH Retry CAM */
   if (amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 0) ||
- amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2))
+ amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2))
+{
   WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL_ALDEBARAN,
  ENABLE, 1);
- else
+ adev->irq.retry_cam_enabled = REG_GET_FIELD(
+ RREG32_SOC15(OSSSYS, 0,
+ mmIH_RETRY_INT_CAM_CNTL_ALDEBARAN),
+ IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE);
+ } else {

Indentation looks wrong here.


   WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL, ENABLE, 1);
-
- adev->irq.retry_cam_enabled = true;
+ adev->irq.retry_cam_enabled = REG_GET_FIELD(
+ RREG32_SOC15(OSSSYS, 0,
+ mmIH_RETRY_INT_CAM_CNTL),
+ IH_RETRY_INT_CAM_CNTL, ENABLE);
+ }

Wrong indentation.

Regards,
Felix


   /* enable interrupts */
   ret = vega20_ih_toggle_interrupts(adev, true);

Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device

2024-03-26 Thread Felix Kuehling

On 2024-03-26 10:53, Philip Yang wrote:



On 2024-03-25 14:45, Felix Kuehling wrote:

On 2024-03-22 15:57, Zhigang Luo wrote:
it will cause page fault after device recovered if there is a 
process running.


Signed-off-by: Zhigang Luo 
Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 70261eb9b0bb..2867e9186e44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,

  retry:
  amdgpu_amdkfd_pre_reset(adev);
  +    amdgpu_amdkfd_wait_no_process_running(adev);
+


This waits for the processes to be terminated. What would cause the 
processes to be terminated? Why do the processes need to be 
terminated? Isn't it enough if the processes are removed from the 
runlist in pre-reset, so they can no longer execute on the GPU?


mode 1 reset on SRIOV is much faster than BM, kgd2kfd_pre_reset sends 
GPU reset event to user space, don't remove queues from the runlist, 
after mode1 reset is done, there is queue still running and generate 
vm fault because the GPU page table is gone.


I think seeing a page fault during the reset is not a problem. Seeing a 
page fault after the reset would be a bug. The process should not be on 
the runlist after the reset is done.


Waiting for the process to terminate first looks like a workaround, when 
the real bug is maybe that we're not updating the process state 
correctly in pre-reset. All currently running processes should be put 
into evicted state, so they are not put back on the runlist after the reset.


Regards,
  Felix



Regards,

Philip



Regards,
  Felix



amdgpu_device_stop_pending_resets(adev);
    if (from_hypervisor)


Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new MES FW version

2024-03-25 Thread Felix Kuehling



On 2024-03-22 12:49, shaoyunl wrote:

 From MES version 0x54, the log entry increased and require the log buffer
size to be increased. The 16k is maximum size agreed


What happens when you run the new firmware on an old kernel that only 
allocates 4KB?


Regards,
  Felix




Signed-off-by: shaoyunl 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
  2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ace848e174c..78e4f88f5134 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device 
*adev)
if (!amdgpu_mes_log_enable)
return 0;
  
-	r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,

+   r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT,
&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
@@ -1548,12 +1548,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct 
seq_file *m, void *unused)
uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);
  
  	seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,

-mem, PAGE_SIZE, false);
+mem, AMDGPU_MES_LOG_BUFFER_SIZE, false);
  
  	return 0;

  }
  
-

  DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);
  
  #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 7d4f93fea937..4c8fc3117ef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level {
  
  #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */

  #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */
  #define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximum log buffer size for MES */
  
  struct amdgpu_mes_funcs;
  


Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device

2024-03-25 Thread Felix Kuehling

On 2024-03-22 15:57, Zhigang Luo wrote:

it will cause page fault after device recovered if there is a process running.

Signed-off-by: Zhigang Luo 
Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 70261eb9b0bb..2867e9186e44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
  retry:
amdgpu_amdkfd_pre_reset(adev);
  
+	amdgpu_amdkfd_wait_no_process_running(adev);

+


This waits for the processes to be terminated. What would cause the 
processes to be terminated? Why do the processes need to be terminated? 
Isn't it enough if the processes are removed from the runlist in 
pre-reset, so they can no longer execute on the GPU?


Regards,
  Felix



amdgpu_device_stop_pending_resets(adev);
  
  	if (from_hypervisor)


Re: [PATCH] drm/amdkfd: Cleanup workqueue during module unload

2024-03-21 Thread Felix Kuehling



On 2024-03-20 18:52, Mukul Joshi wrote:

Destroy the high priority workqueue that handles interrupts
during KFD node cleanup.

Signed-off-by: Mukul Joshi 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index dd3c43c1ad70..9b6b6e882593 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -104,6 +104,8 @@ void kfd_interrupt_exit(struct kfd_node *node)
 */
flush_workqueue(node->ih_wq);
  
+	destroy_workqueue(node->ih_wq);

+
	kfifo_free(&node->ih_fifo);
  }
  


Re: [PATCH] drm/amdkfd: range check cp bad op exception interrupts

2024-03-21 Thread Felix Kuehling



On 2024-03-13 10:21, Jonathan Kim wrote:

Due to a CP interrupt bug, bad packet garbage exception codes are raised.
Do a range check so that the debugger and runtime do not receive garbage
codes.
Update the user api to guard exception code type checking as well.

Signed-off-by: Jonathan Kim 
Tested-by: Jesse Zhang 


Reviewed-by: Felix Kuehling 



---
  .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c|  3 ++-
  .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c|  3 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  3 ++-
  include/uapi/linux/kfd_ioctl.h  | 17 ++---
  4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..013d0a073b9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -339,7 +339,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
break;
}
kfd_signal_event_interrupt(pasid, context_id0 & 
0x7f, 23);
-   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+  
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..fe2ad0c0de95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -328,7 +328,8 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
-   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+
KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index ff7392336795..5483211c5d3d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -388,7 +388,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
break;
}
kfd_signal_event_interrupt(pasid, sq_int_data, 24);
-   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+  
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 9ce46edc62a5..2040a470ddb4 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -913,14 +913,25 @@ enum kfd_dbg_trap_exception_code {
 KFD_EC_MASK(EC_DEVICE_NEW))
  #define KFD_EC_MASK_PROCESS   (KFD_EC_MASK(EC_PROCESS_RUNTIME) |  \
 KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
+#define KFD_EC_MASK_PACKET 
(KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |\
+
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \
+
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) |   \
+KFD_EC_MASK(EC_QUEUE_PACKET_RESERVED) |
\
+KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | 
\
+
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |\
+
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |   \
+
KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED))
  
  /* Checks for exception code types for KFD search */

+#define KFD_DBG_EC_IS_VALID(ecode) (ecode > EC_NONE && ecode < EC_MAX)
  #define KFD_DBG_EC_TYPE_IS_QUEUE(ecode)   
\
-   

Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info

2024-03-20 Thread Felix Kuehling

On 2024-03-18 16:12, Felix Kuehling wrote:


On 2024-03-15 14:17, Mukul Joshi wrote:

Check cgroup permissions when returning DMA-buf info and
based on cgroup check return the id of the GPU that has
access to the BO.

Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index dfa8c69532d4..f9631f4b1a02 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct 
file *filep,
    /* Find a KFD GPU device that supports the get_dmabuf_info 
query */

  for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
-    if (dev)
+    if (dev && !kfd_devcgroup_check_permission(dev))
  break;
  if (!dev)
  return -EINVAL;
@@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct 
file *filep,

  if (xcp_id >= 0)
  args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id;
  else
-    args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+    args->gpu_id = dev->id;


If I remember correctly, this was meant as a fallback in case for GTT 
BOs where the exporting partition wasn't known and the application 
didn't have access to the first partition. I think the way you wrote 
this, it could also change the behaviour (report the wrong GPU ID) on 
single-partition GPUs, which is probably not intended.


Never mind. I double checked: On single-partition GPUs, bo->xcp_id 
always seems to be 0. So your code won't change the behaviour here. The 
patch is


Reviewed-by: Felix Kuehling 




Maybe this would preserve the behaviour for that case:

...
-    else
+    else if 
(!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev->nodes[0]))

 args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+    else
+    args->gpu_id = dev->id;

Or maybe a more general solution would make DMABuf import work when 
the exporter is really unknown or not even a GPU. This came up not so 
long ago in the context of interop with 3rd-party devices. This may 
require user mode changes as well.


Regards,
  Felix



  args->flags = flags;
    /* Copy metadata buffer to user mode */


Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info

2024-03-20 Thread Felix Kuehling



On 2024-03-20 15:09, Joshi, Mukul wrote:

[AMD Official Use Only - General]


-Original Message-
From: Kuehling, Felix 
Sent: Monday, March 18, 2024 4:13 PM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info


On 2024-03-15 14:17, Mukul Joshi wrote:

Check cgroup permissions when returning DMA-buf info and based on
cgroup check return the id of the GPU that has access to the BO.

Signed-off-by: Mukul Joshi 
---
   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++--
   1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index dfa8c69532d4..f9631f4b1a02 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file
*filep,

 /* Find a KFD GPU device that supports the get_dmabuf_info query */
 for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
-   if (dev)
+   if (dev && !kfd_devcgroup_check_permission(dev))
 break;
 if (!dev)
 return -EINVAL;
@@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file

*filep,

 if (xcp_id >= 0)
 args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id;
 else
-   args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+   args->gpu_id = dev->id;

If I remember correctly, this was meant as a fallback in case for GTT BOs where
the exporting partition wasn't known and the application didn't have access to
the first partition. I think the way you wrote this, it could also change the
behaviour (report the wrong GPU ID) on single-partition GPUs, which is
probably not intended. Maybe this would preserve the behaviour for that
case:


Can you please explain why this could be an issue on a single partition GPU?


What would xcp_id be on a single-partition GPU? If it's < 0, then your 
patch changes the behaviour. Instead or returning the GPU ID from the 
GPU where the memory was allocated, it returns some arbitrary GPU that 
the application has access to.


Regards,
  Felix




Regards,
Mukul


   ...
- else
+ else if (!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev->nodes[0]))

   args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+ else
+ args->gpu_id = dev->id;

Or maybe a more general solution would make DMABuf import work when
the
exporter is really unknown or not even a GPU. This came up not so long
ago in the context of interop with 3rd-party devices. This may require
user mode changes as well.

Regards,
Felix



 args->flags = flags;

 /* Copy metadata buffer to user mode */


Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info

2024-03-18 Thread Felix Kuehling



On 2024-03-15 14:17, Mukul Joshi wrote:

Check cgroup permissions when returning DMA-buf info and
based on cgroup check return the id of the GPU that has
access to the BO.

Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index dfa8c69532d4..f9631f4b1a02 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep,
  
  	/* Find a KFD GPU device that supports the get_dmabuf_info query */

	for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
-   if (dev)
+   if (dev && !kfd_devcgroup_check_permission(dev))
break;
if (!dev)
return -EINVAL;
@@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep,
if (xcp_id >= 0)
args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id;
else
-   args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+   args->gpu_id = dev->id;


If I remember correctly, this was meant as a fallback in case for GTT 
BOs where the exporting partition wasn't known and the application 
didn't have access to the first partition. I think the way you wrote 
this, it could also change the behaviour (report the wrong GPU ID) on 
single-partition GPUs, which is probably not intended. Maybe this would 
preserve the behaviour for that case:


...
-   else
+   else if 
(!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev->nodes[0]))
args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id;
+   else
+   args->gpu_id = dev->id;

Or maybe a more general solution would make DMABuf import work when the 
exporter is really unknown or not even a GPU. This came up not so long 
ago in the context of interop with 3rd-party devices. This may require 
user mode changes as well.


Regards,
  Felix



args->flags = flags;
  
  	/* Copy metadata buffer to user mode */


Re: [PATCH 05/10] drivers: use new capable_any functionality

2024-03-15 Thread Felix Kuehling

On 2024-03-15 7:37, Christian Göttsche wrote:

Use the new added capable_any function in appropriate cases, where a
task is required to have any of two capabilities.

Reorder CAP_SYS_ADMIN last.

Signed-off-by: Christian Göttsche 
Acked-by: Alexander Gordeev  (s390 portion)


Acked-by: Felix Kuehling  (amdkfd portion)



---
v4:
Additional usage in kfd_ioctl()
v3:
rename to capable_any()
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 +--
  drivers/net/caif/caif_serial.c   | 2 +-
  drivers/s390/block/dasd_eckd.c   | 2 +-
  3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index dfa8c69532d4..8c7ebca01c17 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3290,8 +3290,7 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
 * more priviledged access.
 */
if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) {
-   if (!capable(CAP_CHECKPOINT_RESTORE) &&
-   !capable(CAP_SYS_ADMIN)) {
+   if (!capable_any(CAP_CHECKPOINT_RESTORE, CAP_SYS_ADMIN)) {
retcode = -EACCES;
goto err_i1;
}
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index ed3a589def6b..e908b9ce57dc 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -326,7 +326,7 @@ static int ldisc_open(struct tty_struct *tty)
/* No write no play */
if (tty->ops->write == NULL)
return -EOPNOTSUPP;
-   if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_TTY_CONFIG))
+   if (!capable_any(CAP_SYS_TTY_CONFIG, CAP_SYS_ADMIN))
return -EPERM;
  
  	/* release devices to avoid name collision */

diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 373c1a86c33e..8f9a5136306a 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -5384,7 +5384,7 @@ static int dasd_symm_io(struct dasd_device *device, void 
__user *argp)
char psf0, psf1;
int rc;
  
-	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RAWIO))

+   if (!capable_any(CAP_SYS_RAWIO, CAP_SYS_ADMIN))
return -EACCES;
psf0 = psf1 = 0;
  


Re: Proposal to add CRIU support to DRM render nodes

2024-03-14 Thread Felix Kuehling



On 2024-03-12 5:45, Tvrtko Ursulin wrote:


On 11/03/2024 14:48, Tvrtko Ursulin wrote:


Hi Felix,

On 06/12/2023 21:23, Felix Kuehling wrote:
Executive Summary: We need to add CRIU support to DRM render nodes 
in order to maintain CRIU support for ROCm application once they 
start relying on render nodes for more GPU memory management. In 
this email I'm providing some background why we are doing this, and 
outlining some of the problems we need to solve to checkpoint and 
restore render node state and shared memory (DMABuf) state. I have 
some thoughts on the API design, leaning on what we did for KFD, but 
would like to get feedback from the DRI community regarding that API 
and to what extent there is interest in making that generic.


We are working on using DRM render nodes for virtual address 
mappings in ROCm applications to implement the CUDA11-style VM API 
and improve interoperability between graphics and compute. This uses 
DMABufs for sharing buffer objects between KFD and multiple render 
node devices, as well as between processes. In the long run this 
also provides a path to moving all or most memory management from 
the KFD ioctl API to libdrm.


Once ROCm user mode starts using render nodes for virtual address 
management, that creates a problem for checkpointing and restoring 
ROCm applications with CRIU. Currently there is no support for 
checkpointing and restoring render node state, other than CPU 
virtual address mappings. Support will be needed for checkpointing 
GEM buffer objects and handles, their GPU virtual address mappings 
and memory sharing relationships between devices and processes.


Eventually, if full CRIU support for graphics applications is 
desired, more state would need to be captured, including scheduler 
contexts and BO lists. Most of this state is driver-specific.


After some internal discussions we decided to take our design 
process public as this potentially touches DRM GEM and DMABuf APIs 
and may have implications for other drivers in the future.


One basic question before going into any API details: Is there a 
desire to have CRIU support for other DRM drivers?


This sounds like a very interesting feature on the overall, although 
I cannot answer on the last question here.


I forgot to finish this thought. I cannot answer / don't know of any 
concrete plans, but I think feature is pretty cool and if amdgpu gets 
it working I wouldn't be surprised if other drivers would get interested.


Thanks, that's good to hear!




Funnily enough, it has a tiny relation to an i915 feature I recently 
implemented on Mesa's request, which is to be able to "upload" the 
GPU context from the GPU hang error state and replay the hanging 
request. It is kind of (at a stretch) a very special tiny subset of 
checkpoint and restore so I am not mentioning it as a curiosity.


And there is also another partial conceptual intersect with the (at 
the moment not yet upstream) i915 online debugger. This part being in 
the area of discovering and enumerating GPU resources belonging to the 
client.


I don't see an immediate design or code sharing opportunities though 
but just mentioning.


I did spend some time reading your plugin and kernel implementation 
out of curiousity and have some comments and questions.


With that out of the way, some considerations for a possible DRM 
CRIU API (either generic of AMDGPU driver specific): The API goes 
through several phases during checkpoint and restore:


Checkpoint:

 1. Process-info (enumerates objects and sizes so user mode can 
allocate

    memory for the checkpoint, stops execution on the GPU)
 2. Checkpoint (store object metadata for BOs, queues, etc.)
 3. Unpause (resumes execution after the checkpoint is complete)

Restore:

 1. Restore (restore objects, VMAs are not in the right place at 
this time)
 2. Resume (final fixups after the VMAs are sorted out, resume 
execution)


Btw is check-pointing guaranteeing all relevant activity is idled? 
For instance dma_resv objects are free of fences which would need to 
restored for things to continue executing sensibly? Or how is that 
handled?


In our compute use cases, we suspend user mode queues. This can include 
CWSR (compute-wave-save-restore) where the state of in-flight waves is 
stored in memory and can be reloaded and resumed from memory later. We 
don't use any fences other than "eviction fences", that are signaled 
after the queues are suspended. And those fences are never handed to 
user mode. So we don't need to worry about any fence state in the 
checkpoint.


If we extended this to support the kernel mode command submission APIs, 
I would expect that we'd wait for all current submissions to complete, 
and stop new ones from being sent to the HW before taking the 
checkpoint. When we take the checkpoint in the CRIU plugin, the CPU 
threads are already frozen and cannot submit any more work. If we wait 
for all currently pending submissions to dra

Re: [PATCH 2/2] drm/amdkfd: Check preemption status on all XCDs

2024-03-14 Thread Felix Kuehling

On 2024-03-14 12:00, Mukul Joshi wrote:

This patch adds the following functionality:
- Check the queue preemption status on all XCDs in a partition
   for GFX 9.4.3.
- Update the queue preemption debug message to print the queue
   doorbell id for which preemption failed.
- Change the signature of check preemption failed function to
   return a bool instead of uint32_t and pass the MQD manager
   as an argument.

Suggested-by: Jay Cornwall
Signed-off-by: Mukul Joshi
---
  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  3 +--
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 18 +
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |  4 ++-
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  4 +--
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  4 +--
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  4 +--
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 25 ---
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  4 +--
  8 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1ce398ab0b3d..151fabf84040 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1997,8 +1997,7 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,
 * check those fields
 */
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
-   if 
(mqd_mgr->check_preemption_failed(dqm->packet_mgr.priv_queue->queue->mqd)) {
-   dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue 
preemption time out\n");
+   if (mqd_mgr->check_preemption_failed(mqd_mgr, 
dqm->packet_mgr.priv_queue->queue->mqd)) {
while (halt_if_hws_hang)
schedule();
return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 050a6936ff84..cbec8c87c984 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -290,3 +290,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm,
  {
return mm->mqd_size;
  }
+
+bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
+  uint32_t inst)
+{
+   if (doorbell_id) {
+   struct device *dev = node->adev->dev;
+
+   if (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3))


Could this be made more generic? E.g.:

if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0)

Other than that, the series is

Reviewed-by: Felix Kuehling 



+   dev_err(dev, "XCC %d: Queue preemption failed for queue with 
doorbell_id: %x\n",
+   inst, doorbell_id);
+   else
+   dev_err(dev, "Queue preemption failed for queue with 
doorbell_id: %x\n",
+   doorbell_id);
+   return true;
+   }
+
+   return false;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index ba3eebb2ca6d..17cc1f25c8d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -119,7 +119,7 @@ struct mqd_manager {
  #if defined(CONFIG_DEBUG_FS)
int (*debugfs_show_mqd)(struct seq_file *m, void *data);
  #endif
-   uint32_t (*check_preemption_failed)(void *mqd);
+   bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd);
uint64_t (*mqd_stride)(struct mqd_manager *mm,
struct queue_properties *p);
  
@@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev,

  uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev);
  uint64_t kfd_mqd_stride(struct mqd_manager *mm,
struct queue_properties *q);
+bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
+  uint32_t inst);
  #endif /* KFD_MQD_MANAGER_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index 8f9f56f7a8b0..05f3ac2eaef9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void 
*mqd,
q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  
-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
  {
struct cik_mqd *m = (struct cik_mqd *)mqd;
  
-	return m->queue_doorbell_id0;

+   return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
  }
  
  static void update_

Re: [PATCH AUTOSEL 5.15 3/5] drm/amdgpu: Enable gpu reset for S3 abort cases on Raven series

2024-03-13 Thread Felix Kuehling

On 2024-03-11 11:14, Sasha Levin wrote:

From: Prike Liang 

[ Upstream commit c671ec01311b4744b377f98b0b4c6d033fe569b3 ]

Currently, GPU resets can now be performed successfully on the Raven
series. While GPU reset is required for the S3 suspend abort case.
So now can enable gpu reset for S3 abort cases on the Raven series.


This looks suspicious to me. I'm not sure what conditions made the GPU 
reset successful. But unless all the changes involved were also 
backported, this should probably not be applied to older kernel 
branches. I'm speculating it may be related to the removal of AMD IOMMUv2.


Regards,
  Felix




Signed-off-by: Prike Liang 
Acked-by: Alex Deucher 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
  drivers/gpu/drm/amd/amdgpu/soc15.c | 45 +-
  1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 6a3486f52d698..ef5b3eedc8615 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -605,11 +605,34 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
return AMD_RESET_METHOD_MODE1;
  }
  
+static bool soc15_need_reset_on_resume(struct amdgpu_device *adev)

+{
+   u32 sol_reg;
+
+   sol_reg = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81);
+
+   /* Will reset for the following suspend abort cases.
+* 1) Only reset limit on APU side, dGPU hasn't checked yet.
+* 2) S3 suspend abort and TOS already launched.
+*/
+   if (adev->flags & AMD_IS_APU && adev->in_s3 &&
+   !adev->suspend_complete &&
+   sol_reg)
+   return true;
+
+   return false;
+}
+
  static int soc15_asic_reset(struct amdgpu_device *adev)
  {
/* original raven doesn't have full asic reset */
-   if ((adev->apu_flags & AMD_APU_IS_RAVEN) ||
-   (adev->apu_flags & AMD_APU_IS_RAVEN2))
+   /* On the latest Raven, the GPU reset can be performed
+* successfully. So now, temporarily enable it for the
+* S3 suspend abort case.
+*/
+   if (((adev->apu_flags & AMD_APU_IS_RAVEN) ||
+   (adev->apu_flags & AMD_APU_IS_RAVEN2)) &&
+   !soc15_need_reset_on_resume(adev))
return 0;
  
  	switch (soc15_asic_reset_method(adev)) {

@@ -1490,24 +1513,6 @@ static int soc15_common_suspend(void *handle)
return soc15_common_hw_fini(adev);
  }
  
-static bool soc15_need_reset_on_resume(struct amdgpu_device *adev)

-{
-   u32 sol_reg;
-
-   sol_reg = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81);
-
-   /* Will reset for the following suspend abort cases.
-* 1) Only reset limit on APU side, dGPU hasn't checked yet.
-* 2) S3 suspend abort and TOS already launched.
-*/
-   if (adev->flags & AMD_IS_APU && adev->in_s3 &&
-   !adev->suspend_complete &&
-   sol_reg)
-   return true;
-
-   return false;
-}
-
  static int soc15_common_resume(void *handle)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;


Re: [PATCH] drm/amdgpu: Do a basic health check before reset

2024-03-13 Thread Felix Kuehling



On 2024-03-13 5:41, Lijo Lazar wrote:

Check if the device is present in the bus before trying to recover. It
could be that device itself is lost from the bus in some hang
situations.

Signed-off-by: Lijo Lazar 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++
  1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void 
amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
  
  }
  
+static int amdgpu_device_health_check(struct list_head *device_list_handle)

+{
+   struct amdgpu_device *tmp_adev;
+   int ret = 0;
+   u32 status;
+
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+   pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+   if (PCI_POSSIBLE_ERROR(status)) {
+   dev_err(tmp_adev->dev, "device lost from bus!");
+   ret = -ENODEV;


You could just return here. What's the point of looking for other 
devices if you're going to return an error anyway?


Regards,
  Felix



+   }
+   }
+
+   return ret;
+}
+
  /**
   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
   *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	device_list_handle = &device_list;
}
  
+	if (!amdgpu_sriov_vf(adev)) {

+   r = amdgpu_device_health_check(device_list_handle);
+   if (r)
+   goto end_reset;
+   }
+
/* We need to lock reset domain only once both for XGMI and single 
device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
  
+end_reset:

if (hive) {
	mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);


Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read

2024-03-13 Thread Felix Kuehling

On 2024-03-13 13:43, Dewan Alam wrote:

IH Retry CAM should be enabled by register reads instead of always being set to 
true.
This explanation sounds odd. Your code is still writing the register 
first. What's the reason for reading back the register? I assume it's 
not needed for enabling the CAM, but to check whether it was enabled 
successfully. What are the configurations where it cannot be enabled 
successfully?


Two more nit-picks inline ...




Signed-off-by: Dewan Alam 
---
  drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 15 +++
  1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c 
b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index b9e785846637..c330f5a88a06 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -337,13 +337,20 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev)
  
  	/* Enable IH Retry CAM */

if (amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 0) ||
-   amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2))
+   amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2)) {
WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL_ALDEBARAN,
   ENABLE, 1);
-   else
+   adev->irq.retry_cam_enabled = REG_GET_FIELD(
+   RREG32_SOC15(OSSSYS, 0,
+   mmIH_RETRY_INT_CAM_CNTL_ALDEBARAN),
+   IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE);
+   } else {


Indentation looks wrong here.


WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL, ENABLE, 1);
-
-   adev->irq.retry_cam_enabled = true;
+   adev->irq.retry_cam_enabled = REG_GET_FIELD(
+   RREG32_SOC15(OSSSYS, 0,
+   mmIH_RETRY_INT_CAM_CNTL),
+   IH_RETRY_INT_CAM_CNTL, ENABLE);
+   }


Wrong indentation.

Regards,
  Felix

  
  	/* enable interrupts */

ret = vega20_ih_toggle_interrupts(adev, true);


Re: [PATCH v3] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload

2024-03-12 Thread Felix Kuehling

On 2024-03-08 14:00, Ahmad Rehman wrote:

In passthrough environment, when amdgpu is reloaded after unload, mode-1
is triggered after initializing the necessary IPs, That init does not
include KFD, and KFD init waits until the reset is completed. KFD init
is called in the reset handler, but in this case, the zone device and
drm client is not initialized, causing app to create kernel panic.

v2: Removing the init KFD condition from amdgpu_amdkfd_drm_client_create.
As the previous version has the potential of creating DRM client twice.

v3: v2 patch results in SDMA engine hung as DRM open causes VM clear to SDMA
before SDAM init. Adding the condition to in drm client creation, on top of v1,
to guard against drm client creation call multiple times.

Signed-off-by: Ahmad Rehman 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 5 -
  2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index f5f2945711be..4389d24f36e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -146,8 +146,8 @@ int amdgpu_amdkfd_drm_client_create(struct amdgpu_device 
*adev)
  {
int ret;
  
-	if (!adev->kfd.init_complete)

-   return 0;
+   if (!adev->kfd.init_complete || adev->kfd.client.dev)
+return 0;
  
  	ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd",

  			      &kfd_client_funcs);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 15b188aaf681..80b9642f2bc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct 
work_struct *work)
}
for (i = 0; i < mgpu_info.num_dgpu; i++) {
adev = mgpu_info.gpu_ins[i].adev;
-   if (!adev->kfd.init_complete)
+   if (!adev->kfd.init_complete) {
+   kgd2kfd_init_zone_device(adev);
amdgpu_amdkfd_device_init(adev);
+   amdgpu_amdkfd_drm_client_create(adev);
+   }
amdgpu_ttm_set_buffer_funcs_status(adev, true);
}
  }


Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore

2024-03-11 Thread Felix Kuehling

On 2024-03-11 12:33, Christian König wrote:

Am 11.03.24 um 16:33 schrieb Felix Kuehling:

On 2024-03-11 11:25, Joshi, Mukul wrote:

[AMD Official Use Only - General]


-Original Message-
From: Christian König 
Sent: Monday, March 11, 2024 2:50 AM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix 
Subject: Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process
restore

Caution: This message originated from an External Source. Use 
proper caution

when opening attachments, clicking links, or responding.


Am 08.03.24 um 17:22 schrieb Mukul Joshi:

In certain situations, some apps can import a BO multiple times
(through IPC for example). To restore such processes successfully, we
need to tell drm to ignore duplicate BOs.
While at it, also add additional logging to prevent silent failures
when process restore fails.

Signed-off-by: Mukul Joshi 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14

++

   1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index bf8e6653341f..65d808d8b5da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2869,14 +2869,16 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

   mutex_lock(&process_info->lock);

- drm_exec_init(&exec, 0);
+ drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES);
   drm_exec_until_all_locked() {
   list_for_each_entry(peer_vm, 
_info->vm_list_head,

   vm_list_node) {
   ret = amdgpu_vm_lock_pd(peer_vm, , 2);
drm_exec_retry_on_contention();
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ pr_err("Locking VM PD failed, ret:
+ %d\n", ret);
   goto ttm_reserve_fail;
+ }
That's a bad idea. Locking can always be interrupted and that would 
print an

error here.


Thanks Christian. Will send out a patch to change it to pr_debug.


We cannot get interrupted here because we're in a worker thread. We 
should be running in non-interruptible mode.


Ah! Ok in that case this isn't necessary.

But in general I think we should avoid error printing like that. If we 
want to know where something failed there is a function tracker for that.


In this case, it was hard to know that something failed at all. The 
problem manifested as a soft-hang in an application, and it took several 
teams several days to track it down to an eviction/restore problem in 
kernel mode. A failure to reserve BOs seems like the type of problem 
that is not expected here, and would justify an error or warning message 
in the kernel log. That would have helped track down this issue much faster.


Regards,
  Felix




Regards,
Christian.



Regards,
  Felix




Regards,
Mukul


Regards,
Christian.


   }

   /* Reserve all BOs and page tables/directory. Add all
BOs from @@ -2889,8 +2891,10 @@ int

amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

   gobj = >bo->tbo.base;
   ret = drm_exec_prepare_obj(, gobj, 1);
drm_exec_retry_on_contention();
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ pr_err("drm_exec_prepare_obj failed,
+ ret: %d\n", ret);
   goto ttm_reserve_fail;
+ }
   }
   }

@@ -2950,8 +2954,10 @@ int

amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

    * validations above would invalidate DMABuf imports again.
    */
   ret = process_validate_vms(process_info, );
- if (ret)
+ if (ret) {
+ pr_err("Validating VMs failed, ret: %d\n", ret);
   goto validate_map_fail;
+ }

   /* Update mappings not managed by KFD */
   list_for_each_entry(peer_vm, _info->vm_list_head,




Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore

2024-03-11 Thread Felix Kuehling

On 2024-03-11 11:25, Joshi, Mukul wrote:

[AMD Official Use Only - General]


-Original Message-
From: Christian König 
Sent: Monday, March 11, 2024 2:50 AM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix 
Subject: Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process
restore

Caution: This message originated from an External Source. Use proper caution
when opening attachments, clicking links, or responding.


Am 08.03.24 um 17:22 schrieb Mukul Joshi:

In certain situations, some apps can import a BO multiple times
(through IPC for example). To restore such processes successfully, we
need to tell drm to ignore duplicate BOs.
While at it, also add additional logging to prevent silent failures
when process restore fails.

Signed-off-by: Mukul Joshi 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14

++

   1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index bf8e6653341f..65d808d8b5da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2869,14 +2869,16 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

   mutex_lock(&process_info->lock);

- drm_exec_init(&exec, 0);
+ drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES);
   drm_exec_until_all_locked() {
   list_for_each_entry(peer_vm, _info->vm_list_head,
   vm_list_node) {
   ret = amdgpu_vm_lock_pd(peer_vm, , 2);
   drm_exec_retry_on_contention();
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ pr_err("Locking VM PD failed, ret:
+ %d\n", ret);
   goto ttm_reserve_fail;
+ }

That's a bad idea. Locking can always be interrupted and that would print an
error here.


Thanks Christian. Will send out a patch to change it to pr_debug.


We cannot get interrupted here because we're in a worker thread. We 
should be running in non-interruptible mode.


Regards,
  Felix




Regards,
Mukul


Regards,
Christian.


   }

   /* Reserve all BOs and page tables/directory. Add all
BOs from @@ -2889,8 +2891,10 @@ int

amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

   gobj = >bo->tbo.base;
   ret = drm_exec_prepare_obj(, gobj, 1);
   drm_exec_retry_on_contention();
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ pr_err("drm_exec_prepare_obj failed,
+ ret: %d\n", ret);
   goto ttm_reserve_fail;
+ }
   }
   }

@@ -2950,8 +2954,10 @@ int

amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *

* validations above would invalidate DMABuf imports again.
*/
   ret = process_validate_vms(process_info, );
- if (ret)
+ if (ret) {
+ pr_err("Validating VMs failed, ret: %d\n", ret);
   goto validate_map_fail;
+ }

   /* Update mappings not managed by KFD */
   list_for_each_entry(peer_vm, _info->vm_list_head,


Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore

2024-03-08 Thread Felix Kuehling

On 2024-03-08 11:22, Mukul Joshi wrote:

In certain situations, some apps can import a BO multiple times
(through IPC for example). To restore such processes successfully,
we need to tell drm to ignore duplicate BOs.
While at it, also add additional logging to prevent silent failures
when process restore fails.

Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++
  1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index bf8e6653341f..65d808d8b5da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2869,14 +2869,16 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence __rcu *
  
  	mutex_lock(&process_info->lock);
  
-	drm_exec_init(&exec, 0);

+   drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES);
drm_exec_until_all_locked() {
list_for_each_entry(peer_vm, _info->vm_list_head,
vm_list_node) {
ret = amdgpu_vm_lock_pd(peer_vm, , 2);
drm_exec_retry_on_contention();
-   if (unlikely(ret))
+   if (unlikely(ret)) {
+   pr_err("Locking VM PD failed, ret: %d\n", ret);


pr_err makes sense here as it indicates a persistent problem that would 
cause soft hangs, like in this case.




goto ttm_reserve_fail;
+   }
}
  
  		/* Reserve all BOs and page tables/directory. Add all BOs from

@@ -2889,8 +2891,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence __rcu *
gobj = >bo->tbo.base;
ret = drm_exec_prepare_obj(, gobj, 1);
drm_exec_retry_on_contention();
-   if (unlikely(ret))
+   if (unlikely(ret)) {
+   pr_err("drm_exec_prepare_obj failed, ret: 
%d\n", ret);


Same here, pr_err is fine.



goto ttm_reserve_fail;
+   }
}
}
  
@@ -2950,8 +2954,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *

 * validations above would invalidate DMABuf imports again.
 */
ret = process_validate_vms(process_info, );
-   if (ret)
+   if (ret) {
+   pr_err("Validating VMs failed, ret: %d\n", ret);


I'd make this a pr_debug to avoid spamming the log. validation can fail 
intermittently and rescheduling the worker is there to handle it.


With that fixed, the patch is

Reviewed-by: Felix Kuehling 



goto validate_map_fail;
+   }
  
  	/* Update mappings not managed by KFD */

list_for_each_entry(peer_vm, _info->vm_list_head,


Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence

2024-03-07 Thread Felix Kuehling

On 2024-03-07 1:39, Sharma, Shashank wrote:


On 07/03/2024 00:54, Felix Kuehling wrote:


On 2024-03-06 09:41, Shashank Sharma wrote:

From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
 - rebase
 - set dma_fence_error only in case of error
 - add tlb_flush fence only when PT/PD BO is locked (Felix)
 - use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
 - move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly (Christian)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Reviewed-by: Shashank Sharma 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
  drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  10 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |   4 +
  .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 
++

  4 files changed, 128 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile

index fa26a4e3a99d..91ab4cf29b5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
amdgpu_kms.o \

  amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
  atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
  atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o 
amdgpu_pll.o \
+    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o 
amdgpu_vm_tlb_fence.o \

+    amdgpu_ib.o amdgpu_pll.o \
  amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
  amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o 
amdgpu_virt.o \

  amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 0960e0a665d3..310aae6fb49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device 
*adev, struct amdgpu_vm *vm,

    r = vm->update_funcs->commit(&params, fence);
  +    /* Prepare a TLB flush fence to be attached to PTs */
+    if (!unlocked && params.needs_flush && vm->is_compute_context) {
+    amdgpu_vm_tlb_fence_create(adev, vm, fence);


This schedules a TLB flush after "fence" signals and replaces "fence" 
with a new one that will signal after the TLB flush is done. That 
part I understand.


I'm not sure why this only applies to compute contexts.



+
+    /* Makes sure no PD/PT is freed before the flush */
+    dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+   DMA_RESV_USAGE_BOOKKEEP);


But what's the point of adding the fence to the page table 
reservation? This is after the BOs have already been freed. Maybe it 
would make more sense to move this into the next patch, where the 
freeing is done after this point.


To make it easier for code review, the split of the patches is like:
- one patch introduces function creating tlb_flush_fence and uses it

- the second patch does the rework and movement of freeing of the 
buffer after the patch attach.


If we move this change into next patch, in this patch we will just 
create the fence, where one can argue why create the fence if no one 
is using it.


May be, we can make 'changes in freeing of buffers' as first patch in 
sequence, and make this second patch in the series, so that you know 
the background of changes better.


Sure. I don't think it's super important. I was just trying to 
understand how the two patches fit together. I think it makes sense now. 
I discussed this also with Philip offline. We think there may be an 
easier way to solve the "wait for TLB flush before freeing BOs" thing, 
but I believe using the new TLB flush fence is architecturally cleaner, 
and that fence will be useful to solve some other issues that are either 
still lingering, or currently have only some ugly workarounds. I'll need 
to dig through the code and my memory to remember the details.


I'm still not sure whether the creation of the TLB flush fence should be 
limited to compute contexts, but I'm happy to get them at least there 
for now. The series is


Acked-by: Felix Kuehling 

Regards,
  Felix




- Shashank



Regards,
  Felix



+    }
+
  error_unlock:
  amdgpu_vm_eviction_unlock(vm);
  drm_dev_exit(idx);
@@ -2237,6 +2246

Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence

2024-03-06 Thread Felix Kuehling



On 2024-03-06 09:41, Shashank Sharma wrote:

From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
 - rebase
 - set dma_fence_error only in case of error
 - add tlb_flush fence only when PT/PD BO is locked (Felix)
 - use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
 - move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly (Christian)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Reviewed-by: Shashank Sharma 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
  drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  10 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   4 +
  .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++
  4 files changed, 128 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index fa26a4e3a99d..91ab4cf29b5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
amdgpu_kms.o \
amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-   amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+   amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+   amdgpu_ib.o amdgpu_pll.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0960e0a665d3..310aae6fb49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
  
  	r = vm->update_funcs->commit(&params, fence);
  
+	/* Prepare a TLB flush fence to be attached to PTs */

+   if (!unlocked && params.needs_flush && vm->is_compute_context) {
+   amdgpu_vm_tlb_fence_create(adev, vm, fence);


This schedules a TLB flush after "fence" signals and replaces "fence" 
with a new one that will signal after the TLB flush is done. That part I 
understand.


I'm not sure why this only applies to compute contexts.



+
+   /* Makes sure no PD/PT is freed before the flush */
+   dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+  DMA_RESV_USAGE_BOOKKEEP);


But what's the point of adding the fence to the page table reservation? 
This is after the BOs have already been freed. Maybe it would make more 
sense to move this into the next patch, where the freeing is done after 
this point.


Regards,
  Felix



+   }
+
  error_unlock:
amdgpu_vm_eviction_unlock(vm);
drm_dev_exit(idx);
@@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
  
  	mutex_init(&vm->eviction_lock);

vm->evicting = false;
+   vm->tlb_fence_context = dma_fence_context_alloc(1);
  
  	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,

					false, &root, xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 64b3f69efa57..298f604b8e5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -341,6 +341,7 @@ struct amdgpu_vm {
atomic64_t  tlb_seq;
uint64_ttlb_seq_va;
uint64_t*tlb_seq_cpu_addr;
+   uint64_ttlb_fence_context;
  
  	atomic64_t		kfd_last_flushed_seq;
  
@@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,

  uint64_t addr,
  uint32_t status,
  unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+struct amdgpu_vm *vm,
+struct dma_fence **fence);
  
  #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index 0

Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode

2024-03-05 Thread Felix Kuehling

On 2024-03-05 14:49, Dhume, Samir wrote:

[AMD Official Use Only - General]


-Original Message-
From: Kuehling, Felix 
Sent: Monday, March 4, 2024 6:47 PM
To: Dhume, Samir ; amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Wan, Gavin ;
Liu, Leo ; Deucher, Alexander

Subject: Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode


On 2024-03-04 10:19, Samir Dhume wrote:

Signed-off-by: Samir Dhume 

Please add a meaningful commit description to all the patches in the series.
See one more comment below.


Right!




---
   drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 34

+++-

   1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

index fec5a3d1c4bc..f666ececbe7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -82,17 +82,37 @@ static unsigned sdma_v4_4_2_seq_to_irq_id(int

seq_num)

 }
   }

-static int sdma_v4_4_2_irq_id_to_seq(unsigned client_id)
+static int sdma_v4_4_2_irq_id_to_seq(struct amdgpu_device *adev,

unsigned client_id)

   {
+
+   struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
+   bool sriov_cpx_odd = false;
+   int mode;
+
+   if (amdgpu_sriov_vf(adev)) {
+   mode = xcp_mgr->funcs->query_partition_mode(xcp_mgr);

This queries an MMIO register for the current mode. Is that really
necessary to do in the interrupt handler? Could we use the partition
mode stored in xcp_mgr->mode instead?

The design appears to be that even when the host sets the mode to DPX/QPX/CPX, 
each guest sets itself to be in the SPX mode and xcp_mgr->mode is set to SPX.
But I can use a new field in xcp_mgr to reflect the system mode set by the host 
and remove the MMIO access from the interrupt handler.


Can you clarify what it means when the host and guest see a different 
partition mode? Is this the case, where the host partitions the device 
into several VFs, and the guest partitions those VFs further into 
smaller partitions? As far as I know, that finer partitioning in the 
guest is actually controlled by the host as well. If the guest sees SPX 
mode, it means it doesn't partition the VF into smaller pieces.


Instead of looking at the partition mode, would it make more sense to 
just query the number of XCDs in the partition (from the xcc_mask)? That 
should give the right answer regardless of how the host partitioned the GPU.


Regards,
  Felix




Thanks,
samir


Regards,
Felix



+
+   if (mode == AMDGPU_CPX_PARTITION_MODE) {
+   if (adev->gfx.funcs->get_xcc_id(adev, 0) & 0x1)
+   sriov_cpx_odd = true;
+   }
+   }
+
 switch (client_id) {
 case SOC15_IH_CLIENTID_SDMA0:
 return 0;
 case SOC15_IH_CLIENTID_SDMA1:
 return 1;
 case SOC15_IH_CLIENTID_SDMA2:
-   return 2;
+   if (sriov_cpx_odd)
+   return 0;
+   else
+   return 2;
 case SOC15_IH_CLIENTID_SDMA3:
-   return 3;
+   if (sriov_cpx_odd)
+   return 1;
+   else
+   return 3;
 default:
 return -EINVAL;
 }
@@ -1541,7 +1561,7 @@ static int sdma_v4_4_2_process_trap_irq(struct

amdgpu_device *adev,

 uint32_t instance, i;

 DRM_DEBUG("IH: SDMA trap\n");
-   instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);

 /* Client id gives the SDMA instance in AID. To know the exact SDMA
  * instance, interrupt entry gives the node id which corresponds to the

AID instance.

@@ -1584,7 +1604,7 @@ static int

sdma_v4_4_2_process_ras_data_cb(struct amdgpu_device *adev,

 if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA))
 goto out;

-   instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
 if (instance < 0)
 goto out;

@@ -1603,7 +1623,7 @@ static int

sdma_v4_4_2_process_illegal_inst_irq(struct amdgpu_device *adev,

 DRM_ERROR("Illegal instruction in SDMA command stream\n");

-   instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
 if (instance < 0)
 return 0;

@@ -1647,7 +1667,7 @@ static int sdma_v4_4_2_print_iv_entry(struct

amdgpu_device *adev,

 struct amdgpu_task_info task_info;
 u64 addr;

-   instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
 if (instance < 0 || instance >= adev->sdma.num_instances) {
 dev_err(adev->dev, "sdma instance invalid %d\n", instance);
 return -EINVAL;


Re: [PATCH] drm/amdkfd: make kfd_class constant

2024-03-05 Thread Felix Kuehling

On 2024-03-05 7:15, Ricardo B. Marliere wrote:

Since commit 43a7206b0963 ("driver core: class: make class_register() take
a const *"), the driver core allows for struct class to be in read-only
memory, so move the kfd_class structure to be declared at build time
placing it into read-only memory, instead of having to be dynamically
allocated at boot time.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 


The patch looks good to me. Do you want me to apply this to Alex's 
amd-staging-drm-next?


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 21 +++--
  1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f030cafc5a0a..dfa8c69532d4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -63,8 +63,10 @@ static const struct file_operations kfd_fops = {
  };
  
  static int kfd_char_dev_major = -1;

-static struct class *kfd_class;
  struct device *kfd_device;
+static const struct class kfd_class = {
+   .name = kfd_dev_name,
+};
  
  static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)

  {
@@ -94,14 +96,13 @@ int kfd_chardev_init(void)
if (err < 0)
goto err_register_chrdev;
  
-	kfd_class = class_create(kfd_dev_name);

-   err = PTR_ERR(kfd_class);
-   if (IS_ERR(kfd_class))
+   err = class_register(_class);
+   if (err)
goto err_class_create;
  
-	kfd_device = device_create(kfd_class, NULL,

-   MKDEV(kfd_char_dev_major, 0),
-   NULL, kfd_dev_name);
+   kfd_device = device_create(_class, NULL,
+  MKDEV(kfd_char_dev_major, 0),
+  NULL, kfd_dev_name);
err = PTR_ERR(kfd_device);
if (IS_ERR(kfd_device))
goto err_device_create;
@@ -109,7 +110,7 @@ int kfd_chardev_init(void)
return 0;
  
  err_device_create:

-   class_destroy(kfd_class);
+   class_unregister(_class);
  err_class_create:
unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
  err_register_chrdev:
@@ -118,8 +119,8 @@ int kfd_chardev_init(void)
  
  void kfd_chardev_exit(void)

  {
-   device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0));
-   class_destroy(kfd_class);
+   device_destroy(_class, MKDEV(kfd_char_dev_major, 0));
+   class_unregister(_class);
unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
kfd_device = NULL;
  }

---
base-commit: 8bc75586ea01f1c645063d3472c115ecab03e76c
change-id: 20240305-class_cleanup-drm-amd-bdc7255b7540

Best regards,


Re: [PATCH] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload

2024-03-05 Thread Felix Kuehling


On 2024-03-04 19:20, Rehman, Ahmad wrote:


[AMD Official Use Only - General]


Hey,


Due to mode-1 reset (pending_reset), the amdgpu_amdkfd_device_init 
will not be called and hence adev->kfd.init_complete will not be set. 
The function amdgpu_amdkfd_drm_client_create has condition:

if (!adev->kfd.init_complete)
              return 0;
So, in probe function, when we return from device_init the KFD is not 
initialized and amdgpu_amdkfd_drm_client_create returns without doing 
anything.


I think your change could result in calling 
amdgpu_amdkfd_drm_client_create multiple times. IIRC, one purpose of 
moving the call to amdgpu_pci_probe was to ensure that it is only called 
once, because it only gets unregistered once when the driver is 
unloaded. Maybe it would be better to remove the if 
(!adev->kfd.init_complete) condition from 
amdgpu_amdkfd_drm_client_create. That way we would always create the 
client at probe and it would be ready when it's needed after the GPU reset.



There is a chance that the client would get created unnecessarily if KFD 
init never succeeds. But that should be rare, and it's not a big 
resource waste.



There were some comments on a previous code review, that creating the 
DRM client too early could cause problems. But I don't understand what 
that problem could be. As I understand it, the adev->kfd.client is just 
a place to put GEM handles for KFD BOs that we don't want to expose to 
user mode. I see no harm in creating this client too early or when it's 
not needed.



Regards,
  Felix




Thanks,
Ahmad


*From:* Kuehling, Felix 
*Sent:* Monday, March 4, 2024 6:39 PM
*To:* Rehman, Ahmad ; 
amd-gfx@lists.freedesktop.org 

*Cc:* Wan, Gavin 
*Subject:* Re: [PATCH] drm/amdgpu: Init zone device and drm client 
after mode-1 reset on reload


On 2024-03-04 17:05, Ahmad Rehman wrote:
> In passthrough environment, when amdgpu is reloaded after unload, mode-1
> is triggered after initializing the necessary IPs. That init does not
> include KFD, and KFD init waits until the reset is completed. KFD init
> is called in the reset handler, but in this case, the zone device and
> drm client is not initialized, causing app to create kernel panic.
>
> Signed-off-by: Ahmad Rehman 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 -
>   1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> index 15b188aaf681..80b9642f2bc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2479,8 +2479,11 @@ static void 
amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)

>    }
>    for (i = 0; i < mgpu_info.num_dgpu; i++) {
>    adev = mgpu_info.gpu_ins[i].adev;
> - if (!adev->kfd.init_complete)
> + if (!adev->kfd.init_complete) {
> + kgd2kfd_init_zone_device(adev);
>    amdgpu_amdkfd_device_init(adev);
> + amdgpu_amdkfd_drm_client_create(adev);
I don't see what's preventing the DRM client initialization in the
reset-on-driver-load case. It only needs to be created once and that
happens in amdgpu_pci_probe. Am I missing anything?

Regards,
   Felix


> + }
>    amdgpu_ttm_set_buffer_funcs_status(adev, true);
>    }
>   }

Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode

2024-03-04 Thread Felix Kuehling



On 2024-03-04 10:19, Samir Dhume wrote:

Signed-off-by: Samir Dhume 


Please add a meaningful commit description to all the patches in the 
series. See one more comment below.




---
  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 34 +++-
  1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index fec5a3d1c4bc..f666ececbe7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -82,17 +82,37 @@ static unsigned sdma_v4_4_2_seq_to_irq_id(int seq_num)
}
  }
  
-static int sdma_v4_4_2_irq_id_to_seq(unsigned client_id)

+static int sdma_v4_4_2_irq_id_to_seq(struct amdgpu_device *adev, unsigned 
client_id)
  {
+
+   struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
+   bool sriov_cpx_odd = false;
+   int mode;
+
+   if (amdgpu_sriov_vf(adev)) {
+   mode = xcp_mgr->funcs->query_partition_mode(xcp_mgr);


This queries an MMIO register for the current mode. Is that really 
necessary to do in the interrupt handler? Could we use the partition 
mode stored in xcp_mgr->mode instead?


Regards,
  Felix



+
+   if (mode == AMDGPU_CPX_PARTITION_MODE) {
+   if (adev->gfx.funcs->get_xcc_id(adev, 0) & 0x1)
+   sriov_cpx_odd = true;
+   }
+   }
+
switch (client_id) {
case SOC15_IH_CLIENTID_SDMA0:
return 0;
case SOC15_IH_CLIENTID_SDMA1:
return 1;
case SOC15_IH_CLIENTID_SDMA2:
-   return 2;
+   if (sriov_cpx_odd)
+   return 0;
+   else
+   return 2;
case SOC15_IH_CLIENTID_SDMA3:
-   return 3;
+   if (sriov_cpx_odd)
+   return 1;
+   else
+   return 3;
default:
return -EINVAL;
}
@@ -1541,7 +1561,7 @@ static int sdma_v4_4_2_process_trap_irq(struct 
amdgpu_device *adev,
uint32_t instance, i;
  
  	DRM_DEBUG("IH: SDMA trap\n");

-   instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
  
  	/* Client id gives the SDMA instance in AID. To know the exact SDMA

 * instance, interrupt entry gives the node id which corresponds to the 
AID instance.
@@ -1584,7 +1604,7 @@ static int sdma_v4_4_2_process_ras_data_cb(struct 
amdgpu_device *adev,
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA))
goto out;
  
-	instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);

+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
if (instance < 0)
goto out;
  
@@ -1603,7 +1623,7 @@ static int sdma_v4_4_2_process_illegal_inst_irq(struct amdgpu_device *adev,
  
  	DRM_ERROR("Illegal instruction in SDMA command stream\n");
  
-	instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);

+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
if (instance < 0)
return 0;
  
@@ -1647,7 +1667,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,

struct amdgpu_task_info task_info;
u64 addr;
  
-	instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);

+   instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
if (instance < 0 || instance >= adev->sdma.num_instances) {
dev_err(adev->dev, "sdma instance invalid %d\n", instance);
return -EINVAL;


Re: [PATCH] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload

2024-03-04 Thread Felix Kuehling



On 2024-03-04 17:05, Ahmad Rehman wrote:

In passthrough environment, when amdgpu is reloaded after unload, mode-1
is triggered after initializing the necessary IPs. That init does not
include KFD, and KFD init waits until the reset is completed. KFD init
is called in the reset handler, but in this case, the zone device and
drm client is not initialized, causing app to create kernel panic.

Signed-off-by: Ahmad Rehman 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 15b188aaf681..80b9642f2bc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct 
work_struct *work)
}
for (i = 0; i < mgpu_info.num_dgpu; i++) {
adev = mgpu_info.gpu_ins[i].adev;
-   if (!adev->kfd.init_complete)
+   if (!adev->kfd.init_complete) {
+   kgd2kfd_init_zone_device(adev);
amdgpu_amdkfd_device_init(adev);
+   amdgpu_amdkfd_drm_client_create(adev);
I don't see what's preventing the DRM client initialization in the 
reset-on-driver-load case. It only needs to be created once and that 
happens in amdgpu_pci_probe. Am I missing anything?


Regards,
  Felix



+   }
amdgpu_ttm_set_buffer_funcs_status(adev, true);
}
  }


Re: [PATCH V3] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven

2024-03-01 Thread Felix Kuehling

On 2024-02-29 01:04, Jesse.Zhang wrote:

Fix the issue:
"amdgpu: Failed to create process VM object".

[Why] When amdgpu is initialized, seq64 does mapping and updates the bo mapping in the vm
page table.
But when clinfo runs, it also initializes a vm for a process device through the
function kfd_process_device_init_vm and ensures the root PD is clean through the
function amdgpu_vm_pt_is_root_clean.
So they conflict, and clinfo always failed.

v1:
   - remove all the pte_supports_ats stuff from the amdgpu_vm code (Felix)

Signed-off-by: Jesse Zhang 
The headline should be updated. This is no longer a revert of the quoted 
patch. Other than that, this patch looks reasonable to me. One more 
comment inline.



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 23 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  3 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 56 +--
  3 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ed4a8c5d26d7..d004ace79536 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1385,10 +1385,6 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
struct amdgpu_bo_va_mapping, list);
list_del(>list);
  
-		if (vm->pte_support_ats &&

-   mapping->start < AMDGPU_GMC_HOLE_START)
-   init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
-
r = amdgpu_vm_update_range(adev, vm, false, false, true, false,
   resv, mapping->start, mapping->last,
   init_pte_value, 0, 0, NULL, NULL,
@@ -2264,7 +2260,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
if (r)
return r;
  
-	vm->pte_support_ats = false;

vm->is_compute_context = false;
  
  	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &

@@ -2350,30 +2345,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
   */
  int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
  {
-   bool pte_support_ats = (adev->asic_type == CHIP_RAVEN);
int r;
  
  	r = amdgpu_bo_reserve(vm->root.bo, true);

if (r)
return r;
  
-	/* Check if PD needs to be reinitialized and do it before

-* changing any other state, in case it fails.
-*/
-   if (pte_support_ats != vm->pte_support_ats) {
-   /* Sanity checks */
-   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
-   r = -EINVAL;
-   goto unreserve_bo;
-   }
-
-   vm->pte_support_ats = pte_support_ats;
-   r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
-  false);
-   if (r)
-   goto unreserve_bo;
-   }
-
/* Update VM state */
vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
AMDGPU_VM_USE_CPU_FOR_COMPUTE);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 42f6ddec50c1..9f6b5e1ccf34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -357,9 +357,6 @@ struct amdgpu_vm {
/* Functions to use for VM table updates */
const struct amdgpu_vm_update_funcs *update_funcs;
  
-	/* Flag to indicate ATS support from PTE for GFX9 */

-   boolpte_support_ats;
-
/* Up to 128 pending retry page faults */
DECLARE_KFIFO(faults, u64, 128);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c

index a160265ddc07..2835cb3f76eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -89,22 +89,6 @@ static unsigned int amdgpu_vm_pt_num_entries(struct 
amdgpu_device *adev,
return AMDGPU_VM_PTE_COUNT(adev);
  }
  
-/**

- * amdgpu_vm_pt_num_ats_entries - return the number of ATS entries in the root 
PD
- *
- * @adev: amdgpu_device pointer
- *
- * Returns:
- * The number of entries in the root page directory which needs the ATS 
setting.
- */
-static unsigned int amdgpu_vm_pt_num_ats_entries(struct amdgpu_device *adev)
-{
-   unsigned int shift;
-
-   shift = amdgpu_vm_pt_level_shift(adev, adev->vm_manager.root_level);
-   return AMDGPU_GMC_HOLE_START >> (shift + AMDGPU_GPU_PAGE_SHIFT);
-}
-
  /**
   * amdgpu_vm_pt_entries_mask - the mask to get the entry number of a PD/PT
   *
@@ -379,7 +363,7 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
struct ttm_operation_ctx ctx = { true, false };
struct amdgpu_vm_update_params params;
struct amdgpu_bo *ancestor = >bo;
-   unsigned int entries, 

Re: [PATCH v3] drm/amdgpu: change vm->task_info handling

2024-03-01 Thread Felix Kuehling

On 2024-02-05 12:05, Shashank Sharma wrote:

This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
   reference counted.
- introducing two new helper funcs for task_info lifecycle management
 - amdgpu_vm_get_task_info: reference counts up task_info before
   returning this info
 - amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.

This patch also does logistical changes required for existing usage
of vm->task_info.

V2: Do not block all the prints when task_info not found (Felix)
V3: (Felix)
- Fix wrong indentation
- No debug message for -ENOMEM
- Add NULL check for task_info
- Do not duplicate the debug messages (ti vs no ti)
- Get first reference of task_info in vm_init(), put last
  in vm_fini()

Cc: Christian Koenig
Cc: Alex Deucher
Cc: Felix Kuehling
Signed-off-by: Shashank Sharma


One nit-pick and one bug inline. With those fixed, the patch

Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   9 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  18 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  12 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 158 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  21 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |   2 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  24 +--
  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  23 +--
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  20 ++-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  23 +--
  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  23 +--
  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c|  22 +--
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  20 +--
  13 files changed, 251 insertions(+), 124 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 0e61ebdb3f3e..f9eb12697b95 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1775,9 +1775,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file 
*m, void *unused)
list_for_each_entry(file, >filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = >vm;
+   struct amdgpu_task_info *ti;
+
+   ti = amdgpu_vm_get_task_info_vm(vm);
+   if (ti) {
+   seq_printf(m, "pid:%d\tProcess:%s --\n", ti->pid, 
ti->process_name);
+   amdgpu_vm_put_task_info(ti);
+   }
  
-		seq_printf(m, "pid:%d\tProcess:%s --\n",

-   vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true);
if (r)
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 1f357198533f..e6e6d56398f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  {
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
-   struct amdgpu_task_info ti;
+   struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
  
-	memset(, 0, sizeof(struct amdgpu_task_info));

+
adev->job_hang = true;
  
  	if (amdgpu_gpu_recovery &&

@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
goto exit;
}
  
-	amdgpu_vm_get_task_info(ring->adev, job->pasid, );

DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
- job->base.sched->name, atomic_read(>fence_drv.last_seq),
- ring->fence_drv.sync_seq);
-   DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
- ti.process_name, ti.tgid, ti.task_name, ti.pid);
+  job->base.sched->name, 
atomic_read(>fence_drv.last_seq),
+  ring->fence_drv.sync_seq);
+
+   ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
+   if (ti) {
+   DRM_ERROR("Process information: process %s pid %d thread %s pid 
%d\n",
+ ti->process_name, ti->tgid, ti->task_name, ti->pid);
+   amdgpu_vm_put_task_info(ti);
+   }
  
  	dma_fence_set_error(_job->s_fence->

Re: [PATCH] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven

2024-02-28 Thread Felix Kuehling



On 2024-02-28 01:41, Christian König wrote:

Am 28.02.24 um 06:04 schrieb Jesse.Zhang:

Fix the issue seen when running clinfo:
"amdgpu: Failed to create process VM object".

When amdgpu is initialized, seq64 does mapping and updates the bo mapping in the
vm page table.
But when clinfo runs, it also initializes a vm for a process device 
through the function kfd_process_device_init_vm
and ensures the root PD is clean through the function 
amdgpu_vm_pt_is_root_clean.

So they conflict, and clinfo always failed.


Big NAK for this, you removed the check but didn't solved the problem 
in any way.


When Raven still needs the ats feature, then it is intentional that 
this fails.


I agree. I think we should just remove all the pte_supports_ats stuff 
from the amdgpu_vm code. We no longer use IOMMUv2. So there is no point 
setting invalid PTEs to fail over to ATS any more. As far as I can see, 
this will require changes in amdgpu_vm_clear_freed, amdgpu_vm_init, 
amdgpu_vm_make_compute. Then you can remove amdgpu_vm.pte_support_ats 
from the struct and remove amdgpu_vm_pt_is_root_clean.


Regards,
  Felix




Regards,
Christian.



Signed-off-by: Jesse Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 --
  1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index ed4a8c5d26d7..0bc0bc75be15 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2361,12 +2361,6 @@ int amdgpu_vm_make_compute(struct 
amdgpu_device *adev, struct amdgpu_vm *vm)

   * changing any other state, in case it fails.
   */
  if (pte_support_ats != vm->pte_support_ats) {
-    /* Sanity checks */
-    if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
-    r = -EINVAL;
-    goto unreserve_bo;
-    }
-
  vm->pte_support_ats = pte_support_ats;
  r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
 false);




Re: [PATCH] drm/amdkfd: Increase the size of the memory reserved for the TBA

2024-02-23 Thread Felix Kuehling



On 2024-02-23 14:05, Laurent Morichetti wrote:

In a future commit, the cwsr trap handler code size for gfx10.1 will
increase to slightly above the one page mark. Since the TMA does not
need to be page aligned, and only 2 pointers are stored in it, push
the TMA offset by 2 KiB and keep the TBA+TMA reserved memory size
to two pages.

Signed-off-by: Laurent Morichetti 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 23 ---
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  6 +++---
  2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4d399c0c8a57..041ec3de55e7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,34 +466,43 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
  {
if (cwsr_enable && kfd->device_info.supports_cwsr) {
if (KFD_GC_VERSION(kfd) < IP_VERSION(9, 0, 1)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_gfx8_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
} else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 1)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) > 
PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_arcturus_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_arcturus_hex);
} else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 2)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex) > 
PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_aldebaran_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_aldebaran_hex);
} else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) > 
PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_gfx9_4_3_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_4_3_hex);
} else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 1, 1)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_gfx9_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
} else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 3, 0)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) > PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_nv1x_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_nv1x_hex);
} else if (KFD_GC_VERSION(kfd) < IP_VERSION(11, 0, 0)) {
-   BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE);
+   BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex)
+> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_gfx10_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex);
} else {
+   /* The gfx11 cwsr trap handler must fit inside a single
+  page. */
BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx11_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 80320b8603fc..42d40560cd30 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -99,11 +99,11 @@
  /*
   * Size of the per-process TBA+TMA buffer: 2 pages
   *
- * The first page is the TBA used for the CWSR ISA code. The second
- * page is used as TMA for user-mode trap handler setup in daisy-chain mode.
+ * The first chunk is the TBA used for the CWSR ISA code. The second
+ * chunk is used as TMA for user-mode trap handler setup in daisy-chain mode.
   */

Re: [PATCH] drm/amdkfd: fix process reference drop on debug ioctl

2024-02-21 Thread Felix Kuehling



On 2024-02-21 05:54, Jonathan Kim wrote:

Prevent dropping the KFD process reference at the end of a debug
IOCTL call where the acquired process value is an error.

Signed-off-by: Jonathan Kim 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 80e90fdef291..824e660283b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2935,6 +2935,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
if (IS_ERR_OR_NULL(target)) {
pr_debug("Cannot find process PID %i to debug\n", args->pid);
r = target ? PTR_ERR(target) : -ESRCH;
+   target = NULL;
goto out;
}
  


Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro

2024-02-16 Thread Felix Kuehling



On 2024-02-15 10:18, Philip Yang wrote:

Document how to use SMI system management interface to receive SVM
events.

Define SVM events message string format macro that could use by user
mode for sscanf to parse the event. Add it to uAPI header file to make
it obvious that is changing uAPI in future.

No functional changes.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++---
  include/uapi/linux/kfd_ioctl.h  | 77 -
  2 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index d9953c2b2661..85465eb303a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, 
bool post_reset)
event = KFD_SMI_EVENT_GPU_PRE_RESET;
++(dev->reset_seq_num);
}
-   kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
+   kfd_smi_event_add(0, dev, event,
+ KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num));
  }
  
  void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,

 uint64_t throttle_bitmask)
  {
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
- throttle_bitmask,
- amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE,
+ 
KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask,
+ 
amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
  }
  
  void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)

@@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, 
uint16_t pasid)
if (!task_info.pid)
return;
  
-	kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",

- task_info.pid, task_info.task_name);
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
+ KFD_EVENT_FMT_VMFAULT(task_info.pid, 
task_info.task_name));
  }
  
  void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,

@@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node 
*node, pid_t pid,
ktime_t ts)
  {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
- "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
- address, node->id, write_fault ? 'W' : 'R');
+ KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+ address, node->id, write_fault ? 'W' : 'R'));
  }
  
  void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,

  unsigned long address, bool migration)
  {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
- "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
- pid, address, node->id, migration ? 'M' : 'U');
+ KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+ pid, address, node->id, migration ? 'M' : 'U'));
  }
  
  void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,

@@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
   uint32_t trigger)
  {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
- "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
- ktime_get_boottime_ns(), pid, start, end - start,
- from, to, prefetch_loc, preferred_loc, trigger);
+ KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(),
+ pid, start, end - start, from, to, prefetch_loc,
+ preferred_loc, trigger));
  }
  
  void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,

@@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct kfd_node *node, 
pid_t pid,
 uint32_t from, uint32_t to, uint32_t trigger)
  {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
- "%lld -%d @%lx(%lx) %x->%x %d\n",
- ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger);
+ KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), 
pid,
+ start, end - start, from, to, trigger));
  }
  
  void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,

  uint32_t trigger)
  {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
- "%lld -%d 

[PATCH v3] drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole

2024-02-13 Thread Felix Kuehling
The TBA and TMA, along with an unused IB allocation, reside at low
addresses in the VM address space. A stray VM fault which hits these
pages must be serviced by making their page table entries invalid.
The scheduler depends upon these pages being resident and fails,
preventing a debugger from inspecting the failure state.

By relocating these pages above 47 bits in the VM address space they
can only be reached when bits [63:48] are set to 1. This makes it much
less likely for a misbehaving program to generate accesses to them.
The current placement at VA (PAGE_SIZE*2) is readily hit by a NULL
access with a small offset.

v2:
- Move it to the reserved space to avoid conflicts with Mesa
- Add macros to make reserved space management easier

v3:
- Move VM  max PFN calculation into AMDGPU_VA_RESERVED macros

Cc: Arunpravin Paneer Selvam 
Cc: Christian Koenig 
Signed-off-by: Jay Cornwall 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c|  6 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h   | 11 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 29 ++--
 4 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 823d31f4a2a3..b0fb14a4b43c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -28,9 +28,8 @@
 
 uint64_t amdgpu_csa_vaddr(struct amdgpu_device *adev)
 {
-   uint64_t addr = adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT;
+   uint64_t addr = AMDGPU_VA_RESERVED_CSA_START(adev);
 
-   addr -= AMDGPU_VA_RESERVED_CSA_SIZE;
addr = amdgpu_gmc_sign_extend(addr);
 
return addr;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
index 3d0d56087d41..4b9afc4df031 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -45,11 +45,7 @@
  */
 static inline u64 amdgpu_seq64_get_va_base(struct amdgpu_device *adev)
 {
-   u64 addr = adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT;
-
-   addr -= AMDGPU_VA_RESERVED_TOP;
-
-   return addr;
+   return AMDGPU_VA_RESERVED_SEQ64_START(adev);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 2c4053b29bb3..42f6ddec50c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -137,9 +137,18 @@ struct amdgpu_mem_stats;
 
 /* Reserve space at top/bottom of address space for kernel use */
 #define AMDGPU_VA_RESERVED_CSA_SIZE(2ULL << 20)
+#define AMDGPU_VA_RESERVED_CSA_START(adev) (((adev)->vm_manager.max_pfn \
+ << AMDGPU_GPU_PAGE_SHIFT)  \
+- AMDGPU_VA_RESERVED_CSA_SIZE)
 #define AMDGPU_VA_RESERVED_SEQ64_SIZE  (2ULL << 20)
+#define AMDGPU_VA_RESERVED_SEQ64_START(adev)   
(AMDGPU_VA_RESERVED_CSA_START(adev) \
+- 
AMDGPU_VA_RESERVED_SEQ64_SIZE)
+#define AMDGPU_VA_RESERVED_TRAP_SIZE   (2ULL << 12)
+#define AMDGPU_VA_RESERVED_TRAP_START(adev)
(AMDGPU_VA_RESERVED_SEQ64_START(adev) \
+- AMDGPU_VA_RESERVED_TRAP_SIZE)
 #define AMDGPU_VA_RESERVED_BOTTOM  (1ULL << 16)
-#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_SEQ64_SIZE 
+ \
+#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_TRAP_SIZE + 
\
+AMDGPU_VA_RESERVED_SEQ64_SIZE 
+ \
 AMDGPU_VA_RESERVED_CSA_SIZE)
 
 /* See vm_update_mode */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 6604a3f99c5e..4a64307bc438 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include "amdgpu_vm.h"
 
 /*
  * The primary memory I/O features being added for revisions of gfxip
@@ -326,10 +327,16 @@ static void kfd_init_apertures_vi(struct 
kfd_process_device *pdd, uint8_t id)
 * with small reserved space for kernel.
 * Set them to CANONICAL addresses.
 */
-   pdd->gpuvm_base = SVM_USER_BASE;
+   pdd->gpuvm_base = max(SVM_USER_BASE, AMDGPU_VA_RESERVED_BOTTOM);
pdd->gpuvm_limit =
pdd->dev->kfd->shared_resources.gpuvm_size - 1;
 
+   /* dGPUs: the reserved space for kernel
+* before SVM
+*/
+   pdd->qpd.cwsr_base = SVM_CWSR_BASE;
+   pdd->qpd.ib_base = SVM_IB_BASE;
+
pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
pdd->scra

Re: [Patch v2 1/2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-13 Thread Felix Kuehling



On 2024-02-13 16:39, Rajneesh Bhardwaj wrote:

In certain cooperative group dispatch scenarios the default SPI resource
allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 


Reviewed-by: Felix Kuehling 



---
* Change the enum bitfield to 4 to avoid ORing condition of previous
   member flags.
* Incorporate review feedback from Felix from
   https://www.mail-archive.com/amd-gfx@lists.freedesktop.org/msg102840.html
   and split one of the suggested gfx11 changes as a separate patch.


  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 9 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 1 +
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++-
  3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 42d881809dc7..697b6d530d12 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,15 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
update_cu_mask(mm, mqd, minfo, 0);
set_priority(m, q);
  
+	if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) {

+   if (minfo->update_flag & UPDATE_FLAG_IS_GWS)
+   m->compute_resource_limits |=
+   COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
+   else
+   m->compute_resource_limits &=
+   ~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
+   }
+
q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 677281c0793e..80320b8603fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -532,6 +532,7 @@ struct queue_properties {
  enum mqd_update_flag {
UPDATE_FLAG_DBG_WA_ENABLE = 1,
UPDATE_FLAG_DBG_WA_DISABLE = 2,
+   UPDATE_FLAG_IS_GWS = 4, /* quirk for gfx9 IP */
  };
  
  struct mqd_update_info {

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 43eff221eae5..4858112f9a53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
  int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
void *gws)
  {
+   struct mqd_update_info minfo = {0};
struct kfd_node *dev = NULL;
struct process_queue_node *pqn;
struct kfd_process_device *pdd;
@@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, 
unsigned int qid,
}
  
  	pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;

+   minfo.update_flag = gws ? UPDATE_FLAG_IS_GWS : 0;
  
  	return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,

-   pqn->q, NULL);
+   pqn->q, );
  }
  
  void kfd_process_dequeue_from_all_devices(struct kfd_process *p)


Re: [PATCH 2/2] drm/amdgpu: Fix implicit assumtion in gfx11 debug flags

2024-02-13 Thread Felix Kuehling

On 2024-02-09 20:49, Rajneesh Bhardwaj wrote:

Gfx11 debug flags mask is currently set with an implicit assumption that
no other mqd update flags exist. This needs to be fixed with newly
introduced flag UPDATE_FLAG_IS_GWS by the previous patch.

Signed-off-by: Rajneesh Bhardwaj 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index d722cbd31783..826bc4f6c8a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -55,8 +55,8 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
m = get_mqd(mqd);
  
  	if (has_wa_flag) {

-   uint32_t wa_mask = minfo->update_flag == 
UPDATE_FLAG_DBG_WA_ENABLE ?
-   0x : 0x;
+   uint32_t wa_mask =
+   (minfo->update_flag & UPDATE_FLAG_DBG_WA_ENABLE) ? 
0x : 0x;
  
  		m->compute_static_thread_mgmt_se0 = wa_mask;

m->compute_static_thread_mgmt_se1 = wa_mask;


Re: [PATCH 1/2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-13 Thread Felix Kuehling



On 2024-02-09 20:49, Rajneesh Bhardwaj wrote:

In certain cooperative group dispatch scenarios the default SPI resource
allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 
---
* Incorporate review feedback from Felix from
   https://www.mail-archive.com/amd-gfx@lists.freedesktop.org/msg102840.html
   and split one of the suggested gfx11 changes as a separate patch.


  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 9 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 1 +
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++-
  3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 42d881809dc7..697b6d530d12 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,15 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
update_cu_mask(mm, mqd, minfo, 0);
set_priority(m, q);
  
+	if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) {

+   if (minfo->update_flag & UPDATE_FLAG_IS_GWS)
+   m->compute_resource_limits |=
+   COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
+   else
+   m->compute_resource_limits &=
+   ~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
+   }
+
q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 677281c0793e..65b504813576 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -532,6 +532,7 @@ struct queue_properties {
  enum mqd_update_flag {
UPDATE_FLAG_DBG_WA_ENABLE = 1,
UPDATE_FLAG_DBG_WA_DISABLE = 2,
+   UPDATE_FLAG_IS_GWS = 3, /* quirk for gfx9 IP */


This flag needs to be a separate bit. So it should be defined as 4. 
Otherwise it looks just like UPDATE_FLAG_DBG_WA_ENABLE | 
UPDATE_FLAG_DBG_WA_DISABLE. I agree that defining bit-masks in an enum 
is not ideal, but I've seen the same in other places.


Regards,
  Felix



  };
  
  struct mqd_update_info {

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 43eff221eae5..4858112f9a53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
  int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
void *gws)
  {
+   struct mqd_update_info minfo = {0};
struct kfd_node *dev = NULL;
struct process_queue_node *pqn;
struct kfd_process_device *pdd;
@@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, 
unsigned int qid,
}
  
  	pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;

+   minfo.update_flag = gws ? UPDATE_FLAG_IS_GWS : 0;
  
  	return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,

-   pqn->q, NULL);
+   pqn->q, );
  }
  
  void kfd_process_dequeue_from_all_devices(struct kfd_process *p)


Re: [Patch v2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-08 Thread Felix Kuehling



On 2024-02-08 15:01, Bhardwaj, Rajneesh wrote:


On 2/8/2024 2:41 PM, Felix Kuehling wrote:


On 2024-02-07 23:14, Rajneesh Bhardwaj wrote:
In certain cooperative group dispatch scenarios the default SPI 
resource

allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 
---
* Found a bug in the previous reviewed version
https://lists.freedesktop.org/archives/amd-gfx/2024-February/104101.html 


   since the q->is_gws is unset for keeping the count.
* updated pqm_set_gws to pass minfo holding gws state for the active
   queues and use that to apply the FORCE_SIMD_DIST_MASK.

  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c    | 4 
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 1 +
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++-
  3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

index 42d881809dc7..0b71db4c96b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, 
void *mqd,

  update_cu_mask(mm, mqd, minfo, 0);
  set_priority(m, q);
  +    if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2))
+    m->compute_resource_limits = minfo->gws ?
+    COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0;
+


This looks OK because we don't set anything else in 
m->compute_resource_limits. If that ever changes, we have to be more 
careful here to not wipe out other fields in that register.



Yes, Should I change it to below and send a v3?

 m->compute_resource_limits |= minfo->gws ?
    COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0;


I think you need to do

if (minfo->gws)
m->compute_resource_limits |= 
COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
else
m->compute_resource_limits &= 
~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;

That way you can clear the resource limit when GWS is disable for the queue.









  q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 677281c0793e..f4b327a2d4a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -542,6 +542,7 @@ struct mqd_update_info {
  } cu_mask;
  };
  enum mqd_update_flag update_flag;
+    bool gws;


Instead of adding a new bool, can we add a flag to mqd_update_flag?


Maybe, I initially thought about it but then I chose the bool approach 
since  those debug flags are generic KFD non per-Asic flags while this 
bool is per-Asic request so I felt they didn't fit together. On the 
other hand, those flags and this bool are both quirks anyways so maybe 
they can be together.   Please let me know your preference.


I'd prefer to used the flags. They are currently used for a GFX11 quirk, 
now we can add another flag for a GFX9 quirk.


The GFX11 code currently has an implicit assumption that no other flags 
exist. That would need to be fixed:


if (has_wa_flag) {
-   uint32_t wa_mask = minfo->update_flag == 
UPDATE_FLAG_DBG_WA_ENABLE ?
+   uint32_t wa_mask = (minfo->update_flag & 
UPDATE_FLAG_DBG_WA_ENABLE) ?
0x : 0x;

Regards,
  Felix







Looks good to me otherwise.

Regards,
  Felix



  };
    /**
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c

index 43eff221eae5..5416a110ced9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)

  int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
  void *gws)
  {
+    struct mqd_update_info minfo = {0};
  struct kfd_node *dev = NULL;
  struct process_queue_node *pqn;
  struct kfd_process_device *pdd;
@@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager 
*pqm, unsigned int qid,

  }
    pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;
+    minfo.gws = !!gws;
    return 
pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,

-    pqn->q, NULL);
+    pqn->q, );
  }
    void kfd_process_dequeue_from_all_devices(struct kfd_process *p)


Re: [Patch v2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-08 Thread Felix Kuehling



On 2024-02-07 23:14, Rajneesh Bhardwaj wrote:

In certain cooperative group dispatch scenarios the default SPI resource
allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 
---
* Found a bug in the previous reviewed version
   https://lists.freedesktop.org/archives/amd-gfx/2024-February/104101.html
   since the q->is_gws is unset for keeping the count.
* updated pqm_set_gws to pass minfo holding gws state for the active
   queues and use that to apply the FORCE_SIMD_DIST_MASK.

  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 4 
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 1 +
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++-
  3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 42d881809dc7..0b71db4c96b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
update_cu_mask(mm, mqd, minfo, 0);
set_priority(m, q);
  
+	if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2))

+   m->compute_resource_limits = minfo->gws ?
+   COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0;
+


This looks OK because we don't set anything else in 
m->compute_resource_limits. If that ever changes, we have to be more 
careful here to not wipe out other fields in that register.




q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 677281c0793e..f4b327a2d4a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -542,6 +542,7 @@ struct mqd_update_info {
} cu_mask;
};
enum mqd_update_flag update_flag;
+   bool gws;


Instead of adding a new bool, can we add a flag to mqd_update_flag?

Looks good to me otherwise.

Regards,
  Felix



  };
  
  /**

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 43eff221eae5..5416a110ced9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
  int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
void *gws)
  {
+   struct mqd_update_info minfo = {0};
struct kfd_node *dev = NULL;
struct process_queue_node *pqn;
struct kfd_process_device *pdd;
@@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, 
unsigned int qid,
}
  
  	pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;

+   minfo.gws = !!gws;
  
  	return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,

-   pqn->q, NULL);
+   pqn->q, );
  }
  
  void kfd_process_dequeue_from_all_devices(struct kfd_process *p)


Re: [PATCH v2] drm/amdkfd: Initialize kfd_gpu_cache_info for KFD topology

2024-02-06 Thread Felix Kuehling

On 2024-02-07 0:32, Joseph Greathouse wrote:

The current kfd_gpu_cache_info structure is only partially
filled in for some architectures. This means that for devices
where we do not fill in some fields, we can return
uninitialized values through the KFD topology.
Zero out the kfd_gpu_cache_info before asking the remaining
fields to be filled in by lower-level functions.

Fixes: 04756ac9a24c ("drm/amdkfd: Add cache line sizes to KFD topology")
Signed-off-by: Joseph Greathouse 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3df2a8ad86fb..5cb0465493b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1707,6 +1707,7 @@ static void kfd_fill_cache_non_crat_info(struct 
kfd_topology_device *dev, struct
  
  	gpu_processor_id = dev->node_props.simd_id_base;
  
+	memset(cache_info, 0, sizeof(cache_info));

pcache_info = cache_info;
num_of_cache_types = kfd_get_gpu_cache_info(kdev, _info);
if (!num_of_cache_types) {


Re: [PATCH] drm/amdkfd: Don't divide L2 cache by partition mode

2024-02-06 Thread Felix Kuehling



On 2024-02-06 16:24, Kent Russell wrote:

Partition mode only affects L3 cache size. After removing the L2 check in
the previous patch, make sure we aren't dividing all cache sizes by
partition mode, just L3.

Fixes: a75bfb3c4045 ("drm/amdkfd: Fix L2 cache size reporting in GFX9.4.3")
The fixes tag looks wrong. I can't find the commit a75bfb3c4045 
anywhere. Did your previous patch actually make it into the branch yet? 
Maybe you can still abandon it in Gerrit.


Regards,
  Felix




Signed-off-by: Kent Russell 
---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 64bf2a56f010..533b8292b136 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1640,10 +1640,10 @@ static int fill_in_l2_l3_pcache(struct 
kfd_cache_properties **props_ext,
else
mode = UNKNOWN_MEMORY_PARTITION_MODE;
  
-		if (mode)

-   pcache->cache_size = pcache_info[cache_type].cache_size 
/ mode;
-   else
-   pcache->cache_size = pcache_info[cache_type].cache_size;
+   pcache->cache_size = pcache_info[cache_type].cache_size;
+   /* Partition mode only affects L3 cache size */
+   if (mode && pcache->cache_level == 3)
+   pcache->cache_size /= mode;
  
  		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)

pcache->cache_type |= HSA_CACHE_TYPE_DATA;


Re: [PATCH] drm/amdkfd: Initialize kfd_gpu_cache_info for KFD topology

2024-02-06 Thread Felix Kuehling



On 2024-02-06 15:55, Joseph Greathouse wrote:

The current kfd_gpu_cache_info structure is only partially
filled in for some architectures. This means that for devices
where we do not fill in some fields, we can return
uninitialized values through the KFD topology.
Zero out the kfd_gpu_cache_info before asking the remaining
fields to be filled in by lower-level functions.

Signed-off-by: Joseph Greathouse 


This fixes your previous patch "drm/amdkfd: Add cache line sizes to KFD 
topology". Alex, I think the previous patch hasn't gone upstream yet. Do 
you want a Fixes: tag or is is possible to squash this with Joe's 
previous patch before upstreaming?


One nit-pick below.



---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3df2a8ad86fb..67c1e7f84750 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1707,6 +1707,7 @@ static void kfd_fill_cache_non_crat_info(struct 
kfd_topology_device *dev, struct
  
  	gpu_processor_id = dev->node_props.simd_id_base;
  
+	memset(cache_info, 0, sizeof(struct kfd_gpu_cache_info) * KFD_MAX_CACHE_TYPES);


Just use sizeof(cache_info). No need to calculate the size of the array 
and risk getting it wrong.


Regards,
  Felix



pcache_info = cache_info;
num_of_cache_types = kfd_get_gpu_cache_info(kdev, _info);
if (!num_of_cache_types) {


Re: [PATCH 1/2] drm/amdgpu: Unmap only clear the page table leaves

2024-02-02 Thread Felix Kuehling



On 2024-02-01 11:50, Philip Yang wrote:

SVM migration unmap pages from GPU and then update mapping to GPU to
recover page fault. Currently unmap clears the PDE entry for range
length >= huge page and free PTB bo, update mapping to alloc new PT bo.
There is race bug that the freed entry bo maybe still on the pt_free
list, reused when updating mapping and then freed, leave invalid PDE
entry and cause GPU page fault.

By setting the update to clear only one PDE entry or clear PTB, to
avoid unmap to free PTE bo. This fixes the race bug and improve the
unmap and map to GPU performance. Update mapping to huge page will
still free the PTB bo.

With this change, the vm->pt_freed list and work is not needed. Add
WARN_ON(unlocked) in amdgpu_vm_pt_free_dfs to catch if unmap to free the
PTB.

Signed-off-by: Philip Yang 


As we discussed offline, I think this is the wrong approach. This can 
lead to resource leaks when lots of virtual address space is released 
but the page tables remain allocated indefinitely.


I think we need some solution that either

* prevents reuse of page tables that are about to be free
* prevents reused page tables from being freed by the worker (e.g. a ref 
count or cancelling the work)


Regards,
  Felix



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 ++-
  3 files changed, 10 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 82e5fd66a10d..3bde77dfc63f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2256,8 +2256,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
spin_lock_init(>status_lock);
INIT_LIST_HEAD(>freed);
INIT_LIST_HEAD(>done);
-   INIT_LIST_HEAD(>pt_freed);
-   INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work);
INIT_KFIFO(vm->faults);
  
  	r = amdgpu_vm_init_entities(adev, vm);

@@ -2446,8 +2444,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
  
  	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
  
-	flush_work(>pt_free_work);

-
root = amdgpu_bo_ref(vm->root.bo);
amdgpu_bo_reserve(root, true);
amdgpu_vm_set_pasid(adev, vm, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index cdb61f1e7c35..74fe211b9ecd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -316,10 +316,6 @@ struct amdgpu_vm {
/* BOs which are invalidated, has been updated in the PTs */
struct list_headdone;
  
-	/* PT BOs scheduled to free and fill with zero if vm_resv is not hold */

-   struct list_headpt_freed;
-   struct work_struct  pt_free_work;
-
/* contains the page directory */
struct amdgpu_vm_bo_base root;
struct dma_fence*last_update;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..a3d609655ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -657,27 +657,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
*entry)
amdgpu_bo_unref(>bo);
  }
  
-void amdgpu_vm_pt_free_work(struct work_struct *work)

-{
-   struct amdgpu_vm_bo_base *entry, *next;
-   struct amdgpu_vm *vm;
-   LIST_HEAD(pt_freed);
-
-   vm = container_of(work, struct amdgpu_vm, pt_free_work);
-
-   spin_lock(>status_lock);
-   list_splice_init(>pt_freed, _freed);
-   spin_unlock(>status_lock);
-
-   /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
-   amdgpu_bo_reserve(vm->root.bo, true);
-
-   list_for_each_entry_safe(entry, next, _freed, vm_status)
-   amdgpu_vm_pt_free(entry);
-
-   amdgpu_bo_unreserve(vm->root.bo);
-}
-
  /**
   * amdgpu_vm_pt_free_dfs - free PD/PT levels
   *
@@ -696,17 +675,7 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device 
*adev,
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;
  
-	if (unlocked) {

-   spin_lock(>status_lock);
-   for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-   list_move(>vm_status, >pt_freed);
-
-   if (start)
-   list_move(>entry->vm_status, >pt_freed);
-   spin_unlock(>status_lock);
-   schedule_work(>pt_free_work);
-   return;
-   }
+   WARN_ON(unlocked);
  
  	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)

amdgpu_vm_pt_free(entry);
@@ -1009,7 +978,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params 
*params,
incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift;
mask = 

Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-02 Thread Felix Kuehling

On 2024-02-02 17:24, Greathouse, Joseph wrote:

[AMD Official Use Only - General]


-Original Message-
From: Kuehling, Felix 
Sent: Friday, February 2, 2024 10:21 AM
To: Bhardwaj, Rajneesh ; 
amd-gfx@lists.freedesktop.org
Cc: Greathouse, Joseph 
Subject: Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 
onwards


On 2024-02-01 13:54, Rajneesh Bhardwaj wrote:

In certain cooperative group dispatch scenarios the default SPI
resource allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 
---
   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 
   1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 42d881809dc7..4b28e7dcb62f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
 update_cu_mask(mm, mqd, minfo, 0);
 set_priority(m, q);

+   if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2))

Are we sure this is only needed on GFX 9.4.2 and later GPUs? Does it affect 
older GFX 9.x GPUs as well? What about GFX 10 and 11? It
seems the register bit exists for all those GPUs?

On gfx9 devices, it is only necessary for GFX 9.4.2 and beyond. This was a side 
effect of the move from 10 wave-slots per SIMD to 8 wave-slots per SIMD.

Checking with the hardware group (and running some basic tests against the 
problem we saw on gfx9 parts), this should not be necessary for gfx10 parts, 
either those with 20 wave-slots per SIMD or 16.


Thanks for checking. The patch is

Reviewed-by: Felix Kuehling 




Thanks,
-Joe


Regards,
Felix



+   m->compute_resource_limits = q->is_gws ?
+   COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0;
+
 q->is_active = QUEUE_IS_ACTIVE(*q);
   }



Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards

2024-02-02 Thread Felix Kuehling



On 2024-02-01 13:54, Rajneesh Bhardwaj wrote:

In certain cooperative group dispatch scenarios the default SPI resource
allocation may cause reduced per-CU workgroup occupancy. Set
COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang
scenarios.

Suggested-by: Joseph Greathouse 
Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 42d881809dc7..4b28e7dcb62f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
update_cu_mask(mm, mqd, minfo, 0);
set_priority(m, q);
  
+	if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2))


Are we sure this is only needed on GFX 9.4.2 and later GPUs? Does it 
affect older GFX 9.x GPUs as well? What about GFX 10 and 11? It seems 
the register bit exists for all those GPUs?


Regards,
  Felix



+   m->compute_resource_limits = q->is_gws ?
+   COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0;
+
q->is_active = QUEUE_IS_ACTIVE(*q);
  }
  


Re: [PATCH v3] drm/amdkfd: reserve the BO before validating it

2024-01-30 Thread Felix Kuehling

On 2024-01-30 04:45, Lang Yu wrote:

Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB flush")

v2: Avoid unmapping attachment twice when ERESTARTSYS.

v3: Lock the BO before accessing ttm->sg to avoid race conditions.(Felix)

[   41.708711] WARNING: CPU: 0 PID: 1463 at drivers/gpu/drm/ttm/ttm_bo.c:846 
ttm_bo_validate+0x146/0x1b0 [ttm]
[   41.708989] Call Trace:
[   41.708992]  
[   41.708996]  ? show_regs+0x6c/0x80
[   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
[   41.709008]  ? __warn+0x93/0x190
[   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
[   41.709024]  ? report_bug+0x1f9/0x210
[   41.709035]  ? handle_bug+0x46/0x80
[   41.709041]  ? exc_invalid_op+0x1d/0x80
[   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
[   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
[   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
[   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
[   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
[   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
[   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80 [amdgpu]
[   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
[   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
[   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10 [amdgpu]
[   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
[   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
[   41.709959]  __x64_sys_ioctl+0x9c/0xd0
[   41.709967]  do_syscall_64+0x3f/0x90
[   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8

Signed-off-by: Lang Yu 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 +++-
  3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 298fc52a35bc..e60f63ccf79a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -313,7 +313,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct 
amdgpu_device *adev,
  struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
struct amdgpu_device *adev, struct kgd_mem *mem, void 
*drm_priv);
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_sync_memory(
struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
  int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 6f3a4cb2a9ef..ef71b12062a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2088,21 +2088,35 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
return ret;
  }
  
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)

+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
  {
struct kfd_mem_attachment *entry;
struct amdgpu_vm *vm;
+   int ret;
  
  	vm = drm_priv_to_vm(drm_priv);
  
  	mutex_lock(>lock);
  
+	ret = amdgpu_bo_reserve(mem->bo, true);

+   if (ret)
+   goto out;
+
list_for_each_entry(entry, >attachments, list) {
-   if (entry->bo_va->base.vm == vm)
-   kfd_mem_dmaunmap_attachment(mem, entry);
+   if (entry->bo_va->base.vm != vm)
+   continue;
+   if (entry->bo_va->base.bo->tbo.ttm &&
+   !entry->bo_va->base.bo->tbo.ttm->sg)
+   continue;
+
+   kfd_mem_dmaunmap_attachment(mem, entry);
}
  
+	amdgpu_bo_unreserve(mem->bo);

+out:
mutex_unlock(>lock);
+
+   return ret;
  }
  
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ce4c52ec34d8..80e90fdef291 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1442,7 +1442,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
  
  		/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */

-   amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
+   err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
+   if (err)
+   goto sync_memory_failed;
}
  
  	mutex_unlock(>mutex);


  1   2   3   4   5   6   7   8   9   10   >