Re: [PATCH] drm/amdgpu: Update the implementation of AMDGPU_PTE_MTYPE_GFX12
On 2024-05-20 5:14, Shane Xiao wrote: > This patch changes the implementation of AMDGPU_PTE_MTYPE_GFX12, > clear the bits before setting the new one. > This fixed the potential issue that GFX12 setting memory to NC. > > v2: Clear mtype field before setting the new one (Alex) > > Signed-off-by: longlyao > Signed-off-by: Shane Xiao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 +-- > drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 23 +++ > 2 files changed, 16 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index bc71b44387b2..99b246e82ed6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -116,8 +116,11 @@ struct amdgpu_mem_stats; > #define AMDGPU_PTE_PRT_FLAG(adev)\ > ((amdgpu_ip_version((adev), GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) ? > AMDGPU_PTE_PRT_GFX12 : AMDGPU_PTE_PRT) > > -#define AMDGPU_PTE_MTYPE_GFX12(a)((uint64_t)(a) << 54) > -#define AMDGPU_PTE_MTYPE_GFX12_MASK AMDGPU_PTE_MTYPE_GFX12(3ULL) > +#define AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype) ((uint64_t)(mytype) << 54) You have a typo here: mytype -> mtype . 
Regards, Felix > +#define AMDGPU_PTE_MTYPE_GFX12_MASK AMDGPU_PTE_MTYPE_GFX12_SHIFT(3ULL) > +#define AMDGPU_PTE_MTYPE_GFX12(flags, mtype) \ > + ((flags) & ((~AMDGPU_PTE_MTYPE_GFX12_MASK)) | \ > + AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype)) > > #define AMDGPU_PTE_IS_PTE(1ULL << 63) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c > index e2c6ec3cc4f3..f2d331d0181f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c > @@ -461,17 +461,17 @@ static uint64_t gmc_v12_0_map_mtype(struct > amdgpu_device *adev, uint32_t flags) > { > switch (flags) { > case AMDGPU_VM_MTYPE_DEFAULT: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC); > case AMDGPU_VM_MTYPE_NC: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC); > case AMDGPU_VM_MTYPE_WC: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_WC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_WC); > case AMDGPU_VM_MTYPE_CC: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_CC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_CC); > case AMDGPU_VM_MTYPE_UC: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_UC); > default: > - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC); > + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC); > } > } > > @@ -509,8 +509,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device > *adev, > *flags &= ~AMDGPU_PTE_EXECUTABLE; > *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; > > - *flags &= ~AMDGPU_PTE_MTYPE_GFX12_MASK; > - *flags |= (mapping->flags & AMDGPU_PTE_MTYPE_GFX12_MASK); > + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, (mapping->flags & \ > + AMDGPU_PTE_MTYPE_GFX12_MASK) >> > AMDGPU_PTE_MTYPE_GFX12_SHIFT); > > if (mapping->flags & AMDGPU_PTE_PRT_GFX12) { > *flags |= AMDGPU_PTE_PRT_GFX12; > @@ -524,8 +524,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device > *adev, > > if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT 
| > AMDGPU_GEM_CREATE_UNCACHED)) > - *flags = (*flags & ~AMDGPU_PTE_MTYPE_GFX12_MASK) | > - AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC); > + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC); > > bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); > coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; > @@ -534,7 +533,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device > *adev, > > /* WA for HW bug */ > if (is_system || ((bo_adev != adev) && coherent)) > - *flags |= AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC); > + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC); > > } > > @@ -707,7 +706,7 @@ static int gmc_v12_0_gart_init(struct amdgpu_device *adev) > return r; > > adev->gart.table_size = adev->gart.num_gpu_pages * 8; > - adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC) | > + adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_UC) | > AMDGPU_PTE_EXECUTABLE | > AMDGPU_PTE_IS_PTE; >
Re: [PATCH] drm/kfd: Correct pined buffer handling at kfd restore and validate process
On 2024-05-13 11:18, Xiaogang.Chen wrote: > From: Xiaogang Chen > > This reverts 8a774fe912ff09e39c2d3a3589c729330113f388 "drm/amdgpu: avoid > restore > process run into dead loop" since buffer got pined is not related whether it Spelling: pined -> pinned Same in the commit headline. > needs mapping. And skip buffer validation at kfd driver if the buffer has been > pinned. > > Signed-off-by: Xiaogang Chen > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index 3314821e4cf3..80018738bd1c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -415,6 +415,10 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo > *bo, uint32_t domain, >"Called with userptr BO")) > return -EINVAL; > > + /* bo has been pined, not need validate it */ pined -> pinned With those typos fixed, the patch is Reviewed-by: Felix Kuehling > + if (bo->tbo.pin_count) > + return 0; > + > amdgpu_bo_placement_from_domain(bo, domain); > > ret = ttm_bo_validate(>tbo, >placement, ); > @@ -2736,7 +2740,7 @@ static int confirm_valid_user_pages_locked(struct > amdkfd_process_info *process_i > > /* keep mem without hmm range at userptr_inval_list */ > if (!mem->range) > - continue; > + continue; > > /* Only check mem with hmm range associated */ > valid = amdgpu_ttm_tt_get_user_pages_done( > @@ -2981,9 +2985,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, > struct dma_fence __rcu * > if (!attachment->is_mapped) > continue; > > - if (attachment->bo_va->base.bo->tbo.pin_count) > - continue; > - > kfd_mem_dmaunmap_attachment(mem, attachment); > ret = update_gpuvm_pte(mem, attachment, _obj); > if (ret) {
Re: [PATCH v2] drm/amdkfd: Check correct memory types for is_system variable
On 2024-05-10 10:06, Sreekant Somasekharan wrote: To catch GPU mapping of system memory, TTM_PL_TT and AMDGPU_PL_PREEMPT must be checked. 'Fixes: 3b01ca1b860d ("drm/amdkfd: mark GFX12 system and peer GPU memory mappings as MTYPE_NC")' I don't think that's a valid format for the Fixes tag. It should be a single line and no single quotes. Other than that, the patch is Reviewed-by: Felix Kuehling Signed-off-by: Sreekant Somasekharan --- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index df0363ad1a51..6eb370609d01 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -495,7 +495,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev, struct amdgpu_bo *bo = mapping->bo_va->base.bo; struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; - bool is_system = bo->tbo.resource->mem_type == TTM_PL_SYSTEM; + bool is_system = (bo->tbo.resource->mem_type == TTM_PL_TT) || + (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT); *flags &= ~AMDGPU_PTE_EXECUTABLE;
Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique
On 2024-05-09 16:06, Harish Kasiviswanathan wrote: gpu_id needs to be unique for user space to identify GPUs via KFD interface. In the current implementation there is a very small probability of having non unique gpu_ids. v2: Add check to confirm if gpu_id is unique. If not unique, find one Changed commit header to reflect the above v3: Use crc16 as suggested-by: Lijo Lazar Ensure that gpu_id != 0 Signed-off-by: Harish Kasiviswanathan Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 40 +++ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 219dcf504f24..4954a3021f70 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "kfd_priv.h" #include "kfd_crat.h" @@ -1091,14 +1092,17 @@ void kfd_topology_shutdown(void) static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) { - uint32_t hashout; + uint32_t gpu_id; uint32_t buf[8]; uint64_t local_mem_size; - int i; + struct kfd_topology_device *dev; + bool is_unique; + uint8_t *crc_buf; if (!gpu) return 0; + crc_buf = (uint8_t*) local_mem_size = gpu->local_mem_info.local_mem_size_private + gpu->local_mem_info.local_mem_size_public; buf[0] = gpu->adev->pdev->devfn; @@ -,10 +1115,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) buf[6] = upper_32_bits(local_mem_size); buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16); - for (i = 0, hashout = 0; i < 8; i++) - hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + gpu_id = crc16(0, crc_buf, sizeof(buf)) & +((1 << KFD_GPU_ID_HASH_WIDTH) - 1); - return hashout; + /* There is a very small possibility when generating a +* 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer +* that the value could be 0 or non-unique. So, check if +* it is unique and non-zero. If not unique increment till +* unique one is found. 
In case of overflow, restart from 1 +*/ + + down_read(_lock); + do { + is_unique = true; + if (!gpu_id) + gpu_id = 1; + list_for_each_entry(dev, _device_list, list) { + if (dev->gpu && dev->gpu_id == gpu_id) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) + gpu_id = (gpu_id + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + } while (!is_unique); + up_read(_lock); + + return gpu_id; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If *the GPU device is not already present in the topology device @@ -1945,7 +1973,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) struct amdgpu_gfx_config *gfx_info = >adev->gfx.config; struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); if (gpu->xcp && !gpu->xcp->ddev) { dev_warn(gpu->adev->dev, "Won't add GPU to topology since it has no drm node assigned."); @@ -1968,6 +1995,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id;
Re: [PATCH 11/11] drm/tegra: Use fbdev client helpers
On 2024-05-07 07:58, Thomas Zimmermann wrote: Implement struct drm_client_funcs with the respective helpers and remove the custom code from the emulation. The generic helpers are equivalent in functionality. Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/radeon/radeon_fbdev.c | 66 ++- Was radeon meant to be a separate patch? Regards, Felix drivers/gpu/drm/tegra/fbdev.c | 58 ++- 2 files changed, 6 insertions(+), 118 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c b/drivers/gpu/drm/radeon/radeon_fbdev.c index 02bf25759059a..cf790922174ea 100644 --- a/drivers/gpu/drm/radeon/radeon_fbdev.c +++ b/drivers/gpu/drm/radeon/radeon_fbdev.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include @@ -293,71 +292,12 @@ static const struct drm_fb_helper_funcs radeon_fbdev_fb_helper_funcs = { }; /* - * Fbdev client and struct drm_client_funcs + * struct drm_client_funcs */ -static void radeon_fbdev_client_unregister(struct drm_client_dev *client) -{ - struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); - struct drm_device *dev = fb_helper->dev; - struct radeon_device *rdev = dev->dev_private; - - if (fb_helper->info) { - vga_switcheroo_client_fb_set(rdev->pdev, NULL); - drm_helper_force_disable_all(dev); - drm_fb_helper_unregister_info(fb_helper); - } else { - drm_client_release(_helper->client); - drm_fb_helper_unprepare(fb_helper); - kfree(fb_helper); - } -} - -static int radeon_fbdev_client_restore(struct drm_client_dev *client) -{ - drm_fb_helper_lastclose(client->dev); - vga_switcheroo_process_delayed_switch(); - - return 0; -} - -static int radeon_fbdev_client_hotplug(struct drm_client_dev *client) -{ - struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); - struct drm_device *dev = client->dev; - struct radeon_device *rdev = dev->dev_private; - int ret; - - if (dev->fb_helper) - return drm_fb_helper_hotplug_event(dev->fb_helper); - - ret = drm_fb_helper_init(dev, fb_helper); - if (ret) - goto 
err_drm_err; - - if (!drm_drv_uses_atomic_modeset(dev)) - drm_helper_disable_unused_functions(dev); - - ret = drm_fb_helper_initial_config(fb_helper); - if (ret) - goto err_drm_fb_helper_fini; - - vga_switcheroo_client_fb_set(rdev->pdev, fb_helper->info); - - return 0; - -err_drm_fb_helper_fini: - drm_fb_helper_fini(fb_helper); -err_drm_err: - drm_err(dev, "Failed to setup radeon fbdev emulation (ret=%d)\n", ret); - return ret; -} - static const struct drm_client_funcs radeon_fbdev_client_funcs = { - .owner = THIS_MODULE, - .unregister = radeon_fbdev_client_unregister, - .restore= radeon_fbdev_client_restore, - .hotplug= radeon_fbdev_client_hotplug, + .owner = THIS_MODULE, + DRM_FBDEV_HELPER_CLIENT_FUNCS, }; void radeon_fbdev_setup(struct radeon_device *rdev) diff --git a/drivers/gpu/drm/tegra/fbdev.c b/drivers/gpu/drm/tegra/fbdev.c index db6eaac3d30e6..f9cc365cfed94 100644 --- a/drivers/gpu/drm/tegra/fbdev.c +++ b/drivers/gpu/drm/tegra/fbdev.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -150,63 +149,12 @@ static const struct drm_fb_helper_funcs tegra_fb_helper_funcs = { }; /* - * struct drm_client + * struct drm_client_funcs */ -static void tegra_fbdev_client_unregister(struct drm_client_dev *client) -{ - struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); - - if (fb_helper->info) { - drm_fb_helper_unregister_info(fb_helper); - } else { - drm_client_release(_helper->client); - drm_fb_helper_unprepare(fb_helper); - kfree(fb_helper); - } -} - -static int tegra_fbdev_client_restore(struct drm_client_dev *client) -{ - drm_fb_helper_lastclose(client->dev); - - return 0; -} - -static int tegra_fbdev_client_hotplug(struct drm_client_dev *client) -{ - struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); - struct drm_device *dev = client->dev; - int ret; - - if (dev->fb_helper) - return drm_fb_helper_hotplug_event(dev->fb_helper); - - ret = drm_fb_helper_init(dev, fb_helper); - if (ret) - goto err_drm_err; - 
- if (!drm_drv_uses_atomic_modeset(dev)) - drm_helper_disable_unused_functions(dev); - - ret = drm_fb_helper_initial_config(fb_helper); - if (ret) - goto err_drm_fb_helper_fini; - - return 0; - -err_drm_fb_helper_fini: - drm_fb_helper_fini(fb_helper); -err_drm_err: - drm_err(dev,
Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique
On 2024-05-06 17:10, Harish Kasiviswanathan wrote: On 2024-05-06 16:30, Felix Kuehling wrote: On 2024-05-03 18:06, Harish Kasiviswanathan wrote: gpu_id needs to be unique for user space to identify GPUs via KFD interface. In the current implementation there is a very small probability of having non unique gpu_ids. v2: Add check to confirm if gpu_id is unique. If not unique, find one Changed commit header to reflect the above Signed-off-by: Harish Kasiviswanathan --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index b93913934b03..01d4c2e10c6d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) uint32_t hashout; uint32_t buf[8]; uint64_t local_mem_size; + struct kfd_topology_device *dev; + bool is_unique; int i; if (!gpu) @@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) for (i = 0, hashout = 0; i < 8; i++) hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + /* hash generated could be non-unique. Check if it is unique. + * If not unique increment till unique one is found. In case + * of overflow, restart from 1 + */ + down_read(_lock); + do { + is_unique = true; + list_for_each_entry(dev, _device_list, list) { + if (dev->gpu && dev->gpu_id == hashout) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) { + hashout = (hashout + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + if (!hashout) + hashout = 1; This doesn't catch the case that hashout was 0 before incrementing it, and was found to be unique. I didn't actively think about this case when I sent the patch out. However, we don't have gpu_id to be 0. 
There are places where gpu_id=0 means it is CPU node I think we make that assumption in a few places, both in kernel mode and user mode, e.g.: struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id) { int i; if (gpu_id) { for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; if (pdd->user_gpu_id == gpu_id) return pdd; } } return NULL; } Or in the Thunk in hsaKmtGetNodeProperties: /* For CPU only node don't add any additional GPU memory banks. */ if (gpu_id) { uint64_t base, limit; if (is_dgpu) NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; else NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, , ) == HSAKMT_STATUS_SUCCESS) NodeProperties->NumMemoryBanks += 1; } Regards, Felix Regards, Felix + } + } while (!is_unique); + up_read(_lock); + return hashout; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If @@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) struct amdgpu_gfx_config *gfx_info = >adev->gfx.config; struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); if (gpu->xcp && !gpu->xcp->ddev) { dev_warn(gpu->adev->dev, "Won't add GPU to topology since it has no drm node assigned."); @@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id;
Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique
On 2024-05-03 18:06, Harish Kasiviswanathan wrote: gpu_id needs to be unique for user space to identify GPUs via KFD interface. In the current implementation there is a very small probability of having non unique gpu_ids. v2: Add check to confirm if gpu_id is unique. If not unique, find one Changed commit header to reflect the above Signed-off-by: Harish Kasiviswanathan --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index b93913934b03..01d4c2e10c6d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) uint32_t hashout; uint32_t buf[8]; uint64_t local_mem_size; + struct kfd_topology_device *dev; + bool is_unique; int i; if (!gpu) @@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) for (i = 0, hashout = 0; i < 8; i++) hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + /* hash generated could be non-unique. Check if it is unique. +* If not unique increment till unique one is found. In case +* of overflow, restart from 1 + */ + down_read(_lock); + do { + is_unique = true; + list_for_each_entry(dev, _device_list, list) { + if (dev->gpu && dev->gpu_id == hashout) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) { + hashout = (hashout + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + if (!hashout) + hashout = 1; This doesn't catch the case that hashout was 0 before incrementing it, and was found to be unique. Regards, Felix + } + } while (!is_unique); + up_read(_lock); + return hashout; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. 
If @@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) struct amdgpu_gfx_config *gfx_info = >adev->gfx.config; struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); if (gpu->xcp && !gpu->xcp->ddev) { dev_warn(gpu->adev->dev, "Won't add GPU to topology since it has no drm node assigned."); @@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id;
Re: [PATCH] drm/amdkfd: Refactor kfd CRIU into its own file
On 2024-05-06 15:20, David Francis wrote: The kfd CRIU code takes up about a thousand lines in the kfd_chardev file; move it to its own file. No functional change intended. Signed-off-by: David Francis --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +- drivers/gpu/drm/amd/amdkfd/kfd_criu.c| 989 +++ drivers/gpu/drm/amd/amdkfd/kfd_criu.h| 50 ++ 4 files changed, 1046 insertions(+), 966 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 0d3d8972240d..e06af4073ac5 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -32,6 +32,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_flat_memory.o \ $(AMDKFD_PATH)/kfd_process.o \ $(AMDKFD_PATH)/kfd_queue.o \ + $(AMDKFD_PATH)/kfd_criu.o \ Any particular reason for adding this in the middle and not the end? $(AMDKFD_PATH)/kfd_mqd_manager.o \ $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6b713fb0b818..e6e44a199a93 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -45,6 +45,7 @@ Can you remove #include and "amdgpu_dma_buf.h" here? Or is it still needed by something else left in kfd_chardev.c? 
Other than that, this patch is Reviewed-by: Felix Kuehling #include "kfd_smi_events.h" #include "amdgpu_dma_buf.h" #include "kfd_debug.h" +#include "kfd_criu.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif -static int criu_checkpoint_process(struct kfd_process *p, -uint8_t __user *user_priv_data, -uint64_t *priv_offset) -{ - struct kfd_criu_process_priv_data process_priv; - int ret; - - memset(_priv, 0, sizeof(process_priv)); - - process_priv.version = KFD_CRIU_PRIV_VERSION; - /* For CR, we don't consider negative xnack mode which is used for -* querying without changing it, here 0 simply means disabled and 1 -* means enabled so retry for finding a valid PTE. -*/ - process_priv.xnack_mode = p->xnack_enabled ? 1 : 0; - - ret = copy_to_user(user_priv_data + *priv_offset, - _priv, sizeof(process_priv)); - - if (ret) { - pr_err("Failed to copy process information to user\n"); - ret = -EFAULT; - } - - *priv_offset += sizeof(process_priv); - return ret; -} - -static int criu_checkpoint_devices(struct kfd_process *p, -uint32_t num_devices, -uint8_t __user *user_addr, -uint8_t __user *user_priv_data, -uint64_t *priv_offset) -{ - struct kfd_criu_device_priv_data *device_priv = NULL; - struct kfd_criu_device_bucket *device_buckets = NULL; - int ret = 0, i; - - device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL); - if (!device_buckets) { - ret = -ENOMEM; - goto exit; - } - - device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL); - if (!device_priv) { - ret = -ENOMEM; - goto exit; - } - - for (i = 0; i < num_devices; i++) { - struct kfd_process_device *pdd = p->pdds[i]; - - device_buckets[i].user_gpu_id = pdd->user_gpu_id; - device_buckets[i].actual_gpu_id = pdd->dev->id; - - /* -* priv_data does not contain useful information for now and is reserved 
for -* future use, so we do not set its contents. -*/ - } - - ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets)); - if (ret) { - pr_err("Failed to copy device information to user\n"); - ret = -EFAULT; - goto exit; - } - - ret = copy_to_user(user_priv_data + *priv_offset, - device_priv, - num_devices * sizeof(*device_priv)); - if (ret) { - pr_err("Failed to copy device information to user\n"); - ret = -EFAULT; - }
Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On 2024-05-01 18:56, Philip Yang wrote: On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang Assuming this passes your stress testing without CPU stall warnings, this patch is Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, ); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= 
HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); - /* -* FIXME: This timeout should encompass the retry from -* mmu_interval_read_retry() as well. -*/ if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, out_free_range: kfree(hmm_range); + if (r == -EBUSY) + r = -EAGAIN; return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 94f83be2232d..e7040f809f33 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, _range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; }
Re: Proposal to add CRIU support to DRM render nodes
On 2024-04-16 10:04, Tvrtko Ursulin wrote: > > On 01/04/2024 18:58, Felix Kuehling wrote: >> >> On 2024-04-01 12:56, Tvrtko Ursulin wrote: >>> >>> On 01/04/2024 17:37, Felix Kuehling wrote: >>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote: >>>>> >>>>> On 28/03/2024 20:42, Felix Kuehling wrote: >>>>>> >>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote: >>>>>>> >>>>>>> Hi Felix, >>>>>>> >>>>>>> I had one more thought while browsing around the amdgpu CRIU plugin. It >>>>>>> appears it relies on the KFD support being compiled in and /dev/kfd >>>>>>> present, correct? AFAICT at least, it relies on that to figure out the >>>>>>> amdgpu DRM node. >>>>>>> >>>>>>> In would be probably good to consider designing things without that >>>>>>> dependency. So that checkpointing an application which does not use >>>>>>> /dev/kfd is possible. Or if the kernel does not even have the KFD >>>>>>> support compiled in. >>>>>> >>>>>> Yeah, if we want to support graphics apps that don't use KFD, we should >>>>>> definitely do that. Currently we get a lot of topology information from >>>>>> KFD, not even from the /dev/kfd device but from the sysfs nodes exposed >>>>>> by KFD. We'd need to get GPU device info from the render nodes instead. >>>>>> And if KFD is available, we may need to integrate both sources of >>>>>> information. >>>>>> >>>>>> >>>>>>> >>>>>>> It could perhaps mean no more than adding some GPU discovery code into >>>>>>> CRIU. Which shuold be flexible enough to account for things like >>>>>>> re-assigned minor numbers due driver reload. >>>>>> >>>>>> Do you mean adding GPU discovery to the core CRIU, or to the plugin. I >>>>>> was thinking this is still part of the plugin. >>>>> >>>>> Yes I agree. 
I was only thinking about adding some DRM device discovery >>>>> code in a more decoupled fashion from the current plugin, for both the >>>>> reason discussed above (decoupling a bit from reliance on kfd sysfs), and >>>>> then also if/when a new DRM driver might want to implement this the code >>>>> could be move to some common plugin area. >>>>> >>>>> I am not sure how feasible that would be though. The "gpu id" concept and >>>>> it's matching in the current kernel code and CRIU plugin - is that value >>>>> tied to the physical GPU instance or how it works? >>>> >>>> The concept of the GPU ID is that it's stable while the system is up, even >>>> when devices get added and removed dynamically. It was baked into the API >>>> early on, but I don't think we ever fully validated device hot plug. I >>>> think the closest we're getting is with our latest MI GPUs and dynamic >>>> partition mode change. >>> >>> Doesn't it read the saved gpu id from the image file while doing restore >>> and tries to open the render node to match it? Maybe I am misreading the >>> code.. But if it does, does it imply that in practice it could be stable >>> across reboots? Or that it is not possible to restore to a different >>> instance of maybe the same GPU model installed in a system? >> >> Ah, the idea is, that when you restore on a different system, you may get >> different GPU IDs. Or you may checkpoint an app running on GPU 1 but restore >> it on GPU 2 on the same system. That's why we need to translate GPU IDs in >> restored applications. User mode still uses the old GPU IDs, but the kernel >> mode driver translates them to the actual GPU IDs of the GPUs that the >> process was restored on. > > I see.. I think. Normal flow is ppd->user_gpu_id set during client init, but > for restored clients it gets overriden during restore so that any further > ioctls can actually not instantly fail. 
> > And then in amdgpu_plugin_restore_file, when it is opening the render node, > it relies on the kfd topology to have filled in (more or less) the > target_gpu_id corresponding to the render node gpu id of the target GPU - the > one associated with the new kfd gpu_id? Yes. > > I am digging into this be
Re: [PATCH v3 2/3] drm/amdgpu: Reduce mem_type to domain double indirection
On 2024-04-30 13:16, Tvrtko Ursulin wrote: From: Tvrtko Ursulin All apart from AMDGPU_GEM_DOMAIN_GTT memory domains map 1:1 to TTM placements. And the former be either AMDGPU_PL_PREEMPT or TTM_PL_TT, depending on AMDGPU_GEM_CREATE_PREEMPTIBLE. Simplify a few places in the code which convert the TTM placement into a domain by checking against the current placement directly. In the conversion AMDGPU_PL_PREEMPT either does not have to be handled because amdgpu_mem_type_to_domain() cannot return that value anyway. v2: * Remove AMDGPU_PL_PREEMPT handling. v3: * Rebase. Signed-off-by: Tvrtko Ursulin Reviewed-by: Christian König # v1 Reviewed-by: Felix Kuehling # v2 I'm waiting for Christian to review patches 1 and 3. Then I can apply the whole series. Regards, Felix --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 29 + 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c12..0b3b10d21952 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, if (r) return ERR_PTR(r); - } else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) & -AMDGPU_GEM_DOMAIN_GTT)) { + } else if (bo->tbo.resource->mem_type != TTM_PL_TT) { return ERR_PTR(-EBUSY); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index b2a83c802bbd..c581e4952cbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -983,12 +983,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, ttm_bo_pin(>tbo); - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); - if (domain == AMDGPU_GEM_DOMAIN_VRAM) { + if (bo->tbo.resource->mem_type == TTM_PL_VRAM) { 
atomic64_add(amdgpu_bo_size(bo), >vram_pin_size); atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); - } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); } @@ -1289,7 +1288,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, struct ttm_resource *res = bo->tbo.resource; uint64_t size = amdgpu_bo_size(bo); struct drm_gem_object *obj; - unsigned int domain; bool shared; /* Abort if the BO doesn't currently have a backing store */ @@ -1299,21 +1297,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, obj = >tbo.base; shared = drm_gem_object_is_shared_for_memory_stats(obj); - domain = amdgpu_mem_type_to_domain(res->mem_type); - switch (domain) { - case AMDGPU_GEM_DOMAIN_VRAM: + switch (res->mem_type) { + case TTM_PL_VRAM: stats->vram += size; - if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) + if (amdgpu_res_cpu_visible(adev, res)) stats->visible_vram += size; if (shared) stats->vram_shared += size; break; - case AMDGPU_GEM_DOMAIN_GTT: + case TTM_PL_TT: stats->gtt += size; if (shared) stats->gtt_shared += size; break; - case AMDGPU_GEM_DOMAIN_CPU: + case TTM_PL_SYSTEM: default: stats->cpu += size; if (shared) @@ -1326,7 +1323,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) stats->requested_visible_vram += size; - if (domain != AMDGPU_GEM_DOMAIN_VRAM) { + if (res->mem_type != TTM_PL_VRAM) { stats->evicted_vram += size; if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) stats->evicted_visible_vram += size; @@ -1600,20 +1597,18 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) u64 size; if (dma_resv_trylock(bo->tbo.base.resv)) { - unsigned int domain; - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); - switch (domain) { - case AMDGPU_GEM_DOMAIN_VRAM: + switch (bo->tbo.resource->mem_type) { + case TTM_PL_VRAM: if (amdgpu_res_cpu_v
Re: [PATCH 1/2] drm/amdkfd: Use dev_error instead of pr_error
On 2024-05-01 21:08, Harish Kasiviswanathan wrote: > No functional change. This will help in moving gpu_id creation to next > step while still being able to identify the correct GPU > > Signed-off-by: Harish Kasiviswanathan Reviewed-by: Felix Kuehling > --- > drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 19 --- > 1 file changed, 8 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > index ba326b43bec5..b93913934b03 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > @@ -1773,7 +1773,7 @@ static void kfd_fill_cache_non_crat_info(struct > kfd_topology_device *dev, struct > pr_debug("Added [%d] GPU cache entries\n", num_of_entries); > } > > -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t > gpu_id, > +static int kfd_topology_add_device_locked(struct kfd_node *gpu, > struct kfd_topology_device **dev) > { > int proximity_domain = ++topology_crat_proximity_domain; > @@ -1786,8 +1786,7 @@ static int kfd_topology_add_device_locked(struct > kfd_node *gpu, uint32_t gpu_id, > COMPUTE_UNIT_GPU, gpu, > proximity_domain); > if (res) { > - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", > -gpu_id); > + dev_err(gpu->adev->dev, "Error creating VCRAT\n"); > topology_crat_proximity_domain--; > goto err; > } > @@ -1798,8 +1797,7 @@ static int kfd_topology_add_device_locked(struct > kfd_node *gpu, uint32_t gpu_id, > _topology_device_list, > proximity_domain); > if (res) { > - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", > -gpu_id); > + dev_err(gpu->adev->dev, "Error parsing VCRAT\n"); > topology_crat_proximity_domain--; > goto err; > } > @@ -1825,8 +1823,8 @@ static int kfd_topology_add_device_locked(struct > kfd_node *gpu, uint32_t gpu_id, > if (!res) > sys_props.generation_count++; > else > - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
> res=%d\n", > -gpu_id, res); > + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs > topology. res=%d\n", > + res); > > err: > kfd_destroy_crat_image(crat_image); > @@ -1951,11 +1949,10 @@ int kfd_topology_add_device(struct kfd_node *gpu) > gpu_id = kfd_generate_gpu_id(gpu); > if (gpu->xcp && !gpu->xcp->ddev) { > dev_warn(gpu->adev->dev, > - "Won't add GPU (ID: 0x%x) to topology since it has no drm node > assigned.", > - gpu_id); > + "Won't add GPU to topology since it has no drm node > assigned."); > return 0; > } else { > - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); > + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n"); > } > > /* Check to see if this gpu device exists in the topology_device_list. > @@ -1967,7 +1964,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) > down_write(_lock); > dev = kfd_assign_gpu(gpu); > if (!dev) > - res = kfd_topology_add_device_locked(gpu, gpu_id, ); > + res = kfd_topology_add_device_locked(gpu, ); > up_write(_lock); > if (res) > return res;
Re: [PATCH 2/2] drm/amdkfd: Improve chances of unique gpu_id
On 2024-05-01 21:08, Harish Kasiviswanathan wrote: > gpu_id needs to be unique for user space to identify GPUs via KFD > interface. Do a single pass search to detect collision. If > detected, increment gpu_id by one. > > Probability of collisions are very rare. Hence, no more complexity is > added to ensure uniqueness.> > Signed-off-by: Harish Kasiviswanathan > --- > drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 ++-- > 1 file changed, 10 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > index b93913934b03..f2d1e82e7bed 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node > *gpu) > uint32_t hashout; > uint32_t buf[8]; > uint64_t local_mem_size; > + struct kfd_topology_device *dev; > + bool is_unique = true; > int i; > > if (!gpu) > @@ -1115,7 +1117,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node > *gpu) > for (i = 0, hashout = 0; i < 8; i++) > hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); > > - return hashout; > + down_read(_lock); > + list_for_each_entry(dev, _device_list, list) { > + if (dev->gpu && dev->gpu_id == hashout) > + is_unique = false; You can break early here. > + } > + up_read(_lock); > + return is_unique ? hashout : ++hashout; We should make sure that hashout stays within the KFD_GPU_ID_HASH_WIDTH. And if we're already adding a collision check, we may as well make it air-tight. It should be easy enough by wrapping it in a do-while loop. While we're at it, can we also check that the hash is not 0, because that value is used for non-GPU nodes? 
I think this would satisfy all my requests: do { if (!hashout) hashout++; is_unique = true; list_for_each_entry(dev, _device_list, list) { if (dev->gpu && dev->gpu_id == hashout) { is_unique = false; hashout = (hashout + 1) & ((1U << KFD_GPU_ID_HASH_WIDTH) - 1); break; } } } while (!is_unique); Regards, Felix > } > /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If > * the GPU device is not already present in the topology device > @@ -1946,7 +1954,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) > struct amdgpu_gfx_config *gfx_info = >adev->gfx.config; > struct amdgpu_cu_info *cu_info = >adev->gfx.cu_info; > > - gpu_id = kfd_generate_gpu_id(gpu); > if (gpu->xcp && !gpu->xcp->ddev) { > dev_warn(gpu->adev->dev, >"Won't add GPU to topology since it has no drm node > assigned."); > @@ -1969,6 +1976,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) > if (res) > return res; > > + gpu_id = kfd_generate_gpu_id(gpu); > dev->gpu_id = gpu_id; > gpu->id = gpu_id; >
Re: [PATCH v2] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()
On 2024-05-01 16:38, Ramesh Errabolu wrote: Analysis of code by Coverity, a static code analyser, has identified a resource leak in the symbol hmm_range. This leak occurs when one of the prior steps before it is released encounters an error. Signed-off-by: Ramesh Errabolu Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 386875e6eb96..481cb958e165 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1696,7 +1696,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + /* Free backing memory of hmm_range if it was initialized +* Overrride return value to TRY AGAIN only if prior returns +* were successful +*/ + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; }
Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()
On 2024-05-01 14:34, Felix Kuehling wrote: On 2024-04-30 19:29, Ramesh Errabolu wrote: Analysis of code by Coverity, a static code analyser, has identified a resource leak in the symbol hmm_range. This leak occurs when one of the prior steps before it is released encounters an error. Signed-off-by: Ramesh Errabolu --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 386875e6eb96..dcb1d5d3f860 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + // Free backing memory of hmm_range if it was initialized + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; Nack! This can now override other error codes that aren't meant to be overridden with -EAGAIN. I think a better solution would be to just reverse this condition to ensure that amdgpu_hmm_range_get_pages_done is always called: if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { Correction: You still need the NULL check: if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { ... } Regards, Felix ... r = -EAGAIN; } Regards, Felix }
Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()
On 2024-04-30 19:29, Ramesh Errabolu wrote: Analysis of code by Coverity, a static code analyser, has identified a resource leak in the symbol hmm_range. This leak occurs when one of the prior steps before it is released encounters an error. Signed-off-by: Ramesh Errabolu --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 386875e6eb96..dcb1d5d3f860 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + // Free backing memory of hmm_range if it was initialized + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; Nack! This can now override other error codes that aren't meant to be overridden with -EAGAIN. I think a better solution would be to just reverse this condition to ensure that amdgpu_hmm_range_get_pages_done is always called: if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { ... r = -EAGAIN; } Regards, Felix }
Re: [PATCH v2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs
On 2024-04-30 6:08, Lang Yu wrote: Small APUs(i.e., consumer, embedded products) usually have a small carveout device memory which can't satisfy most compute workloads memory allocation requirements. We can't even run a Basic MNIST Example with a default 512MB carveout. https://github.com/pytorch/examples/tree/main/mnist. Error Log when running mnist: "torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes is free. Of the allocated memory 103.83 MiB is allocated by PyTorch, and 22.17 MiB is reserved by PyTorch but unallocated" Though we can change BIOS settings to enlarge carveout size, which is inflexible and may bring complaint. On the other hand, the memory resource can't be effectively used between host and device. The solution is MI300A approach, i.e., let VRAM allocations go to GTT. Then device and host can effectively share system memory. v2: Report local_mem_size_private as 0. (Felix) Signed-off-by: Lang Yu Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 5 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 ++- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 ++- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 7ba05f030dd1..e3738d417245 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev, else mem_info->local_mem_size_private = KFD_XCP_MEMORY_SIZE(adev, xcp->id); + } else if (adev->flags & AMD_IS_APU) { + mem_info->local_mem_size_public = (ttm_tt_pages_limit() << PAGE_SHIFT); + mem_info->local_mem_size_private = 0; } else { mem_info->local_mem_size_public = adev->gmc.visible_vram_size; mem_info->local_mem_size_private = 
adev->gmc.real_vram_size - @@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id) } do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition); return ALIGN_DOWN(tmp, PAGE_SIZE); + } else if (adev->flags & AMD_IS_APU) { + return (ttm_tt_pages_limit() << PAGE_SHIFT); } else { return adev->gmc.real_vram_size; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 4bdf59213384..5843c3d35cb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, return -EINVAL; vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id); - if (adev->gmc.is_app_apu) { + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { system_mem_needed = size; ttm_mem_needed = size; } @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, "adev reference can't be null when vram is used"); if (adev && xcp_id >= 0) { adev->kfd.vram_used[xcp_id] += vram_needed; - adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ? + adev->kfd.vram_used_aligned[xcp_id] += + (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? vram_needed : ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN); } @@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, if (adev) { adev->kfd.vram_used[xcp_id] -= size; - if (adev->gmc.is_app_apu) { + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { adev->kfd.vram_used_aligned[xcp_id] -= size; kfd_mem_limit.system_mem_used -= size; kfd_mem_limit.ttm_mem_used -= size; @@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, * if peer device has large BAR. In contrast, access over xGMI is * allowed for both small and large BAR configurations of peer device */ - if ((adev != bo_adev && !adev->gmc.is_app_apu) && + if (
Re: [PATCH 2/3] drm/amdgpu: Reduce mem_type to domain double indirection
On 2024-04-29 12:47, Tvrtko Ursulin wrote: From: Tvrtko Ursulin All apart from AMDGPU_GEM_DOMAIN_GTT memory domains map 1:1 to TTM placements. And the former be either AMDGPU_PL_PREEMPT or TTM_PL_TT, depending on AMDGPU_GEM_CREATE_PREEMPTIBLE. Simplify a few places in the code which convert the TTM placement into a domain by checking against the current placement directly. In the conversion AMDGPU_PL_PREEMPT either does not have to be handled because amdgpu_mem_type_to_domain() cannot return that value anyway. v2: * Remove AMDGPU_PL_PREEMPT handling. Signed-off-by: Tvrtko Ursulin Reviewed-by: Christian König # v1 Reviewed-by: Felix Kuehling I also ran kfdtest on a multi-GPU system just to make sure this didn't break our multi-GPU support. BTW, I had to fix up some things when I tried to apply your patch to the current amd-staging-drm-next branch. That branch was just rebased on Linux 6.8, so maybe that's part of the reason. --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 27 + 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c12..0b3b10d21952 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, if (r) return ERR_PTR(r); - } else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) & -AMDGPU_GEM_DOMAIN_GTT)) { + } else if (bo->tbo.resource->mem_type != TTM_PL_TT) { return ERR_PTR(-EBUSY); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..eb5bd6962560 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -976,12 +976,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, ttm_bo_pin(>tbo); - domain = 
amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); - if (domain == AMDGPU_GEM_DOMAIN_VRAM) { + if (bo->tbo.resource->mem_type == TTM_PL_VRAM) { atomic64_add(amdgpu_bo_size(bo), >vram_pin_size); atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); - } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); } @@ -1280,7 +1279,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, { uint64_t size = amdgpu_bo_size(bo); struct drm_gem_object *obj; - unsigned int domain; bool shared; /* Abort if the BO doesn't currently have a backing store */ @@ -1290,21 +1288,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, obj = >tbo.base; shared = drm_gem_object_is_shared_for_memory_stats(obj); - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); - switch (domain) { - case AMDGPU_GEM_DOMAIN_VRAM: + switch (bo->tbo.resource->mem_type) { + case TTM_PL_VRAM: stats->vram += size; if (amdgpu_bo_in_cpu_visible_vram(bo)) stats->visible_vram += size; if (shared) stats->vram_shared += size; break; - case AMDGPU_GEM_DOMAIN_GTT: + case TTM_PL_TT: stats->gtt += size; if (shared) stats->gtt_shared += size; break; - case AMDGPU_GEM_DOMAIN_CPU: + case TTM_PL_SYSTEM: default: stats->cpu += size; if (shared) @@ -1317,7 +1314,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) stats->requested_visible_vram += size; - if (domain != AMDGPU_GEM_DOMAIN_VRAM) { + if (bo->tbo.resource->mem_type != TTM_PL_VRAM) { stats->evicted_vram += size; if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) stats->evicted_visible_vram += size; @@ -1592,19 +1589,17 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) u64 size; if (dma_resv_trylock(bo->tbo.base.resv)) { - unsigned int domain; - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); - switch (domain) { - case AMDGPU_GEM_DOMAIN_VRAM: + 
switch (bo->
Re: [PATCH] drm/amdkfd: update buffer_{store,load}_* modifiers for gfx940
On 2024-04-29 17:50, Jay Cornwall wrote: On 4/29/2024 06:06, Lancelot SIX wrote: Instruction modifiers of the untyped vector memory buffer instructions (MUBUF encoded) changed in gfx940. The slc, scc and glc modifiers have been replaced with sc0, sc1 and nt. The current CWSR trap handler is written using pre-gfx940 modifier names, making the source incompatible with a strict gfx940 assembler. This patch updates the cwsr_trap_handler_gfx9.s source file to be compatible with all gfx9 variants of the ISA. The binary assembled code is unchanged (so the behaviour is unchanged as well), only the source representation is updated. Signed-off-by: Lancelot SIX --- .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 --- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index bb26338204f4..a2d597d7fb57 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -48,6 +48,12 @@ var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger var SINGLE_STEP_MISSED_WORKAROUND = (ASIC_FAMILY <= CHIP_ALDEBARAN) //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#if ASIC_FAMILY < CHIP_GC_9_4_3 +#define VMEM_MODIFIERS slc:1 glc:1 +#else +#define VMEM_MODIFIERS sc0:1 nt:1 +#endif + /**/ /* variables */ /**/ @@ -581,7 +587,7 @@ end L_SAVE_LDS_LOOP_VECTOR: ds_read_b64 v[0:1], v2 //x =LDS[a], byte address s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset VMEM_MODIFIERS offen:1 // s_waitcnt vmcnt(0) // v_add_u32 v2, vcc[0:1], v2, v3 v_add_u32 v2, v2, v3 @@ -979,17 +985,17 @@ L_TCP_STORE_CHECK_DONE: end function 
write_4vgprs_to_mem(s_rsrc, s_mem_offset) - buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 end function read_4vgprs_from_mem(s_rsrc, s_mem_offset) - buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 s_waitcnt vmcnt(0) end base-commit: cf743996352e327f483dc7d66606c90276f57380 Reviewed-by: Jay Cornwall Acked-by: Felix Kuehling Do you need me to submit the patch to amd-staging-drm-next? Thanks, Felix
Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs
On 2024-04-29 06:38, Yu, Lang wrote: [Public] -Original Message- From: Kuehling, Felix Sent: Saturday, April 27, 2024 6:45 AM To: Yu, Lang ; amd-gfx@lists.freedesktop.org Cc: Yang, Philip ; Koenig, Christian ; Zhang, Yifan ; Liu, Aaron Subject: Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs On 2024-04-26 04:37, Lang Yu wrote: The default ttm_tt_pages_limit is 1/2 of system memory. It is prone to out of memory with such a configuration. Indiscriminately allowing the violation of all memory limits is not a good solution. It will lead to poor performance once you actually reach ttm_pages_limit and TTM starts swapping out BOs. Hi Felix, I just feel it's like a bug that 1/2 of system memory is free, the driver tells users out of memory. On the other hand, if memory is available, why not use it. TTM does not allow us to use more than 1/2 system memory. I believe that's because TTM needs additional memory to swap out BOs. Any GTT allocation through the render node APIs is subject to the same limitations. Render node APIs can handle memory overcommitment more gracefully because the kernel mode driver is in the loop for command submissions and fences. That doesn't work for KFD with user mode queues. The memory limits in KFD are there to prevent overcommitting memory because we need all of our memory (per process) to be resident at the same time. If we let KFD exceed the TTM limits, we get into situations where we're thrashing (processes evicting each other constantly) or even worse, where we're just not able to make all memory resident. So we end up with suspended user mode queues and extremely poor performance or soft hangs. By the way, can we use USERPTR for VRAM allocations? Then we don't have ttm_tt_pages_limit limitations. Thanks. No. There is an expectation that VRAM BOs can be shared between processes through DMABufs (for HIP IPC APIs). You can't export userptrs as DMABufs. You can try to raise the TTM pages limit using a TTM module parameter. 
But this is taking a risk for system stability when TTM gets into a situation where it needs to swap out a large BO. Regards, Felix I actually did some tests on Strix (12 CU@2100 MHz, 29412M 128bits LPDDR5@937MHz) with https://github.com/ROCm/pytorch-micro-benchmarking. Command: python micro_benchmarking_pytorch.py --network resnet50 --batch-size=64 --iterations=20 1, Run 1 resnet50 (FP32, batch size 64) Memory usage: System mem used 6748M out of 29412M TTM mem used 6658M out of 15719M Memory oversubscription percentage: 0 Throughput [img/sec] : 49.04 2, Run 2 resnet50 simultaneously (FP32, batch size 64) Memory usage: System mem used 13496M out of 29412M TTM mem used 13316M out of 15719M Memory oversubscription percentage: 0 Throughput [img/sec] (respectively) : 25.27 / 26.70 3, Run 3 resnet50 simultaneously (FP32, batch size 64) Memory usage: System mem used 20245M out of 29412M TTM mem used 19974M out of 15719M Memory oversubscription percentage: ~27% Throughput [img/sec](respectively) : 10.62 / 7.47 / 6.90 (In theory: 16 / 16 / 16) From my observations, 1, GPU is underutilized a lot, sometimes its loading is less than 50% and even 0, when running 3 resnet50 simultaneously with ~27% memory oversubscription. The driver is busying evicting and restoring process. It takes ~2-5 seconds to restore all the BOs for one process (swap in and out BOs, actually allocate and copy pages), even though the process doesn't need all the allocated BOs to be resident. 2, Sometimes, the fairness can't be guaranteed between process when memory is oversubscribed. They can't share the GPU equally when created with default priority. 3, The less GPU underutilization time during evicting and restoring, the less performance degradation under memory oversubscription. 
Regards, Lang Regards, Felix Signed-off-by: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 3295838e9a1d..c01c6f3ab562 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) int i; int last_valid_bit; -amdgpu_amdkfd_gpuvm_init_mem_limits(); +amdgpu_amdkfd_gpuvm_init_mem_limits(adev); if (adev->kfd.dev) { struct kgd2kfd_shared_resources gpu_resources = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 1de021ebdd46..13284dbd8c58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++
Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting
On 2024-04-29 5:43, Tvrtko Ursulin wrote: On 26/04/2024 23:24, Felix Kuehling wrote: On 2024-04-26 12:43, Tvrtko Ursulin wrote: From: Tvrtko Ursulin When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") added a new TTM region it missed to notice the conceptual imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin. That imbalance leads to such objects getting accounted against the resource, but are not un-accounted when unpinned. AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be pinned. In any case you should make sure that the accounting is consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This patch breaks that consistency. You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run for such objects, or something else? Right. amdgpu_bo_pin_restricted will return an error for userptr BOs: if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) return -EPERM; If they run, then at the end of pin there is: domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); ... } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); You changed that in your patch 2: - } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT || + bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); } I was suggesting you just change this in patch 2 like this, so it matches what's done on unpin: - } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); } And unpin has no handling for AMDGPU_PL_PREEMPT. Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as checking the domain as stored in the bo at creation time. Although I am still confused by the statement userptr BOs are not pinned. 
It is not needed to map them via GART on AMD hardware for GPU to be able to access them? Fix by extending the accounting criteria in amdgpu_bo_unpin. What also appears to need fixing is not reporting their size from the amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are not mixed with the regular userspace created and driver owned objects. I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT does use system memory and it is GPU accessible, just like GTT. The only difference is, that it's not subject to the GTT limits because their eviction is handled by callbacks other than TTM evictions and doesn't need to wait for fences. As in you think those two hunks of the patch are correct? Yes. It seems, Christian agrees but wants to show preemptible memory separately in debugfs instead of not showing it at all. Regards, Felix Regards, Tvrtko Regards, Felix And also amdgpu_bo_print_info for debugfs reporting. Note that the patch depends on the previous one which broke down the relevant checks from the domain based to placement based. 
Signed-off-by: Tvrtko Ursulin Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") Cc: Felix Kuehling Cc: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index fb984669fc3a..5a2bbc793953 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1032,7 +1032,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo) atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size); atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); - } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT || + bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) { atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size); } @@ -1298,7 +1299,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, stats->vram_shared += size; break; case TTM_PL_TT: - case AMDGPU_PL_PREEMPT: stats->gtt += size; if (shared) stats->gtt_shared += size; @@ -1599,7 +1599,6 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) placement = "VRAM"; break; case TTM_PL_TT: - case AMDGPU_PL_PREEMPT: placement = "GTT"; break; case TTM_PL_SYSTEM:
Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting
On 2024-04-29 9:45, Tvrtko Ursulin wrote: On 29/04/2024 12:11, Christian König wrote: Am 29.04.24 um 11:43 schrieb Tvrtko Ursulin: On 26/04/2024 23:24, Felix Kuehling wrote: On 2024-04-26 12:43, Tvrtko Ursulin wrote: From: Tvrtko Ursulin When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") added a new TTM region it missed to notice the conceptual imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin. That imbalance leads to such objects getting accounted against the resource, but are not un-accounted when unpinned. AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be pinned. In any case you should make sure that the accounting is consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This patch breaks that consistency. You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run for such objects, or something else? If they run, then at the end of pin there is: domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); ... } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); And unpin has no handling for AMDGPU_PL_PREEMPT. Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as checking the domain as stored in the bo at creation time. Although I am still confused by the statement userptr BOs are not pinned. It is not needed to map them via GART on AMD hardware for GPU to be able to access them? No, a GART mapping is only needed if you want to scanout from them or otherwise use them from the kernel on the GPU. Background is that the kernel doesn't has VM with page tables.. Got it, thanks! Presumably somewhere else in the code then it is prevented to call pin/unpin on those? 
I was referring to this condition in amdgpu_bo_pin_restricted: if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) return -EPERM; However, when I look into it more, I see that AMDGPU_PL_PREEMPT is used for other SG BOs that actually are pinned, specifically BOs created by KFD with KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL or KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP. These are very small BOs (one or two pages), and only one per process, per GPU, so I'm not sure it's worth adding special handling for them in the BO pin accounting. Regards, Felix What to do, if anything, with the attempt to address the asymmetry in the accounting criteria between the pin and unpin? I mean domain based on pin: domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); if (domain == AMDGPU_GEM_DOMAIN_VRAM) { atomic64_add(amdgpu_bo_size(bo), >vram_pin_size); atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { atomic64_add(amdgpu_bo_size(bo), >gart_pin_size); } Versus placement based on unpin: if (bo->tbo.resource->mem_type == TTM_PL_VRAM) { atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size); atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size); } The fact amdgpu_mem_type_to_domain never translates back to AMDGPU_PL_PREEMPT means there is indeed currently no bug. Is 2/3 still desirable to convert the check in pin to me mem_type based? Fix by extending the accounting criteria in amdgpu_bo_unpin. What also aappears needs fixing is not reporting their size from the amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are not mixed with the regular userspace created and driver owned objects. I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT does use system memory and it is GPU accessible, just like GTT. 
The only difference is, that it's not subject to the GTT limits because their eviction is handled by callbacks other than TTM evictions and doesn't need to wait for fences. As in you think those two hunks of the patch are correct? I think so as well, yes. But we still need a name for preemptible BOs while printing them in debugfs. Currently it looks the name is 'CPU': amdgpu_bo_print_info() ... case AMDGPU_GEM_DOMAIN_CPU: default: placement = "CPU"; break; Also, where to account them in struct amdgpu_mem_stats? Regards, Tvrtko Regards, Christian. Regards, Tvrtko Regards, Felix And also amdgpu_bo_print_info for debugfs reporting. Note that the patch depends on the previous one which broke down the relevant checks from the domain based to placement based. Signed-off-by: Tvrtko Ursulin Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") Cc: Felix Kuehl
Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs
On 2024-04-26 04:37, Lang Yu wrote: Small APUs(i.e., consumer, embedded products) usually have a small carveout device memory which can't satisfy most compute workloads memory allocation requirements. We can't even run a Basic MNIST Example with a default 512MB carveout. https://github.com/pytorch/examples/tree/main/mnist. Though we can change BIOS settings to enlarge carveout size, which is inflexible and may bring complaint. On the other hand, the memory resource can't be effectively used between host and device. The solution is MI300A approach, i.e., let VRAM allocations go to GTT. Signed-off-by: Lang Yu Two nit-picks inline. Other than that, this patch looks reasonable to me. --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 6 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 21 +++ drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 ++- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 7ba05f030dd1..3295838e9a1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev, mem_info->local_mem_size_private = KFD_XCP_MEMORY_SIZE(adev, xcp->id); } else { - mem_info->local_mem_size_public = adev->gmc.visible_vram_size; + mem_info->local_mem_size_public = adev->flags & AMD_IS_APU ? + (ttm_tt_pages_limit() << PAGE_SHIFT) : + adev->gmc.visible_vram_size; mem_info->local_mem_size_private = adev->gmc.real_vram_size - adev->gmc.visible_vram_size; On an APU the private size should be reported as 0. 
} @@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id) } do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition); return ALIGN_DOWN(tmp, PAGE_SIZE); + } else if (adev->flags & AMD_IS_APU) { + return (ttm_tt_pages_limit() << PAGE_SHIFT); } else { return adev->gmc.real_vram_size; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index c4f9960dafbb..7eb5afcc4895 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, return -EINVAL; vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id); - if (adev->gmc.is_app_apu) { + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { system_mem_needed = size; ttm_mem_needed = size; } @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, "adev reference can't be null when vram is used"); if (adev && xcp_id >= 0) { adev->kfd.vram_used[xcp_id] += vram_needed; - adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ? + adev->kfd.vram_used_aligned[xcp_id] += + (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? vram_needed : ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN); } @@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, if (adev) { adev->kfd.vram_used[xcp_id] -= size; - if (adev->gmc.is_app_apu) { + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { adev->kfd.vram_used_aligned[xcp_id] -= size; kfd_mem_limit.system_mem_used -= size; kfd_mem_limit.ttm_mem_used -= size; @@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, * if peer device has large BAR. 
In contrast, access over xGMI is * allowed for both small and large BAR configurations of peer device */ - if ((adev != bo_adev && !adev->gmc.is_app_apu) && + if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) && ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@ -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device
Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs
On 2024-04-26 04:37, Lang Yu wrote: The default ttm_tt_pages_limit is 1/2 of system memory. It is prone to out of memory with such a configuration. Indiscriminately allowing the violation of all memory limits is not a good solution. It will lead to poor performance once you actually reach ttm_pages_limit and TTM starts swapping out BOs. Regards, Felix Signed-off-by: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 3295838e9a1d..c01c6f3ab562 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) int i; int last_valid_bit; - amdgpu_amdkfd_gpuvm_init_mem_limits(); + amdgpu_amdkfd_gpuvm_init_mem_limits(adev); if (adev->kfd.dev) { struct kgd2kfd_shared_resources gpu_resources = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 1de021ebdd46..13284dbd8c58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); #if IS_ENABLED(CONFIG_HSA_AMD) -void amdgpu_amdkfd_gpuvm_init_mem_limits(void); +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, struct amdgpu_vm *vm); @@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo); void amdgpu_amdkfd_reserve_system_mem(uint64_t size); #else static inline -void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev) { } diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7eb5afcc4895..a3e623a320b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -60,6 +60,7 @@ static struct { int64_t system_mem_used; int64_t ttm_mem_used; spinlock_t mem_limit_lock; + bool alow_oversubscribe; } kfd_mem_limit; static const char * const domain_bit_to_string[] = { @@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_ad * System (TTM + userptr) memory - 15/16th System RAM * TTM memory - 3/8th System RAM */ -void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev) { struct sysinfo si; uint64_t mem; @@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void) kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT; kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT; + kfd_mem_limit.alow_oversubscribe = !!(adev->flags & AMD_IS_APU); pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n", (kfd_mem_limit.max_system_mem_limit >> 20), (kfd_mem_limit.max_ttm_mem_limit >> 20)); @@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, kfd_mem_limit.max_ttm_mem_limit) || (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > vram_size - reserved_for_pt - atomic64_read(>vram_pin_size))) { - ret = -ENOMEM; - goto release; + if (kfd_mem_limit.alow_oversubscribe) { + pr_warn_ratelimited("Memory is getting oversubscried.\n"); + } else { + ret = -ENOMEM; + goto release; + } } /* Update memory accounting by decreasing available system
Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting
On 2024-04-26 12:43, Tvrtko Ursulin wrote: From: Tvrtko Ursulin When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") added a new TTM region it missed to notice the conceptual imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin. That imbalance leads to such objects getting accounted against the resource, but are not un-accounted when unpinned. AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be pinned. In any case you should make sure that the accounting is consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This patch breaks that consistency. Fix by extending the accounting criteria in amdgpu_bo_unpin. What also aappears needs fixing is not reporting their size from the amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are not mixed with the regular userspace created and driver owned objects. I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT does use system memory and it is GPU accessible, just like GTT. The only difference is, that it's not subject to the GTT limits because their eviction is handled by callbacks other than TTM evictions and doesn't need to wait for fences. Regards, Felix And also amdgpu_bo_print_info for debugfs reporting. Note that the patch depends on the previous one which broke down the relevant checks from the domain based to placement based. 
Signed-off-by: Tvrtko Ursulin Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible SG BOs") Cc: Felix Kuehling Cc: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index fb984669fc3a..5a2bbc793953 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1032,7 +1032,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo) atomic64_sub(amdgpu_bo_size(bo), >vram_pin_size); atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo), >visible_pin_size); - } else if (bo->tbo.resource->mem_type == TTM_PL_TT) { + } else if (bo->tbo.resource->mem_type == TTM_PL_TT || + bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) { atomic64_sub(amdgpu_bo_size(bo), >gart_pin_size); } @@ -1298,7 +1299,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, stats->vram_shared += size; break; case TTM_PL_TT: - case AMDGPU_PL_PREEMPT: stats->gtt += size; if (shared) stats->gtt_shared += size; @@ -1599,7 +1599,6 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) placement = "VRAM"; break; case TTM_PL_TT: - case AMDGPU_PL_PREEMPT: placement = "GTT"; break; case TTM_PL_SYSTEM:
Re: [PATCH] drm/amdkfd: Flush the process wq before creating a kfd_process
On 2024-04-26 14:55, Lancelot SIX wrote: There is a race condition when re-creating a kfd_process for a process. This has been observed when a process under the debugger executes exec(3). In this scenario: - The process executes exec. - This will eventually release the process's mm, which will cause the kfd_process object associated with the process to be freed (kfd_process_free_notifier decrements the reference count to the kfd_process to 0). This causes kfd_process_ref_release to enqueue kfd_process_wq_release to the kfd_process_wq. - The debugger receives the PTRACE_EVENT_EXEC notification, and tries to re-enable AMDGPU traps (KFD_IOC_DBG_TRAP_ENABLE). - When handling this request, KFD tries to re-create a kfd_process. This eventually calls kfd_create_process and kobject_init_and_add. At this point the call to kobject_init_and_add can fail because the old kfd_process.kobj has not been freed yet by kfd_process_wq_release. This patch proposes to avoid this race by making sure to drain kfd_process_wq before creating a new kfd_process object. This way, we know that any cleanup task is done executing when we reach kobject_init_and_add. Signed-off-by: Lancelot SIX Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 58c1fe542193..451bb058cc62 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -829,6 +829,14 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) if (process) { pr_debug("Process already found\n"); } else { + /* If the process just called exec(3), it is possible that the +* cleanup of the kfd_process (following the release of the mm +* of the old process image) is still in the cleanup work queue. +* Make sure to drain any job before trying to recreate any +* resource for this process. 
+*/ + flush_workqueue(kfd_process_wq); + process = create_process(thread); if (IS_ERR(process)) goto out; base-commit: cf743996352e327f483dc7d66606c90276f57380
Re: [PATCH] drm/amdkfd: Enforce queue BO's adev
On 2024-04-24 13:40, Harish Kasiviswanathan wrote: Queue buffer, though it is in system memory, has to be created using the correct amdgpu device. Enforce this as the BO needs to mapped to the GART for MES Hardware scheduler to access it. Signed-off-by: Harish Kasiviswanathan I guess this doesn't break existing user mode. It only makes it fail in a more obvious way. If that's the case, the patch is Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 8fd5e0da628c..963cf6d657cb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -373,6 +373,11 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, err = -EINVAL; goto err_wptr_map_gart; } + if (dev->adev != amdgpu_ttm_adev(wptr_bo->tbo.bdev)) { + pr_err("Queue memory allocated to wrong device\n"); + err = -EINVAL; + goto err_wptr_map_gart; + } err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo); if (err) {
Re: [PATCH v6 0/5] Best effort contiguous VRAM allocation
The series is Reviewed-by: Felix Kuehling On 2024-04-24 11:27, Philip Yang wrote: This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro name. v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6 (Felix) Philip Yang (5): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 23 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 +- include/uapi/linux/kfd_ioctl.h| 4 +++- 4 files changed, 33 insertions(+), 9 deletions(-)
Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation
On 2024-04-23 11:28, Philip Yang wrote: RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was redefined to mean "best effort". Maybe we can drop the explicit "BEST_EFFORT" from this flag as well to keep the name to a reasonable length. 
Regards, Felix /* Allocate memory for later SVM (shared virtual memory) mapping. *
Re: [PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation
On 2024-04-23 11:28, Philip Yang wrote: When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; }
Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation
On 2024-04-23 11:28, Philip Yang wrote: If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It always runs uninterruptible. I believe pin_bo runs in the context of ioctls from user mode. So it should be interruptible. Regards, Felix + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; }
Re: [PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time
On 2024-04-23 11:28, Philip Yang wrote: TTM allocate contiguous VRAM may takes more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remainning KFD BOs lock to evict them, this causes TTM failed to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to alloc the contiguous VRAM. Two seconds is a very long time that the GPU will be idle whenever memory gets evicted. Maybe we need to look for a solution where the restore gets scheduled in response to a fence when the migration completes. With my most recent changes I made to the eviction fence handling, I think we can decouple the scheduling of the restore work from the evict work. So we could schedule the delayed restore worker in a fence callback set up in amdgpu_bo_move or somewhere around there, and keep a short delay that starts counting at the end of the eviction move blit. Regards, Felix Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */
Re: [PATCH] drm/amdkfd: handle duplicate BOs in reserve_bo_and_cond_vms
On 2024-04-22 05:10, Lang Yu wrote: Observed on gfx8 ASIC when KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM is used. Two attachments use the same VM, root PD would be locked twice. [ 57.910418] Call Trace: [ 57.793726] ? reserve_bo_and_cond_vms+0x111/0x1c0 [amdgpu] [ 57.793820] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x6c/0x1c0 [amdgpu] [ 57.793923] ? idr_get_next_ul+0xbe/0x100 [ 57.793933] kfd_process_device_free_bos+0x7e/0xf0 [amdgpu] [ 57.794041] kfd_process_wq_release+0x2ae/0x3c0 [amdgpu] [ 57.794141] ? process_scheduled_works+0x29c/0x580 [ 57.794147] process_scheduled_works+0x303/0x580 [ 57.794157] ? __pfx_worker_thread+0x10/0x10 [ 57.794160] worker_thread+0x1a2/0x370 [ 57.794165] ? __pfx_worker_thread+0x10/0x10 [ 57.794167] kthread+0x11b/0x150 [ 57.794172] ? __pfx_kthread+0x10/0x10 [ 57.794177] ret_from_fork+0x3d/0x60 [ 57.794181] ? __pfx_kthread+0x10/0x10 [ 57.794184] ret_from_fork_asm+0x1b/0x30 Signed-off-by: Lang Yu Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 101a2836480d..c4aaf9c394e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1188,7 +1188,8 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem, int ret; ctx->sync = >sync; - drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT); + drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT | + DRM_EXEC_IGNORE_DUPLICATES); drm_exec_until_all_locked(>exec) { ctx->n_vms = 0; list_for_each_entry(entry, >attachments, list) {
Re: [PATCH] drm/amdgpu: Fix VRAM memory accounting
On 2024-04-23 14:56, Mukul Joshi wrote: Subtract the VRAM pinned memory when checking for available memory in amdgpu_amdkfd_reserve_mem_limit function since that memory is not available for use. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 101a2836480d..f672205243e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -220,7 +220,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > kfd_mem_limit.max_ttm_mem_limit) || (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > -vram_size - reserved_for_pt)) { +vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) { ret = -ENOMEM; goto release; }
Re: [PATCH] drm/amdgpu: Fix two reset triggered in a row
On 2024-04-23 01:50, Christian König wrote: Am 22.04.24 um 21:45 schrieb Yunxiang Li: Reset request from KFD is missing a check for if a reset is already in progress, this causes a second reset to be triggered right after the previous one finishes. Add the check to align with the other reset sources. NAK, that isn't how this should be handled. Instead all reset source which are handled by a previous reset should be canceled. In other words there should be a cancel_work(>kfd.reset_work); somewhere in the KFD code. When this doesn't work correctly then that is somehow missing. If you see the use of amdgpu_in_reset() outside of the low level functions than that is clearly a bug. Do we need to do that for all reset workers in the driver separately? I don't see where this is done for other reset workers. Regards, Felix Regards, Christian. Signed-off-by: Yunxiang Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 3b4591f554f1..ce3dbb1cc2da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -283,7 +283,7 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev) void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev) { - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) && !amdgpu_in_reset(adev)) amdgpu_reset_domain_schedule(adev->reset_domain, >kfd.reset_work); }
Re: [PATCH] drm/amdgpu: Fix two reset triggered in a row
On 2024-04-22 16:14, Alex Deucher wrote: On Mon, Apr 22, 2024 at 3:52 PM Yunxiang Li wrote: Reset request from KFD is missing a check for if a reset is already in progress, this causes a second reset to be triggered right after the previous one finishes. Add the check to align with the other reset sources. Acked-by: Alex Deucher Reviewed-by: Felix Kuehling Signed-off-by: Yunxiang Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 3b4591f554f1..ce3dbb1cc2da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -283,7 +283,7 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev) void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev) { - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) && !amdgpu_in_reset(adev)) amdgpu_reset_domain_schedule(adev->reset_domain, &adev->kfd.reset_work); } -- 2.34.1
Re: [PATCH] drm/amdkfd: Add VRAM accounting for SVM migration
On 2024-04-19 12:23, Mukul Joshi wrote: Do VRAM accounting when doing migrations to vram to make sure there is enough available VRAM and migrating to VRAM doesn't evict other possible non-unified memory BOs. If migrating to VRAM fails, driver can fall back to using system memory seamlessly. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 16 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index bdc01ca9609a..a6bfc00c0310 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -509,10 +509,19 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = start_mgr << PAGE_SHIFT; end = (last_mgr + 1) << PAGE_SHIFT; + r = amdgpu_amdkfd_reserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); + if (r) { + dev_dbg(node->adev->dev, "failed to allocate VRAM, size exceeds VRAM limit\n", r); + return -ENOSPC; + } + r = svm_range_vram_node_new(node, prange, true); if (r) { dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r); - return r; + goto out; } ttm_res_offset = (start_mgr - prange->start + prange->offset) << PAGE_SHIFT; @@ -545,6 +554,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, svm_range_vram_node_free(prange); } +out: + amdgpu_amdkfd_unreserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); return r < 0 ? 
r : 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f7d75b432cc6..bfab16b43fec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3426,7 +3426,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); *migrated = !r; - return r; + return 0; } int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
[PATCH] drm/amdkfd: Fix rescheduling of restore worker
Handle the case that the restore worker was already scheduled by another eviction while the restore was in progress. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index aafdf064651f..58c1fe542193 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -2012,9 +2012,9 @@ static void restore_process_worker(struct work_struct *work) if (ret) { pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); - ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); + if (mod_delayed_work(kfd_restore_wq, &p->restore_work, +msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) + kfd_process_restore_queues(p); } } -- 2.34.1
Re: [PATCH v2] drm/amdkfd: make sure VM is ready for updating operations
On 2024-04-11 4:11, Lang Yu wrote: When page table BOs were evicted but not validated before updating page tables, VM is still in evicting state, amdgpu_vm_update_range returns -EBUSY and restore_process_worker runs into a dead loop. v2: Split the BO validation and page table update into two separate loops in amdgpu_amdkfd_restore_process_bos. (Felix) 1.Validate BOs 2.Validate VM (and DMABuf attachments) 3.Update page tables for the BOs validated above Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in compute VMs") Signed-off-by: Lang Yu Reviewed-by: Felix Kuehling --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 34 +++ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..e2c9e6ddb1d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2900,13 +2900,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * amdgpu_sync_create(_obj); - /* Validate BOs and map them to GPUVM (update VM page tables). */ + /* Validate BOs managed by KFD */ list_for_each_entry(mem, _info->kfd_bo_list, validate_list) { struct amdgpu_bo *bo = mem->bo; uint32_t domain = mem->domain; - struct kfd_mem_attachment *attachment; struct dma_resv_iter cursor; struct dma_fence *fence; @@ -2931,6 +2930,25 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * goto validate_map_fail; } } + } + + if (failed_size) + pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); + + /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO +* validations above would invalidate DMABuf imports again. +*/ + ret = process_validate_vms(process_info, ); + if (ret) { + pr_debug("Validating VMs failed, ret: %d\n", ret); + goto validate_map_fail; + } + + /* Update mappings managed by KFD. 
*/ + list_for_each_entry(mem, _info->kfd_bo_list, + validate_list) { + struct kfd_mem_attachment *attachment; + list_for_each_entry(attachment, >attachments, list) { if (!attachment->is_mapped) continue; @@ -2947,18 +2965,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * } } - if (failed_size) - pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); - - /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO -* validations above would invalidate DMABuf imports again. -*/ - ret = process_validate_vms(process_info, ); - if (ret) { - pr_debug("Validating VMs failed, ret: %d\n", ret); - goto validate_map_fail; - } - /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, _info->vm_list_head, vm_list_node) {
[PATCH] drm/amdgpu: Update BO eviction priorities
Make SVM BOs more likely to get evicted than other BOs. These BOs opportunistically use available VRAM, but can fall back relatively seamlessly to system memory. It also avoids SVM migrations evicting other, more important BOs as they will evict other SVM allocations first. Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index cd2dd3ed7153..d80671535ab3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -608,6 +608,8 @@ int amdgpu_bo_create(struct amdgpu_device *adev, else amdgpu_bo_placement_from_domain(bo, bp->domain); if (bp->type == ttm_bo_type_kernel) + bo->tbo.priority = 2; + else if (!(bp->flags & AMDGPU_GEM_CREATE_DISCARDABLE)) bo->tbo.priority = 1; if (!bp->destroy) -- 2.34.1
Re: [PATCH] drm/amdgpu/mes11: print MES opcodes rather than numbers
On 2024-04-17 15:53, Alex Deucher wrote: Makes it easier to review the logs when there are MES errors. v2: use dbg for emitted, add helpers for fetching strings v3: fix missing commas (Harish) Reviewed by Shaoyun.liu (v2) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 78 -- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 81833395324a0..414b7beff397f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -100,18 +100,72 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = { .insert_nop = amdgpu_ring_insert_nop, }; +static const char *mes_v11_0_opcodes[] = { + "MES_SCH_API_SET_HW_RSRC", + "MES_SCH_API_SET_SCHEDULING_CONFIG", + "MES_SCH_API_ADD_QUEUE", + "MES_SCH_API_REMOVE_QUEUE", + "MES_SCH_API_PERFORM_YIELD", + "MES_SCH_API_SET_GANG_PRIORITY_LEVEL", + "MES_SCH_API_SUSPEND", + "MES_SCH_API_RESUME", + "MES_SCH_API_RESET", + "MES_SCH_API_SET_LOG_BUFFER", + "MES_SCH_API_CHANGE_GANG_PRORITY", + "MES_SCH_API_QUERY_SCHEDULER_STATUS", + "MES_SCH_API_PROGRAM_GDS", + "MES_SCH_API_SET_DEBUG_VMID", + "MES_SCH_API_MISC", + "MES_SCH_API_UPDATE_ROOT_PAGE_TABLE", + "MES_SCH_API_AMD_LOG", Maybe drop the prefixes. They don't add any information value and only bloat the log messages and module binary size. 
Other than that, the patch is Acked-by: Felix Kuehling +}; + +static const char *mes_v11_0_misc_opcodes[] = { + "MESAPI_MISC__WRITE_REG", + "MESAPI_MISC__INV_GART", + "MESAPI_MISC__QUERY_STATUS", + "MESAPI_MISC__READ_REG", + "MESAPI_MISC__WAIT_REG_MEM", + "MESAPI_MISC__SET_SHADER_DEBUGGER", +}; + +static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt) +{ + const char *op_str = NULL; + + if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes)) + op_str = mes_v11_0_opcodes[x_pkt->header.opcode]; + + return op_str; +} + +static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt) +{ + const char *op_str = NULL; + + if ((x_pkt->header.opcode == MES_SCH_API_MISC) && + (x_pkt->opcode <= ARRAY_SIZE(mes_v11_0_misc_opcodes))) + op_str = mes_v11_0_misc_opcodes[x_pkt->opcode]; + + return op_str; +} + static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, void *pkt, int size, int api_status_off) { int ndw = size / 4; signed long r; - union MESAPI__ADD_QUEUE *x_pkt = pkt; + union MESAPI__MISC *x_pkt = pkt; struct MES_API_STATUS *api_status; struct amdgpu_device *adev = mes->adev; struct amdgpu_ring *ring = >ring; unsigned long flags; signed long timeout = 300; /* 3000 ms */ + const char *op_str, *misc_op_str; + + if (x_pkt->header.opcode >= MES_SCH_API_MAX) + return -EINVAL; if (amdgpu_emu_mode) { timeout *= 100; @@ -135,13 +189,29 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); - DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode); + op_str = mes_v11_0_get_op_string(x_pkt); + misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); + + if (misc_op_str) + dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str); + else if (op_str) + dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); + else + dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); r = amdgpu_fence_wait_polling(ring, 
ring->fence_drv.sync_seq, timeout); if (r < 1) { - DRM_ERROR("MES failed to response msg=%d\n", - x_pkt->header.opcode); + + if (misc_op_str) + dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n", + op_str, misc_op_str); + else if (op_str) + dev_err(adev->dev, "MES failed to respond to msg=%s\n", + op_str); + else + dev_err(adev->dev, "MES failed to respond to msg=%d\n", + x_pkt->header.opcode); while (halt_if_hws_hang) schedule();
[PATCH] drm/amdkfd: Fix eviction fence handling
Handle case that dma_fence_get_rcu_safe returns NULL. If restore work is already scheduled, only update its timer. The same work item cannot be queued twice, so undo the extra queue eviction. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index b79986412cd8..aafdf064651f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1922,6 +1922,8 @@ static int signal_eviction_fence(struct kfd_process *p) rcu_read_lock(); ef = dma_fence_get_rcu_safe(&p->ef); rcu_read_unlock(); + if (!ef) + return -EINVAL; ret = dma_fence_signal(ef); dma_fence_put(ef); @@ -1949,10 +1951,9 @@ static void evict_process_worker(struct work_struct *work) * they are responsible stopping the queues and scheduling * the restore work. */ - if (!signal_eviction_fence(p)) - queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - else + if (signal_eviction_fence(p) || + mod_delayed_work(kfd_restore_wq, &p->restore_work, +msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); -- 2.34.1
Re: [PATCH] rock-dbg_defconfig: Update for Linux 6.7 with UBSAN
On 2024-04-16 13:02, Chen, Xiaogang wrote: On 4/15/2024 2:49 PM, Felix Kuehling wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. make rock-dbg_defconfig make savedefconfig cp defconfig arch/x86/config/rock-dbg_defconfig This also enables UBSAN, which can help catch some types of bugs at compile time. Enabling UBSAN cause compiler insert code to perform certain kinds of check before operations that may cause undefined behaviour. I think it catches errors at run time, not compile time, and increases kernel size. You're right. I saw it supports some range checking only on arrays where the size is known at compile time. But the range checking itself needs to happen at runtime. Regards, Felix Regards Xiaogang Signed-off-by: Felix Kuehling --- arch/x86/configs/rock-dbg_defconfig | 46 + 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/arch/x86/configs/rock-dbg_defconfig b/arch/x86/configs/rock-dbg_defconfig index 0ad80a8c8eab..80129ca354b4 100644 --- a/arch/x86/configs/rock-dbg_defconfig +++ b/arch/x86/configs/rock-dbg_defconfig @@ -34,11 +34,12 @@ CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_JUMP=y +CONFIG_CRASH_DUMP=y CONFIG_SMP=y -# CONFIG_RETPOLINE is not set CONFIG_X86_INTEL_LPSS=y CONFIG_IOSF_MBI_DEBUG=y CONFIG_HYPERVISOR_GUEST=y @@ -48,9 +49,6 @@ CONFIG_PROCESSOR_SELECT=y CONFIG_GART_IOMMU=y CONFIG_NR_CPUS=256 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_I8K=m -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=m CONFIG_X86_CPUID=m # CONFIG_X86_5LEVEL is not set @@ -61,12 +59,8 @@ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_EFI_MIXED=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_CRASH_DUMP=y -CONFIG_KEXEC_JUMP=y 
CONFIG_PHYSICAL_ALIGN=0x100 -CONFIG_LEGACY_VSYSCALL_EMULATE=y +# CONFIG_RETPOLINE is not set CONFIG_HIBERNATION=y CONFIG_PM_WAKELOCKS=y CONFIG_PM_DEBUG=y @@ -74,7 +68,6 @@ CONFIG_PM_ADVANCED_DEBUG=y CONFIG_PM_TRACE_RTC=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_ACPI_EC_DEBUGFS=m -CONFIG_ACPI_VIDEO=m CONFIG_ACPI_DOCK=y CONFIG_ACPI_PROCESSOR_AGGREGATOR=m CONFIG_ACPI_PCI_SLOT=y @@ -108,6 +101,8 @@ CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y +CONFIG_ZSWAP=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y @@ -115,15 +110,12 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 CONFIG_MEMORY_FAILURE=y CONFIG_HWPOISON_INJECT=m CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y CONFIG_CMA=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZONE_DEVICE=y CONFIG_DEVICE_PRIVATE=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=y @@ -167,7 +159,6 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CONNTRACK_PROCFS is not set CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -178,7 +169,6 @@ CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -270,7 +260,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -312,7 +301,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_PREVENT_FIRMWARE_BUILD is not set -CONFIG_EFI_VARS=y CONFIG_PARPORT=y CONFIG_PARPORT_PC=y CONFIG_PARPORT_SERIAL=y @@ -363,7 +351,6 @@ CONFIG_E1000=y CONFIG_E1000E=y CONFIG_IGB=y CONFIG_IGBVF=y 
-CONFIG_IXGB=y CONFIG_IXGBE=y CONFIG_I40E=y CONFIG_SKY2=y @@ -401,14 +388,14 @@ CONFIG_SENSORS_K10TEMP=m CONFIG_WATCHDOG=y CONFIG_RC_CORE=y CONFIG_RC_DECODERS=y +CONFIG_IR_JVC_DECODER=y +CONFIG_IR_MCE_KBD_DECODER=y CONFIG_IR_NEC_DECODER=y CONFIG_IR_RC5_DECODER=y CONFIG_IR_RC6_DECODER=y -CONFIG_IR_JVC_DECODER=y -CONFIG_IR_SONY_DECODER=y CONFIG_IR_SANYO_DECODER=y CONFIG_IR_SHARP_DECODER=y -CONFIG_IR_MCE_KBD_DECODER=y +CONFIG_IR_SONY_DECODER=y CONFIG_IR_XMP_DECODER=y CONFIG_AGP=y CONFIG_AGP_AMD64=y @@ -422,7 +409,6 @@ CONFIG_HSA_AMD_P2P=y CONFIG_DRM_AST=m CONFIG_FB=y CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_HID_BATTERY_STRENGTH=y CONFIG_HIDRAW=y @@ -456,7 +442,6 @@ CONFIG_R
Re: [PATCH] drm/amdkfd: fix NULL pointer dereference
This patch does not apply to amd-staging-drm-next. This is against a DKMS branch and should be reviewed on our internal mailing list. However, I suspect that part of the problem is, that the DKMS branch has diverged quite a bit in this area, and is missing at least one patch from me that was reverted, probably because of an improper port. The proper solution should involve getting the DKMS branch back in sync with upstream. I'll look into that. Regards, Felix On 2024-04-13 14:07, vitaly.pros...@amd.com wrote: From: Vitaly Prosyak [ +0.006038] BUG: kernel NULL pointer dereference, address: 0028 [ +0.006969] #PF: supervisor read access in kernel mode [ +0.005139] #PF: error_code(0x) - not-present page [ +0.005139] PGD 0 P4D 0 [ +0.002530] Oops: [#1] PREEMPT SMP NOPTI [ +0.004356] CPU: 11 PID: 12625 Comm: kworker/11:0 Tainted: GW 6.7.0+ #2 [ +0.008097] Hardware name: ASUS System Product Name/Pro WS WRX80E-SAGE SE WIFI II, BIOS 1302 12/08/2023 [ +0.009398] Workqueue: events evict_process_worker [amdgpu] [ +0.005750] RIP: 0010:evict_process_worker+0x2f/0x460 [amdgpu] [ +0.005991] Code: 55 48 89 e5 41 57 41 56 4c 8d b7 a8 fc ff ff 41 55 41 54 53 48 89 fb 48 83 ec 10 0f 1f 44 00 00 48 8b 43 f8 8b 93 b0 00 00 00 <48> 3b 50 28 0f 85 50 03 00 00 48 8d 7b 58 e8 ee be cb bf 48 8b 05 [ +0.018791] RSP: 0018:c90009a2be10 EFLAGS: 00010282 [ +0.005226] RAX: RBX: 888197ffc358 RCX: [ +0.007140] RDX: 0a1b RSI: RDI: 888197ffc358 [ +0.007139] RBP: c90009a2be48 R08: R09: [ +0.007139] R10: R11: R12: 888197ffc358 [ +0.007139] R13: 888100153a00 R14: 888197ffc000 R15: 888100153a05 [ +0.007137] FS: () GS:889facac() knlGS: [ +0.008094] CS: 0010 DS: ES: CR0: 80050033 [ +0.005747] CR2: 0028 CR3: 00010d1fc001 CR4: 00770ef0 [ +0.007138] PKRU: 5554 [ +0.002702] Call Trace: [ +0.002443] [ +0.002096] ? show_regs+0x72/0x90 [ +0.003402] ? __die+0x25/0x80 [ +0.003052] ? page_fault_oops+0x154/0x4c0 [ +0.004099] ? do_user_addr_fault+0x30e/0x6e0 [ +0.004357] ? psi_group_change+0x237/0x520 [ +0.004185] ? 
exc_page_fault+0x84/0x1b0 [ +0.003926] ? asm_exc_page_fault+0x27/0x30 [ +0.004187] ? evict_process_worker+0x2f/0x460 [amdgpu] [ +0.005377] process_one_work+0x17b/0x360 [ +0.004011] ? __pfx_worker_thread+0x10/0x10 [ +0.004269] worker_thread+0x307/0x430 [ +0.003748] ? __pfx_worker_thread+0x10/0x10 [ +0.004268] kthread+0xf7/0x130 [ +0.003142] ? __pfx_kthread+0x10/0x10 [ +0.003749] ret_from_fork+0x46/0x70 [ +0.003573] ? __pfx_kthread+0x10/0x10 [ +0.003747] ret_from_fork_asm+0x1b/0x30 [ +0.003924] When we run stressful tests, the eviction fence could be zero and not match to last_eviction_seqno. Avoid calling dma_fence_signal and dma_fence_put with zero fences to rely on checking parameters in DMA API. Cc: Alex Deucher Cc: Christian Koenig Cc: Xiaogang Chen Cc: Felix Kuehling Signed-off-by: Vitaly Prosyak --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index eb380296017d..a15fae1c398a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -2118,7 +2118,7 @@ static void evict_process_worker(struct work_struct *work) */ p = container_of(dwork, struct kfd_process, eviction_work); trace_kfd_evict_process_worker_start(p); - WARN_ONCE(p->last_eviction_seqno != p->ef->seqno, + WARN_ONCE(p->ef && p->last_eviction_seqno != p->ef->seqno, "Eviction fence mismatch\n"); /* Narrow window of overlap between restore and evict work @@ -2134,9 +2134,11 @@ static void evict_process_worker(struct work_struct *work) pr_debug("Started evicting pasid 0x%x\n", p->pasid); ret = kfd_process_evict_queues(p, false, KFD_QUEUE_EVICTION_TRIGGER_TTM); if (!ret) { - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; + if (p->ef) { + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + } if (!kfd_process_unmap_doorbells_if_idle(p)) kfd_process_schedule_restore(p);
[PATCH] rock-dbg_defconfig: Update for Linux 6.7 with UBSAN
make rock-dbg_defconfig make savedefconfig cp defconfig arch/x86/config/rock-dbg_defconfig This also enables UBSAN, which can help catch some types of bugs at compile time. Signed-off-by: Felix Kuehling --- arch/x86/configs/rock-dbg_defconfig | 46 + 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/arch/x86/configs/rock-dbg_defconfig b/arch/x86/configs/rock-dbg_defconfig index 0ad80a8c8eab..80129ca354b4 100644 --- a/arch/x86/configs/rock-dbg_defconfig +++ b/arch/x86/configs/rock-dbg_defconfig @@ -34,11 +34,12 @@ CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_JUMP=y +CONFIG_CRASH_DUMP=y CONFIG_SMP=y -# CONFIG_RETPOLINE is not set CONFIG_X86_INTEL_LPSS=y CONFIG_IOSF_MBI_DEBUG=y CONFIG_HYPERVISOR_GUEST=y @@ -48,9 +49,6 @@ CONFIG_PROCESSOR_SELECT=y CONFIG_GART_IOMMU=y CONFIG_NR_CPUS=256 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_I8K=m -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=m CONFIG_X86_CPUID=m # CONFIG_X86_5LEVEL is not set @@ -61,12 +59,8 @@ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_EFI_MIXED=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_CRASH_DUMP=y -CONFIG_KEXEC_JUMP=y CONFIG_PHYSICAL_ALIGN=0x100 -CONFIG_LEGACY_VSYSCALL_EMULATE=y +# CONFIG_RETPOLINE is not set CONFIG_HIBERNATION=y CONFIG_PM_WAKELOCKS=y CONFIG_PM_DEBUG=y @@ -74,7 +68,6 @@ CONFIG_PM_ADVANCED_DEBUG=y CONFIG_PM_TRACE_RTC=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_ACPI_EC_DEBUGFS=m -CONFIG_ACPI_VIDEO=m CONFIG_ACPI_DOCK=y CONFIG_ACPI_PROCESSOR_AGGREGATOR=m CONFIG_ACPI_PCI_SLOT=y @@ -108,6 +101,8 @@ CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y +CONFIG_ZSWAP=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y @@ -115,15 +110,12 @@ 
CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 CONFIG_MEMORY_FAILURE=y CONFIG_HWPOISON_INJECT=m CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y CONFIG_CMA=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZONE_DEVICE=y CONFIG_DEVICE_PRIVATE=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=y @@ -167,7 +159,6 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CONNTRACK_PROCFS is not set CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -178,7 +169,6 @@ CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_NETDEV=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -270,7 +260,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -312,7 +301,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_PREVENT_FIRMWARE_BUILD is not set -CONFIG_EFI_VARS=y CONFIG_PARPORT=y CONFIG_PARPORT_PC=y CONFIG_PARPORT_SERIAL=y @@ -363,7 +351,6 @@ CONFIG_E1000=y CONFIG_E1000E=y CONFIG_IGB=y CONFIG_IGBVF=y -CONFIG_IXGB=y CONFIG_IXGBE=y CONFIG_I40E=y CONFIG_SKY2=y @@ -401,14 +388,14 @@ CONFIG_SENSORS_K10TEMP=m CONFIG_WATCHDOG=y CONFIG_RC_CORE=y CONFIG_RC_DECODERS=y +CONFIG_IR_JVC_DECODER=y +CONFIG_IR_MCE_KBD_DECODER=y CONFIG_IR_NEC_DECODER=y CONFIG_IR_RC5_DECODER=y CONFIG_IR_RC6_DECODER=y -CONFIG_IR_JVC_DECODER=y -CONFIG_IR_SONY_DECODER=y CONFIG_IR_SANYO_DECODER=y CONFIG_IR_SHARP_DECODER=y -CONFIG_IR_MCE_KBD_DECODER=y +CONFIG_IR_SONY_DECODER=y CONFIG_IR_XMP_DECODER=y CONFIG_AGP=y CONFIG_AGP_AMD64=y @@ -422,7 +409,6 @@ CONFIG_HSA_AMD_P2P=y CONFIG_DRM_AST=m CONFIG_FB=y CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_FRAMEBUFFER_CONSOLE=y 
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_HID_BATTERY_STRENGTH=y CONFIG_HIDRAW=y @@ -456,7 +442,6 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set CONFIG_DMADEVICES=y CONFIG_DMABUF_MOVE_NOTIFY=y -# CONFIG_X86_PLATFORM_DEVICES is not set CONFIG_AMD_IOMMU=y CONFIG_INTEL_IOMMU=y # CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set @@ -473,9 +458,7 @@ CONFIG_XFS_WARN=y CONFIG_FANOTIFY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y -CONFIG_AUTOFS4_FS=y CONFIG_FUSE_FS=m CONFIG_CUSE=m CONFIG_OVERLAY_FS=y @@ -509,22 +492,21 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,
[PATCH] drm/amdkfd: Fix memory leak in create_process failure
Fix memory leak due to a leaked mmget reference on an error handling code path that is triggered when attempting to create KFD processes while a GPU reset is in progress. Fixes: 0ab2d7532b05 ("drm/amdkfd: prepare per-process debug enable and disable") CC: Xiaogang Chen Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 717a60d7a4ea..b79986412cd8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -819,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) mutex_lock(&kfd_processes_mutex); if (kfd_is_locked()) { - mutex_unlock(&kfd_processes_mutex); pr_debug("KFD is locked! Cannot create process"); - return ERR_PTR(-EINVAL); + process = ERR_PTR(-EINVAL); + goto out; } /* A prior open of /dev/kfd could have already created the process. */ -- 2.34.1
Re: [PATCH] drm/amdkfd: make sure VM is ready for updating operations
On 2024-04-08 3:55, Christian König wrote: Am 07.04.24 um 06:52 schrieb Lang Yu: When VM is in evicting state, amdgpu_vm_update_range would return -EBUSY. Then restore_process_worker runs into a dead loop. Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in compute VMs") Mhm, while it would be good to have this case handled as error it should never occur in practice since we should have validated the VM before validating the DMA-bufs. @Felix isn't that something we have taken care of? The problem I saw when I implemented Auto-validate was, that migration of a BO invalidates its DMABuf attachments. So I need to validate the DMABuf attachments after validating the BOs they attach to. This auto-validation happens in amdgpu_vm_validate. So I needed to do the VM validation after the BO validation. The problem now seems to be that the BO validation happens in the same loop as the page table update. And the page table update fails if the VM is not valid. I never saw this problem in my testing, probably because I never got my page tables evicted? Anyway, I think the solution is to split the BO validation and page table update into two separate loops in amdgpu_amdkfd_restore_process_pos: 1. Validate BOs 2. Validate VM (and DMABuf attachments) 3. Update page tables for the BOs validated above Regards, Felix Regards, Christian. 
Signed-off-by: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..8c71fe07807a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2900,6 +2900,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * amdgpu_sync_create(_obj); + ret = process_validate_vms(process_info, NULL); + if (ret) { + pr_debug("Validating VMs failed, ret: %d\n", ret); + goto validate_map_fail; + } + /* Validate BOs and map them to GPUVM (update VM page tables). */ list_for_each_entry(mem, _info->kfd_bo_list, validate_list) {
Re: [PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted
On 2024-04-03 14:12, Zhigang Luo wrote: If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault. Signed-off-by: Zhigang Luo This patch is Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++--- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..719d6d365e15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. 
error"); + mutex_unlock(_processes_mutex); } return ret;
Re: [PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted
On 2024-04-01 17:53, Zhigang Luo wrote: If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault. Signed-off-by: Zhigang Luo Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd Please remove the Change-Id: before you push. Other than that, this patch is --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..55f89c858c7a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -969,11 +969,11 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) if (!run_pm) { mutex_lock(_processes_mutex); count = ++kfd_locked; - mutex_unlock(_processes_mutex); /* For first KFD device suspend all the KFD processes */ if (count == 1) kfd_suspend_all_processes(); This could be simplified now. The variable "count" was only needed for the broken attempt to do call suspend outside the lock. Now you can just do: mutex_lock(_processes_mutex); if (++kfd_locked == 1) kfd_suspend_all_processes(); mutex_unlock(_processes_mutex); To be consistent, we probably need to make a similar change in kgd2kfd_resume and run kfd_resume_all_processes under the lock as well. Otherwise there could be a race condition between suspend and resume. Regards, Felix + mutex_unlock(_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) {
Re: Proposal to add CRIU support to DRM render nodes
On 2024-04-01 12:56, Tvrtko Ursulin wrote: On 01/04/2024 17:37, Felix Kuehling wrote: On 2024-04-01 11:09, Tvrtko Ursulin wrote: On 28/03/2024 20:42, Felix Kuehling wrote: On 2024-03-28 12:03, Tvrtko Ursulin wrote: Hi Felix, I had one more thought while browsing around the amdgpu CRIU plugin. It appears it relies on the KFD support being compiled in and /dev/kfd present, correct? AFAICT at least, it relies on that to figure out the amdgpu DRM node. It would probably be good to consider designing things without that dependency. So that checkpointing an application which does not use /dev/kfd is possible. Or if the kernel does not even have the KFD support compiled in. Yeah, if we want to support graphics apps that don't use KFD, we should definitely do that. Currently we get a lot of topology information from KFD, not even from the /dev/kfd device but from the sysfs nodes exposed by KFD. We'd need to get GPU device info from the render nodes instead. And if KFD is available, we may need to integrate both sources of information. It could perhaps mean no more than adding some GPU discovery code into CRIU. Which should be flexible enough to account for things like re-assigned minor numbers due to driver reload. Do you mean adding GPU discovery to the core CRIU, or to the plugin. I was thinking this is still part of the plugin. Yes I agree. I was only thinking about adding some DRM device discovery code in a more decoupled fashion from the current plugin, for both the reason discussed above (decoupling a bit from reliance on kfd sysfs), and then also if/when a new DRM driver might want to implement this the code could be moved to some common plugin area. I am not sure how feasible that would be though. The "gpu id" concept and its matching in the current kernel code and CRIU plugin - is that value tied to the physical GPU instance or how it works? The concept of the GPU ID is that it's stable while the system is up, even when devices get added and removed dynamically. 
It was baked into the API early on, but I don't think we ever fully validated device hot plug. I think the closest we're getting is with our latest MI GPUs and dynamic partition mode change. Doesn't it read the saved gpu id from the image file while doing restore and tries to open the render node to match it? Maybe I am misreading the code.. But if it does, does it imply that in practice it could be stable across reboots? Or that it is not possible to restore to a different instance of maybe the same GPU model installed in a system? Ah, the idea is, that when you restore on a different system, you may get different GPU IDs. Or you may checkpoint an app running on GPU 1 but restore it on GPU 2 on the same system. That's why we need to translate GPU IDs in restored applications. User mode still uses the old GPU IDs, but the kernel mode driver translates them to the actual GPU IDs of the GPUs that the process was restored on. This also highlights another aspect on those spatially partitioned GPUs. GPU IDs identify device partitions, not devices. Similarly, each partition has its own render node, and the KFD topology info in sysfs points to the render-minor number corresponding to each GPU ID. I am not familiar with this. This is not SR-IOV but some other kind of partitioning? Would you have any links where I could read more? Right, the bare-metal driver can partition a PF spatially without SRIOV. SRIOV can also use spatial partitioning and expose each partition through its own VF, but that's not useful for bare metal. Spatial partitioning is new in MI300. There is some high-level info in this whitepaper: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf. Regards, Felix Regards, Tvrtko Otherwise I am eagerly awaiting to hear more about the design specifics around dma-buf handling. And also seeing how to extend to other DRM related anonymous fds. I've been pretty far under-water lately. 
I hope I'll find time to work on this more, but it's probably going to be at least a few weeks. Got it. Regards, Tvrtko Regards, Felix Regards, Tvrtko On 15/03/2024 18:36, Tvrtko Ursulin wrote: On 15/03/2024 02:33, Felix Kuehling wrote: On 2024-03-12 5:45, Tvrtko Ursulin wrote: On 11/03/2024 14:48, Tvrtko Ursulin wrote: Hi Felix, On 06/12/2023 21:23, Felix Kuehling wrote: Executive Summary: We need to add CRIU support to DRM render nodes in order to maintain CRIU support for ROCm application once they start relying on render nodes for more GPU memory management. In this email I'm providing some background why we are doing this, and outlining some of the problems we need to solve to checkpoint and restore render node state and shared memory (DMABuf) state. I have some thoughts on the API design, leaning on what we did for KFD, but would like to get feedback from the DRI
Re: Proposal to add CRIU support to DRM render nodes
On 2024-04-01 11:09, Tvrtko Ursulin wrote: On 28/03/2024 20:42, Felix Kuehling wrote: On 2024-03-28 12:03, Tvrtko Ursulin wrote: Hi Felix, I had one more thought while browsing around the amdgpu CRIU plugin. It appears it relies on the KFD support being compiled in and /dev/kfd present, correct? AFAICT at least, it relies on that to figure out the amdgpu DRM node. It would probably be good to consider designing things without that dependency. So that checkpointing an application which does not use /dev/kfd is possible. Or if the kernel does not even have the KFD support compiled in. Yeah, if we want to support graphics apps that don't use KFD, we should definitely do that. Currently we get a lot of topology information from KFD, not even from the /dev/kfd device but from the sysfs nodes exposed by KFD. We'd need to get GPU device info from the render nodes instead. And if KFD is available, we may need to integrate both sources of information. It could perhaps mean no more than adding some GPU discovery code into CRIU. Which should be flexible enough to account for things like re-assigned minor numbers due to driver reload. Do you mean adding GPU discovery to the core CRIU, or to the plugin. I was thinking this is still part of the plugin. Yes I agree. I was only thinking about adding some DRM device discovery code in a more decoupled fashion from the current plugin, for both the reason discussed above (decoupling a bit from reliance on kfd sysfs), and then also if/when a new DRM driver might want to implement this the code could be moved to some common plugin area. I am not sure how feasible that would be though. The "gpu id" concept and its matching in the current kernel code and CRIU plugin - is that value tied to the physical GPU instance or how it works? The concept of the GPU ID is that it's stable while the system is up, even when devices get added and removed dynamically. 
It was baked into the API early on, but I don't think we ever fully validated device hot plug. I think the closest we're getting is with our latest MI GPUs and dynamic partition mode change. This also highlights another aspect on those spatially partitioned GPUs. GPU IDs identify device partitions, not devices. Similarly, each partition has its own render node, and the KFD topology info in sysfs points to the render-minor number corresponding to each GPU ID. Regards, Felix Otherwise I am eagerly awaiting to hear more about the design specifics around dma-buf handling. And also seeing how to extend to other DRM related anonymous fds. I've been pretty far under-water lately. I hope I'll find time to work on this more, but it's probably going to be at least a few weeks. Got it. Regards, Tvrtko Regards, Felix Regards, Tvrtko On 15/03/2024 18:36, Tvrtko Ursulin wrote: On 15/03/2024 02:33, Felix Kuehling wrote: On 2024-03-12 5:45, Tvrtko Ursulin wrote: On 11/03/2024 14:48, Tvrtko Ursulin wrote: Hi Felix, On 06/12/2023 21:23, Felix Kuehling wrote: Executive Summary: We need to add CRIU support to DRM render nodes in order to maintain CRIU support for ROCm application once they start relying on render nodes for more GPU memory management. In this email I'm providing some background why we are doing this, and outlining some of the problems we need to solve to checkpoint and restore render node state and shared memory (DMABuf) state. I have some thoughts on the API design, leaning on what we did for KFD, but would like to get feedback from the DRI community regarding that API and to what extent there is interest in making that generic. We are working on using DRM render nodes for virtual address mappings in ROCm applications to implement the CUDA11-style VM API and improve interoperability between graphics and compute. This uses DMABufs for sharing buffer objects between KFD and multiple render node devices, as well as between processes. 
In the long run this also provides a path to moving all or most memory management from the KFD ioctl API to libdrm. Once ROCm user mode starts using render nodes for virtual address management, that creates a problem for checkpointing and restoring ROCm applications with CRIU. Currently there is no support for checkpointing and restoring render node state, other than CPU virtual address mappings. Support will be needed for checkpointing GEM buffer objects and handles, their GPU virtual address mappings and memory sharing relationships between devices and processes. Eventually, if full CRIU support for graphics applications is desired, more state would need to be captured, including scheduler contexts and BO lists. Most of this state is driver-specific. After some internal discussions we decided to take our design process public as this potentially touches DRM GEM and DMABuf APIs and may have implications for other drivers in the future. One basic question before going in
Re: Proposal to add CRIU support to DRM render nodes
On 2024-03-28 12:03, Tvrtko Ursulin wrote: Hi Felix, I had one more thought while browsing around the amdgpu CRIU plugin. It appears it relies on the KFD support being compiled in and /dev/kfd present, correct? AFAICT at least, it relies on that to figure out the amdgpu DRM node. It would probably be good to consider designing things without that dependency. So that checkpointing an application which does not use /dev/kfd is possible. Or if the kernel does not even have the KFD support compiled in. Yeah, if we want to support graphics apps that don't use KFD, we should definitely do that. Currently we get a lot of topology information from KFD, not even from the /dev/kfd device but from the sysfs nodes exposed by KFD. We'd need to get GPU device info from the render nodes instead. And if KFD is available, we may need to integrate both sources of information. It could perhaps mean no more than adding some GPU discovery code into CRIU. Which should be flexible enough to account for things like re-assigned minor numbers due to driver reload. Do you mean adding GPU discovery to the core CRIU, or to the plugin. I was thinking this is still part of the plugin. Otherwise I am eagerly awaiting to hear more about the design specifics around dma-buf handling. And also seeing how to extend to other DRM related anonymous fds. I've been pretty far under-water lately. I hope I'll find time to work on this more, but it's probably going to be at least a few weeks. Regards, Felix Regards, Tvrtko On 15/03/2024 18:36, Tvrtko Ursulin wrote: On 15/03/2024 02:33, Felix Kuehling wrote: On 2024-03-12 5:45, Tvrtko Ursulin wrote: On 11/03/2024 14:48, Tvrtko Ursulin wrote: Hi Felix, On 06/12/2023 21:23, Felix Kuehling wrote: Executive Summary: We need to add CRIU support to DRM render nodes in order to maintain CRIU support for ROCm application once they start relying on render nodes for more GPU memory management. 
In this email I'm providing some background why we are doing this, and outlining some of the problems we need to solve to checkpoint and restore render node state and shared memory (DMABuf) state. I have some thoughts on the API design, leaning on what we did for KFD, but would like to get feedback from the DRI community regarding that API and to what extent there is interest in making that generic. We are working on using DRM render nodes for virtual address mappings in ROCm applications to implement the CUDA11-style VM API and improve interoperability between graphics and compute. This uses DMABufs for sharing buffer objects between KFD and multiple render node devices, as well as between processes. In the long run this also provides a path to moving all or most memory management from the KFD ioctl API to libdrm. Once ROCm user mode starts using render nodes for virtual address management, that creates a problem for checkpointing and restoring ROCm applications with CRIU. Currently there is no support for checkpointing and restoring render node state, other than CPU virtual address mappings. Support will be needed for checkpointing GEM buffer objects and handles, their GPU virtual address mappings and memory sharing relationships between devices and processes. Eventually, if full CRIU support for graphics applications is desired, more state would need to be captured, including scheduler contexts and BO lists. Most of this state is driver-specific. After some internal discussions we decided to take our design process public as this potentially touches DRM GEM and DMABuf APIs and may have implications for other drivers in the future. One basic question before going into any API details: Is there a desire to have CRIU support for other DRM drivers? This sounds like a very interesting feature on the overall, although I cannot answer on the last question here. I forgot to finish this thought. 
I cannot answer / don't know of any concrete plans, but I think the feature is pretty cool and if amdgpu gets it working I wouldn't be surprised if other drivers would get interested. Thanks, that's good to hear! Funnily enough, it has a tiny relation to an i915 feature I recently implemented on Mesa's request, which is to be able to "upload" the GPU context from the GPU hang error state and replay the hanging request. It is kind of (at a stretch) a very special tiny subset of checkpoint and restore so I am mentioning it as a curiosity. And there is also another partial conceptual intersect with the (at the moment not yet upstream) i915 online debugger. This part being in the area of discovering and enumerating GPU resources belonging to the client. I don't see any immediate design or code sharing opportunities though but just mentioning. I did spend some time reading your plugin and kernel implementation out of curiosity and have some comments and questions. With that out of the way, some cons
Re: [PATCH] drm/amdgpu: use vm_update_mode=0 as default in sriov for gfx10.3 onwards
On 2024-03-28 13:59, Danijel Slivka wrote: Apply this rule to all newer asics in sriov case. For asic with VF MMIO access protection avoid using CPU for VM table updates. CPU pagetable updates have issues with HDP flush as VF MMIO access protection blocks write to BIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL register during sriov runtime. Please mention that you moved the check to amdgpu_device_init to ensure that it runs after amdgpu_device_ip_early_init where the IP versions are discovered. Signed-off-by: Danijel Slivka --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 6 -- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 12dc71a6b5db..59ee902a1eaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4072,6 +4072,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* Enable TMZ based on IP_VERSION */ amdgpu_gmc_tmz_set(adev); + if (amdgpu_sriov_vf(adev) && + (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))) Please fix the indentation. The second line should be aligned with the open parenthesis from the previous line. You could also remove the extra parentheses around the comparison. They're not needed, and IMO they make the code less readable. 
With that fixed, the patch is Reviewed-by: Felix Kuehling + /* VF MMIO access (except mailbox range) from CPU +* will be blocked during sriov runtime +*/ + adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; + amdgpu_gmc_noretry_set(adev); /* Need to get xgmi info early to decide the reset behavior*/ if (adev->gmc.xgmi.supported) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index aed60aaf1a55..6f01de220c44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -724,12 +724,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; } - if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) - /* VF MMIO access (except mailbox range) from CPU -* will be blocked during sriov runtime -*/ - adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; - /* we have the ability to check now */ if (amdgpu_sriov_vf(adev)) { switch (adev->asic_type) {
Re: [PATCH 1/2] drm/amdgpu: always allocate cleared VRAM for KFD allocations
On 2024-03-26 11:52, Alex Deucher wrote: This adds allocation latency, but aligns better with user expectations. The latency should improve with the drm buddy clearing patches that Arun has been working on. If we submit this before the clear-page-tracking patches are in, this will cause unacceptable performance regressions for ROCm applications. Regards, Felix Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..f9a4ea082821 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1709,7 +1709,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_domain = AMDGPU_GEM_DOMAIN_GTT; alloc_flags = 0; } else { - alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE | + AMDGPU_GEM_CREATE_VRAM_CLEARED; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; }
Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new MES FW version
On 2024-03-25 19:33, Liu, Shaoyun wrote: [AMD Official Use Only - General] It can cause page fault when the log size exceed the page size . I'd consider that a breaking change in the firmware that should be avoided. Is there a way the updated driver can tell the FW the log size that it allocated, so that old drivers continue to work with new firmware? Regards, Felix -Original Message- From: Kuehling, Felix Sent: Monday, March 25, 2024 2:58 PM To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new MES FW version On 2024-03-22 12:49, shaoyunl wrote: From MES version 0x54, the log entry increased and require the log buffer size to be increased. The 16k is maximum size agreed What happens when you run the new firmware on an old kernel that only allocates 4KB? Regards, Felix Signed-off-by: shaoyunl --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 9ace848e174c..78e4f88f5134 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, +PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, >mes.event_log_gpu_obj, >mes.event_log_gpu_addr, @@ -1548,12 +1548,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, - mem, PAGE_SIZE, false); + mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); return 0; } - DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 7d4f93fea937..4c8fc3117ef8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size +for MES */ struct amdgpu_mes_funcs;
Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read
On 2024-03-26 12:04, Alam, Dewan wrote: [AMD Official Use Only - General] Looping in +@Zhang, Zhaochen CAM control register can only be written by PF. VF can only read the register. In SRIOV VF, the write won't work. In SRIOV case, CAM's enablement is controlled by the host. Hence, we think the enablement status should be decided by the register reading. Thank you for clarifying that. With that in mind, I would suggest changes to the commit headline and description to avoid confusion: drm/amdgpu: Confirm IH retry CAM enablement by reading the register Under SRIOV, the IH CAM cannot be enabled by the guest. The host controls this register. In the guest driver, read the register to confirm whether the CAM was enabled. Regards, Felix Thanks, Dewan -Original Message- From: Kuehling, Felix Sent: Wednesday, March 13, 2024 3:46 PM To: Alam, Dewan;amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking Subject: Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read On 2024-03-13 13:43, Dewan Alam wrote: IH Retry CAM should be enabled by register reads instead of always being set to true. This explanation sounds odd. Your code is still writing the register first. What's the reason for reading back the register? I assume it's not needed for enabling the CAM, but to check whether it was enabled successfully. What are the configurations where it cannot be enabled successfully? Two more nit-picks inline ... 
Signed-off-by: Dewan Alam --- drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c index b9e785846637..c330f5a88a06 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c @@ -337,13 +337,20 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev) /* Enable IH Retry CAM */ if (amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 0) || - amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2)) + amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2)) +{ WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE, 1); - else + adev->irq.retry_cam_enabled = REG_GET_FIELD( + RREG32_SOC15(OSSSYS, 0, + mmIH_RETRY_INT_CAM_CNTL_ALDEBARAN), + IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE); + } else { Indentation looks wrong here. WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL, ENABLE, 1); - - adev->irq.retry_cam_enabled = true; + adev->irq.retry_cam_enabled = REG_GET_FIELD( + RREG32_SOC15(OSSSYS, 0, + mmIH_RETRY_INT_CAM_CNTL), + IH_RETRY_INT_CAM_CNTL, ENABLE); + } Wrong indentation. Regards, Felix /* enable interrupts */ ret = vega20_ih_toggle_interrupts(adev, true);
Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device
On 2024-03-26 10:53, Philip Yang wrote: On 2024-03-25 14:45, Felix Kuehling wrote: On 2024-03-22 15:57, Zhigang Luo wrote: it will cause page fault after device recovered if there is a process running. Signed-off-by: Zhigang Luo Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 70261eb9b0bb..2867e9186e44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); + amdgpu_amdkfd_wait_no_process_running(adev); + This waits for the processes to be terminated. What would cause the processes to be terminated? Why do the processes need to be terminated? Isn't it enough if the processes are removed from the runlist in pre-reset, so they can no longer execute on the GPU? mode 1 reset on SRIOV is much faster than BM, kgd2kfd_pre_reset sends GPU reset event to user space, don't remove queues from the runlist, after mode1 reset is done, there is queue still running and generate vm fault because the GPU page table is gone. I think seeing a page fault during the reset is not a problem. Seeing a page fault after the reset would be a bug. The process should not be on the runlist after the reset is done. Waiting for the process to terminate first looks like a workaround, when the real bug is maybe that we're not updating the process state correctly in pre-reset. All currently running processes should be put into evicted state, so they are not put back on the runlist after the reset. Regards, Felix Regards, Philip Regards, Felix amdgpu_device_stop_pending_resets(adev); if (from_hypervisor)
Re: [PATCH] drm/amdgpu : Increase the mes log buffer size as per new MES FW version
On 2024-03-22 12:49, shaoyunl wrote: From MES version 0x54, the log entry increased and require the log buffer size to be increased. The 16k is maximum size agreed What happens when you run the new firmware on an old kernel that only allocates 4KB? Regards, Felix Signed-off-by: shaoyunl --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 9ace848e174c..78e4f88f5134 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, >mes.event_log_gpu_obj, >mes.event_log_gpu_addr, @@ -1548,12 +1548,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, -mem, PAGE_SIZE, false); +mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); return 0; } - DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 7d4f93fea937..4c8fc3117ef8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */ struct amdgpu_mes_funcs;
Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device
On 2024-03-22 15:57, Zhigang Luo wrote: it will cause page fault after device recovered if there is a process running. Signed-off-by: Zhigang Luo Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 70261eb9b0bb..2867e9186e44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); + amdgpu_amdkfd_wait_no_process_running(adev); + This waits for the processes to be terminated. What would cause the processes to be terminated? Why do the processes need to be terminated? Isn't it enough if the processes are removed from the runlist in pre-reset, so they can no longer execute on the GPU? Regards, Felix amdgpu_device_stop_pending_resets(adev); if (from_hypervisor)
Re: [PATCH] drm/amdkfd: Cleanup workqueue during module unload
On 2024-03-20 18:52, Mukul Joshi wrote: Destroy the high priority workqueue that handles interrupts during KFD node cleanup. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index dd3c43c1ad70..9b6b6e882593 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -104,6 +104,8 @@ void kfd_interrupt_exit(struct kfd_node *node) */ flush_workqueue(node->ih_wq); + destroy_workqueue(node->ih_wq); + kfifo_free(>ih_fifo); }
Re: [PATCH] drm/amdkfd: range check cp bad op exception interrupts
On 2024-03-13 10:21, Jonathan Kim wrote: Due to a CP interrupt bug, bad packet garbage exception codes are raised. Do a range check so that the debugger and runtime do not receive garbage codes. Update the user api to guard exception code type checking as well. Signed-off-by: Jonathan Kim Tested-by: Jesse Zhang Reviewed-by: Felix Kuehling --- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c| 3 ++- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c| 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 ++- include/uapi/linux/kfd_ioctl.h | 17 ++--- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index a8e76287dde0..013d0a073b9b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -339,7 +339,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, break; } kfd_signal_event_interrupt(pasid, context_id0 & 0x7f, 23); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 7e2859736a55..fe2ad0c0de95 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -328,7 +328,8 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, context_id0, 32); - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) kfd_set_dbg_ev_from_interrupt(dev, pasid, 
KFD_CTXID0_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index ff7392336795..5483211c5d3d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -388,7 +388,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; } kfd_signal_event_interrupt(pasid, sq_int_data, 24); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 9ce46edc62a5..2040a470ddb4 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -913,14 +913,25 @@ enum kfd_dbg_trap_exception_code { KFD_EC_MASK(EC_DEVICE_NEW)) #define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \ KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE)) +#define KFD_EC_MASK_PACKET (KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |\ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \ +KFD_EC_MASK(EC_QUEUE_PACKET_RESERVED) | \ +KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |\ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)) /* Checks for exception code types for KFD search */ +#define KFD_DBG_EC_IS_VALID(ecode) (ecode > EC_NONE && ecode < EC_MAX) #define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \ -
Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info
On 2024-03-18 16:12, Felix Kuehling wrote: On 2024-03-15 14:17, Mukul Joshi wrote: Check cgroup permissions when returning DMA-buf info and based on cgroup check return the id of the GPU that has access to the BO. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dfa8c69532d4..f9631f4b1a02 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, /* Find a KFD GPU device that supports the get_dmabuf_info query */ for (i = 0; kfd_topology_enum_kfd_devices(i, ) == 0; i++) - if (dev) + if (dev && !kfd_devcgroup_check_permission(dev)) break; if (!dev) return -EINVAL; @@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, if (xcp_id >= 0) args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id; else - args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + args->gpu_id = dev->id; If I remember correctly, this was meant as a fallback in case for GTT BOs where the exporting partition wasn't known and the application didn't have access to the first partition. I think the way you wrote this, it could also change the behaviour (report the wrong GPU ID) on single-partition GPUs, which is probably not intended. Never mind. I double checked: On single-partition GPUs, bo->xcp_id always seems to be 0. So your code won't change the behaviour here. The patch is Reviewed-by: Felix Kuehling Maybe this would preserve the behaviour for that case: ... - else + else if (!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev->nodes[0])) args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + else + args->gpu_id = dev->id; Or maybe a more general solution would make DMABuf import work when the exporter is really unknown or not even a GPU. 
This came up not so long ago in the context of interop with 3rd-party devices. This may require user mode changes as well. Regards, Felix args->flags = flags; /* Copy metadata buffer to user mode */
Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info
On 2024-03-20 15:09, Joshi, Mukul wrote: [AMD Official Use Only - General] -Original Message- From: Kuehling, Felix Sent: Monday, March 18, 2024 4:13 PM To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info On 2024-03-15 14:17, Mukul Joshi wrote: Check cgroup permissions when returning DMA-buf info and based on cgroup check return the id of the GPU that has access to the BO. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dfa8c69532d4..f9631f4b1a02 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, /* Find a KFD GPU device that supports the get_dmabuf_info query */ for (i = 0; kfd_topology_enum_kfd_devices(i, ) == 0; i++) - if (dev) + if (dev && !kfd_devcgroup_check_permission(dev)) break; if (!dev) return -EINVAL; @@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, if (xcp_id >= 0) args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id; else - args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + args->gpu_id = dev->id; If I remember correctly, this was meant as a fallback in case for GTT BOs where the exporting partition wasn't known and the application didn't have access to the first partition. I think the way you wrote this, it could also change the behaviour (report the wrong GPU ID) on single-partition GPUs, which is probably not intended. Maybe this would preserve the behaviour for that case: Can you please explain why this could be a issue on a single partition GPU? What would xcp_id be on a single-partition GPU? If it's < 0, then your patch changes the behaviour. 
Instead or returning the GPU ID from the GPU where the memory was allocated, it returns some arbitrary GPU that the application has access to. Regards, Felix Regards, Mukul ... - else + else if (!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev- nodes[0])) args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + else + args->gpu_id = dev->id; Or maybe a more general solution would make DMABuf import work when the exporter is really unknown or not even a GPU. This came up not so long ago in the context of interop with 3rd-party devices. This may require user mode changes as well. Regards, Felix args->flags = flags; /* Copy metadata buffer to user mode */
Re: [PATCH] drm/amdkfd: Check cgroup when returning DMABuf info
On 2024-03-15 14:17, Mukul Joshi wrote: Check cgroup permissions when returning DMA-buf info and based on cgroup check return the id of the GPU that has access to the BO. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dfa8c69532d4..f9631f4b1a02 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1523,7 +1523,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, /* Find a KFD GPU device that supports the get_dmabuf_info query */ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) - if (dev) + if (dev && !kfd_devcgroup_check_permission(dev)) break; if (!dev) return -EINVAL; @@ -1545,7 +1545,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, if (xcp_id >= 0) args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id; else - args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + args->gpu_id = dev->id; If I remember correctly, this was meant as a fallback in case for GTT BOs where the exporting partition wasn't known and the application didn't have access to the first partition. I think the way you wrote this, it could also change the behaviour (report the wrong GPU ID) on single-partition GPUs, which is probably not intended. Maybe this would preserve the behaviour for that case: ... - else + else if (!kfd_devcgroup_check_permission(dmabuf_adev->kfd.dev->nodes[0])) args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + else + args->gpu_id = dev->id; Or maybe a more general solution would make DMABuf import work when the exporter is really unknown or not even a GPU. This came up not so long ago in the context of interop with 3rd-party devices. This may require user mode changes as well. Regards, Felix args->flags = flags; /* Copy metadata buffer to user mode */
Re: [PATCH 05/10] drivers: use new capable_any functionality
On 2024-03-15 7:37, Christian Göttsche wrote: Use the new added capable_any function in appropriate cases, where a task is required to have any of two capabilities. Reorder CAP_SYS_ADMIN last. Signed-off-by: Christian Göttsche Acked-by: Alexander Gordeev (s390 portion) Acked-by: Felix Kuehling (amdkfd portion) --- v4: Additional usage in kfd_ioctl() v3: rename to capable_any() --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 +-- drivers/net/caif/caif_serial.c | 2 +- drivers/s390/block/dasd_eckd.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dfa8c69532d4..8c7ebca01c17 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -3290,8 +3290,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) * more priviledged access. */ if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) { - if (!capable(CAP_CHECKPOINT_RESTORE) && - !capable(CAP_SYS_ADMIN)) { + if (!capable_any(CAP_CHECKPOINT_RESTORE, CAP_SYS_ADMIN)) { retcode = -EACCES; goto err_i1; } diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c index ed3a589def6b..e908b9ce57dc 100644 --- a/drivers/net/caif/caif_serial.c +++ b/drivers/net/caif/caif_serial.c @@ -326,7 +326,7 @@ static int ldisc_open(struct tty_struct *tty) /* No write no play */ if (tty->ops->write == NULL) return -EOPNOTSUPP; - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_TTY_CONFIG)) + if (!capable_any(CAP_SYS_TTY_CONFIG, CAP_SYS_ADMIN)) return -EPERM; /* release devices to avoid name collision */ diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 373c1a86c33e..8f9a5136306a 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -5384,7 +5384,7 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) char psf0, psf1; int rc; - if (!capable(CAP_SYS_ADMIN) && 
!capable(CAP_SYS_RAWIO)) + if (!capable_any(CAP_SYS_RAWIO, CAP_SYS_ADMIN)) return -EACCES; psf0 = psf1 = 0;
Re: Proposal to add CRIU support to DRM render nodes
On 2024-03-12 5:45, Tvrtko Ursulin wrote: On 11/03/2024 14:48, Tvrtko Ursulin wrote: Hi Felix, On 06/12/2023 21:23, Felix Kuehling wrote: Executive Summary: We need to add CRIU support to DRM render nodes in order to maintain CRIU support for ROCm application once they start relying on render nodes for more GPU memory management. In this email I'm providing some background why we are doing this, and outlining some of the problems we need to solve to checkpoint and restore render node state and shared memory (DMABuf) state. I have some thoughts on the API design, leaning on what we did for KFD, but would like to get feedback from the DRI community regarding that API and to what extent there is interest in making that generic. We are working on using DRM render nodes for virtual address mappings in ROCm applications to implement the CUDA11-style VM API and improve interoperability between graphics and compute. This uses DMABufs for sharing buffer objects between KFD and multiple render node devices, as well as between processes. In the long run this also provides a path to moving all or most memory management from the KFD ioctl API to libdrm. Once ROCm user mode starts using render nodes for virtual address management, that creates a problem for checkpointing and restoring ROCm applications with CRIU. Currently there is no support for checkpointing and restoring render node state, other than CPU virtual address mappings. Support will be needed for checkpointing GEM buffer objects and handles, their GPU virtual address mappings and memory sharing relationships between devices and processes. Eventually, if full CRIU support for graphics applications is desired, more state would need to be captured, including scheduler contexts and BO lists. Most of this state is driver-specific. After some internal discussions we decided to take our design process public as this potentially touches DRM GEM and DMABuf APIs and may have implications for other drivers in the future. 
One basic question before going into any API details: Is there a desire to have CRIU support for other DRM drivers? This sounds like a very interesting feature on the overall, although I cannot answer on the last question here. I forgot to finish this thought. I cannot answer / don't know of any concrete plans, but I think the feature is pretty cool and if amdgpu gets it working I wouldn't be surprised if other drivers would get interested. Thanks, that's good to hear! Funnily enough, it has a tiny relation to an i915 feature I recently implemented on Mesa's request, which is to be able to "upload" the GPU context from the GPU hang error state and replay the hanging request. It is kind of (at a stretch) a very special tiny subset of checkpoint and restore so I am not mentioning it as a curiosity. And there is also another partial conceptual intersect with the (at the moment not yet upstream) i915 online debugger. This part being in the area of discovering and enumerating GPU resources belonging to the client. I don't see any immediate design or code sharing opportunities though but just mentioning. I did spend some time reading your plugin and kernel implementation out of curiosity and have some comments and questions. With that out of the way, some considerations for a possible DRM CRIU API (either generic or AMDGPU driver specific): The API goes through several phases during checkpoint and restore: Checkpoint: 1. Process-info (enumerates objects and sizes so user mode can allocate memory for the checkpoint, stops execution on the GPU) 2. Checkpoint (store object metadata for BOs, queues, etc.) 3. Unpause (resumes execution after the checkpoint is complete) Restore: 1. Restore (restore objects, VMAs are not in the right place at this time) 2. Resume (final fixups after the VMAs are sorted out, resume execution) Btw is check-pointing guaranteeing all relevant activity is idled? 
For instance dma_resv objects are free of fences which would need to restored for things to continue executing sensibly? Or how is that handled? In our compute use cases, we suspend user mode queues. This can include CWSR (compute-wave-save-restore) where the state of in-flight waves is stored in memory and can be reloaded and resumed from memory later. We don't use any fences other than "eviction fences", that are signaled after the queues are suspended. And those fences are never handed to user mode. So we don't need to worry about any fence state in the checkpoint. If we extended this to support the kernel mode command submission APIs, I would expect that we'd wait for all current submissions to complete, and stop new ones from being sent to the HW before taking the checkpoint. When we take the checkpoint in the CRIU plugin, the CPU threads are already frozen and cannot submit any more work. If we wait for all currently pending submissions to dra
Re: [PATCH 2/2] drm/amdkfd: Check preemption status on all XCDs
On 2024-03-14 12:00, Mukul Joshi wrote: This patch adds the following functionality: - Check the queue preemption status on all XCDs in a partition for GFX 9.4.3. - Update the queue preemption debug message to print the queue doorbell id for which preemption failed. - Change the signature of check preemption failed function to return a bool instead of uint32_t and pass the MQD manager as an argument. Suggested-by: Jay Cornwall Signed-off-by: Mukul Joshi --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +-- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 18 + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 4 ++- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 +-- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 4 +-- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 4 +-- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 25 --- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 +-- 8 files changed, 52 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 1ce398ab0b3d..151fabf84040 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1997,8 +1997,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->check_preemption_failed(dqm->packet_mgr.priv_queue->queue->mqd)) { - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { while (halt_if_hws_hang) schedule(); return -ETIME; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 050a6936ff84..cbec8c87c984 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -290,3 +290,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm, 
{ return mm->mqd_size; } + +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst) +{ + if (doorbell_id) { + struct device *dev = node->adev->dev; + + if (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) Could this be made more generic? E.g.: if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0) Other than that, the series is Reviewed-by: Felix Kuehling + dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n", + inst, doorbell_id); + else + dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n", + doorbell_id); + return true; + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index ba3eebb2ca6d..17cc1f25c8d0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -119,7 +119,7 @@ struct mqd_manager { #if defined(CONFIG_DEBUG_FS) int (*debugfs_show_mqd)(struct seq_file *m, void *data); #endif - uint32_t (*check_preemption_failed)(void *mqd); + bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd); uint64_t (*mqd_stride)(struct mqd_manager *mm, struct queue_properties *p); @@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev); uint64_t kfd_mqd_stride(struct mqd_manager *mm, struct queue_properties *q); +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst); #endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 8f9f56f7a8b0..05f3ac2eaef9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t check_preemption_failed(void *mqd) +static bool 
check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct cik_mqd *m = (struct cik_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static void update_
Re: [PATCH AUTOSEL 5.15 3/5] drm/amdgpu: Enable gpu reset for S3 abort cases on Raven series
On 2024-03-11 11:14, Sasha Levin wrote: From: Prike Liang [ Upstream commit c671ec01311b4744b377f98b0b4c6d033fe569b3 ] Currently, GPU resets can now be performed successfully on the Raven series. While GPU reset is required for the S3 suspend abort case. So now can enable gpu reset for S3 abort cases on the Raven series. This looks suspicious to me. I'm not sure what conditions made the GPU reset successful. But unless all the changes involved were also backported, this should probably not be applied to older kernel branches. I'm speculating it may be related to the removal of AMD IOMMUv2. Regards, Felix Signed-off-by: Prike Liang Acked-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/soc15.c | 45 +- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 6a3486f52d698..ef5b3eedc8615 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -605,11 +605,34 @@ soc15_asic_reset_method(struct amdgpu_device *adev) return AMD_RESET_METHOD_MODE1; } +static bool soc15_need_reset_on_resume(struct amdgpu_device *adev) +{ + u32 sol_reg; + + sol_reg = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + + /* Will reset for the following suspend abort cases. +* 1) Only reset limit on APU side, dGPU hasn't checked yet. +* 2) S3 suspend abort and TOS already launched. +*/ + if (adev->flags & AMD_IS_APU && adev->in_s3 && + !adev->suspend_complete && + sol_reg) + return true; + + return false; +} + static int soc15_asic_reset(struct amdgpu_device *adev) { /* original raven doesn't have full asic reset */ - if ((adev->apu_flags & AMD_APU_IS_RAVEN) || - (adev->apu_flags & AMD_APU_IS_RAVEN2)) + /* On the latest Raven, the GPU reset can be performed +* successfully. So now, temporarily enable it for the +* S3 suspend abort case. 
+*/ + if (((adev->apu_flags & AMD_APU_IS_RAVEN) || + (adev->apu_flags & AMD_APU_IS_RAVEN2)) && + !soc15_need_reset_on_resume(adev)) return 0; switch (soc15_asic_reset_method(adev)) { @@ -1490,24 +1513,6 @@ static int soc15_common_suspend(void *handle) return soc15_common_hw_fini(adev); } -static bool soc15_need_reset_on_resume(struct amdgpu_device *adev) -{ - u32 sol_reg; - - sol_reg = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); - - /* Will reset for the following suspend abort cases. -* 1) Only reset limit on APU side, dGPU hasn't checked yet. -* 2) S3 suspend abort and TOS already launched. -*/ - if (adev->flags & AMD_IS_APU && adev->in_s3 && - !adev->suspend_complete && - sol_reg) - return true; - - return false; -} - static int soc15_common_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle;
Re: [PATCH] drm/amdgpu: Do a basic health check before reset
On 2024-03-13 5:41, Lijo Lazar wrote: Check if the device is present in the bus before trying to recover. It could be that device itself is lost from the bus in some hang situations. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++ 1 file changed, 24 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1e9454e6e4cb..b37113b79483 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5536,6 +5536,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) } +static int amdgpu_device_health_check(struct list_head *device_list_handle) +{ + struct amdgpu_device *tmp_adev; + int ret = 0; + u32 status; + + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, ); + if (PCI_POSSIBLE_ERROR(status)) { + dev_err(tmp_adev->dev, "device lost from bus!"); + ret = -ENODEV; You could just return here. What's the point of looking for other devices if you're going to return an error anyway? Regards, Felix + } + } + + return ret; +} + /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, device_list_handle = _list; } + if (!amdgpu_sriov_vf(adev)) { + r = amdgpu_device_health_check(device_list_handle); + if (r) + goto end_reset; + } + /* We need to lock reset domain only once both for XGMI and single device */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); @@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +end_reset: if (hive) { mutex_unlock(>hive_lock); amdgpu_put_xgmi_hive(hive);
Re: [PATCH] drm/amd/amdgpu: Enable IH Retry CAM by register read
On 2024-03-13 13:43, Dewan Alam wrote: IH Retry CAM should be enabled by register reads instead of always being set to true. This explanation sounds odd. Your code is still writing the register first. What's the reason for reading back the register? I assume it's not needed for enabling the CAM, but to check whether it was enabled successfully. What are the configurations where it cannot be enabled successfully? Two more nit-picks inline ... Signed-off-by: Dewan Alam --- drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c index b9e785846637..c330f5a88a06 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c @@ -337,13 +337,20 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev) /* Enable IH Retry CAM */ if (amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 0) || - amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2)) + amdgpu_ip_version(adev, OSSSYS_HWIP, 0) == IP_VERSION(4, 4, 2)) { WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE, 1); - else + adev->irq.retry_cam_enabled = REG_GET_FIELD( + RREG32_SOC15(OSSSYS, 0, + mmIH_RETRY_INT_CAM_CNTL_ALDEBARAN), + IH_RETRY_INT_CAM_CNTL_ALDEBARAN, ENABLE); + } else { Indentation looks wrong here. WREG32_FIELD15(OSSSYS, 0, IH_RETRY_INT_CAM_CNTL, ENABLE, 1); - - adev->irq.retry_cam_enabled = true; + adev->irq.retry_cam_enabled = REG_GET_FIELD( + RREG32_SOC15(OSSSYS, 0, + mmIH_RETRY_INT_CAM_CNTL), + IH_RETRY_INT_CAM_CNTL, ENABLE); + } Wrong indentation. Regards, Felix /* enable interrupts */ ret = vega20_ih_toggle_interrupts(adev, true);
Re: [PATCH v3] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload
On 2024-03-08 14:00, Ahmad Rehman wrote: In a passthrough environment, when amdgpu is reloaded after unload, mode-1 is triggered after initializing the necessary IPs. That init does not include KFD, and KFD init waits until the reset is completed. KFD init is called in the reset handler, but in this case, the zone device and drm client are not initialized, causing the app to trigger a kernel panic. v2: Removing the init KFD condition from amdgpu_amdkfd_drm_client_create. As the previous version has the potential of creating DRM client twice. v3: v2 patch results in SDMA engine hang as DRM open causes VM clear to SDMA before SDMA init. Adding the condition in drm client creation, on top of v1, to guard against drm client creation being called multiple times. Signed-off-by: Ahmad Rehman Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 5 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f5f2945711be..4389d24f36e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -146,8 +146,8 @@ int amdgpu_amdkfd_drm_client_create(struct amdgpu_device *adev) { int ret; - if (!adev->kfd.init_complete) - return 0; + if (!adev->kfd.init_complete || adev->kfd.client.dev) + return 0; ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 15b188aaf681..80b9642f2bc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) } for (i = 0; i < mgpu_info.num_dgpu; i++) { adev = mgpu_info.gpu_ins[i].adev; - if (!adev->kfd.init_complete) + if (!adev->kfd.init_complete) { + kgd2kfd_init_zone_device(adev); 
amdgpu_amdkfd_device_init(adev); + amdgpu_amdkfd_drm_client_create(adev); + } amdgpu_ttm_set_buffer_funcs_status(adev, true); } }
Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore
On 2024-03-11 12:33, Christian König wrote: Am 11.03.24 um 16:33 schrieb Felix Kuehling: On 2024-03-11 11:25, Joshi, Mukul wrote: [AMD Official Use Only - General] -Original Message- From: Christian König Sent: Monday, March 11, 2024 2:50 AM To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix Subject: Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Am 08.03.24 um 17:22 schrieb Mukul Joshi: In certain situations, some apps can import a BO multiple times (through IPC for example). To restore such processes successfully, we need to tell drm to ignore duplicate BOs. While at it, also add additional logging to prevent silent failures when process restore fails. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index bf8e6653341f..65d808d8b5da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2869,14 +2869,16 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * mutex_lock(_info->lock); - drm_exec_init(, 0); + drm_exec_init(, DRM_EXEC_IGNORE_DUPLICATES); drm_exec_until_all_locked() { list_for_each_entry(peer_vm, _info->vm_list_head, vm_list_node) { ret = amdgpu_vm_lock_pd(peer_vm, , 2); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("Locking VM PD failed, ret: + %d\n", ret); goto ttm_reserve_fail; + } That's a bad idea. Locking can always be interrupted and that would print an error here. Thanks Christian. Will send out a patch to change it to pr_debug. We cannot get interrupted here because we're in a worker thread. We should be running in non-interruptible mode. Ah! 
Ok in that case this isn't necessary. But in general I think we should avoid error printing like that. If we want to know where something failed there is a function tracker for that. In this case, it was hard to know that something failed at all. The problem manifested as a soft-hang in an application, and it took several teams several days to track it down to an eviction/restore problem in kernel mode. A failure to reserve BOs seems like the type of problem that is not expected here, and would justify an error or warning message in the kernel log. That would have helped track down this issue much faster. Regards, Felix Regards, Christian. Regards, Felix Regards, Mukul Regards, Christian. } /* Reserve all BOs and page tables/directory. Add all BOs from @@ -2889,8 +2891,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * gobj = >bo->tbo.base; ret = drm_exec_prepare_obj(, gobj, 1); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("drm_exec_prepare_obj failed, + ret: %d\n", ret); goto ttm_reserve_fail; + } } } @@ -2950,8 +2954,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * * validations above would invalidate DMABuf imports again. */ ret = process_validate_vms(process_info, ); - if (ret) + if (ret) { + pr_err("Validating VMs failed, ret: %d\n", ret); goto validate_map_fail; + } /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, _info->vm_list_head,
Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore
On 2024-03-11 11:25, Joshi, Mukul wrote: [AMD Official Use Only - General] -Original Message- From: Christian König Sent: Monday, March 11, 2024 2:50 AM To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix Subject: Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Am 08.03.24 um 17:22 schrieb Mukul Joshi: In certain situations, some apps can import a BO multiple times (through IPC for example). To restore such processes successfully, we need to tell drm to ignore duplicate BOs. While at it, also add additional logging to prevent silent failures when process restore fails. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index bf8e6653341f..65d808d8b5da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2869,14 +2869,16 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * mutex_lock(_info->lock); - drm_exec_init(, 0); + drm_exec_init(, DRM_EXEC_IGNORE_DUPLICATES); drm_exec_until_all_locked() { list_for_each_entry(peer_vm, _info->vm_list_head, vm_list_node) { ret = amdgpu_vm_lock_pd(peer_vm, , 2); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("Locking VM PD failed, ret: + %d\n", ret); goto ttm_reserve_fail; + } That's a bad idea. Locking can always be interrupted and that would print an error here. Thanks Christian. Will send out a patch to change it to pr_debug. We cannot get interrupted here because we're in a worker thread. We should be running in non-interruptible mode. Regards, Felix Regards, Mukul Regards, Christian. 
} /* Reserve all BOs and page tables/directory. Add all BOs from @@ -2889,8 +2891,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * gobj = &mem->bo->tbo.base; ret = drm_exec_prepare_obj(&exec, gobj, 1); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("drm_exec_prepare_obj failed, + ret: %d\n", ret); goto ttm_reserve_fail; + } } } @@ -2950,8 +2954,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * * validations above would invalidate DMABuf imports again. */ ret = process_validate_vms(process_info, &exec.ticket); - if (ret) + if (ret) { + pr_err("Validating VMs failed, ret: %d\n", ret); goto validate_map_fail; + } /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, &process_info->vm_list_head,
Re: [PATCH] drm/amdgpu: Handle duplicate BOs during process restore
On 2024-03-08 11:22, Mukul Joshi wrote: In certain situations, some apps can import a BO multiple times (through IPC for example). To restore such processes successfully, we need to tell drm to ignore duplicate BOs. While at it, also add additional logging to prevent silent failures when process restore fails. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index bf8e6653341f..65d808d8b5da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2869,14 +2869,16 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * mutex_lock(_info->lock); - drm_exec_init(, 0); + drm_exec_init(, DRM_EXEC_IGNORE_DUPLICATES); drm_exec_until_all_locked() { list_for_each_entry(peer_vm, _info->vm_list_head, vm_list_node) { ret = amdgpu_vm_lock_pd(peer_vm, , 2); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("Locking VM PD failed, ret: %d\n", ret); pr_err makes sense here as it indicates a persistent problem that would cause soft hangs, like in this case. goto ttm_reserve_fail; + } } /* Reserve all BOs and page tables/directory. Add all BOs from @@ -2889,8 +2891,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * gobj = >bo->tbo.base; ret = drm_exec_prepare_obj(, gobj, 1); drm_exec_retry_on_contention(); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("drm_exec_prepare_obj failed, ret: %d\n", ret); Same here, pr_err is fine. goto ttm_reserve_fail; + } } } @@ -2950,8 +2954,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * * validations above would invalidate DMABuf imports again. 
*/ ret = process_validate_vms(process_info, &exec.ticket); - if (ret) + if (ret) { + pr_err("Validating VMs failed, ret: %d\n", ret); I'd make this a pr_debug to avoid spamming the log. validation can fail intermittently and rescheduling the worker is there to handle it. With that fixed, the patch is Reviewed-by: Felix Kuehling goto validate_map_fail; + } /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, &process_info->vm_list_head,
Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence
On 2024-03-07 1:39, Sharma, Shashank wrote: On 07/03/2024 00:54, Felix Kuehling wrote: On 2024-03-06 09:41, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) V4: - add a wait for (f->dependency) in tlb_fence_work (Christian) - move the misplaced fence_create call to the end (Philip) V5: - free the f->dependency properly (Christian) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Reviewed-by: Shashank Sharma Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 112 ++ 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index fa26a4e3a99d..91ab4cf29b5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o 
amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..310aae6fb49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, r = vm->update_funcs->commit(, fence); + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); This schedules a TLB flush after "fence" signals and replaces "fence" with a new one that will signal after the TLB flush is done. That part I understand. I'm not sure why this only applies to compute contexts. + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); But what's the point of adding the fence to the page table reservation? This is after the BOs have already been freed. Maybe it would make more sense to move this into the next patch, where the freeing is done after this point. To make it easier for code review, the split of the patches is like: - one patch introduces function creating tlb_flush_fence and uses it - the second patch does the rework and movement of freeing of the buffer after the patch attach. If we move this change into next patch, in this patch we will just create the fence, where one can argue why create the fence if no one is using it. May be, we can make 'changes in freeing of buffers' as first patch in sequence, and make this second patch in the series, so that you know the background of changes better. Sure. I don't think it's super important. I was just trying to understand how the two patches fit together. I think it makes sense now. I discussed this also with Philip offline. 
We think there may be an easier way to solve the "wait for TLB flush before freeing BOs" thing, but I believe using the new TLB flush fence is architecturally cleaner, and that fence will be useful to solve some other issues that are either still lingering, or currently have only some ugly workarounds. I'll need to dig through the code and my memory to remember the details. I'm still not sure whether the creation of the TLB flush fence should be limited to compute contexts, but I'm happy to get them at least there for now. The series is Acked-by: Felix Kuehling Regards, Felix - Shashank Regards, Felix + } + error_unlock: amdgpu_vm_eviction_unlock(vm); drm_dev_exit(idx); @@ -2237,6 +2246
Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence
On 2024-03-06 09:41, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) V4: - add a wait for (f->dependency) in tlb_fence_work (Christian) - move the misplaced fence_create call to the end (Philip) V5: - free the f->dependency properly (Christian) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Reviewed-by: Shashank Sharma Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 112 ++ 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index fa26a4e3a99d..91ab4cf29b5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o 
amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..310aae6fb49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, r = vm->update_funcs->commit(, fence); + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); This schedules a TLB flush after "fence" signals and replaces "fence" with a new one that will signal after the TLB flush is done. That part I understand. I'm not sure why this only applies to compute contexts. + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); But what's the point of adding the fence to the page table reservation? This is after the BOs have already been freed. Maybe it would make more sense to move this into the next patch, where the freeing is done after this point. 
Regards, Felix + } + error_unlock: amdgpu_vm_eviction_unlock(vm); drm_dev_exit(idx); @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, mutex_init(>eviction_lock); vm->evicting = false; + vm->tlb_fence_context = dma_fence_context_alloc(1); r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, false, , xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 64b3f69efa57..298f604b8e5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -341,6 +341,7 @@ struct amdgpu_vm { atomic64_t tlb_seq; uint64_ttlb_seq_va; uint64_t*tlb_seq_cpu_addr; + uint64_ttlb_fence_context; atomic64_t kfd_last_flushed_seq; @@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, uint64_t addr, uint32_t status, unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, +struct amdgpu_vm *vm, +struct dma_fence **fence); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index 0
Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode
On 2024-03-05 14:49, Dhume, Samir wrote: [AMD Official Use Only - General] -Original Message- From: Kuehling, Felix Sent: Monday, March 4, 2024 6:47 PM To: Dhume, Samir ; amd-gfx@lists.freedesktop.org Cc: Lazar, Lijo ; Wan, Gavin ; Liu, Leo ; Deucher, Alexander Subject: Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode On 2024-03-04 10:19, Samir Dhume wrote: Signed-off-by: Samir Dhume Please add a meaningful commit description to all the patches in the series. See one more comment below. Right! --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 34 +++- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index fec5a3d1c4bc..f666ececbe7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -82,17 +82,37 @@ static unsigned sdma_v4_4_2_seq_to_irq_id(int seq_num) } } -static int sdma_v4_4_2_irq_id_to_seq(unsigned client_id) +static int sdma_v4_4_2_irq_id_to_seq(struct amdgpu_device *adev, unsigned client_id) { + + struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; + bool sriov_cpx_odd = false; + int mode; + + if (amdgpu_sriov_vf(adev)) { + mode = xcp_mgr->funcs->query_partition_mode(xcp_mgr); This queries an MMIO register for the current mode. Is that really necessary to do in the interrupt handler? Could we use the partition mode stored in xcp_mgr->mode instead? The design appears to be that even when the host sets the mode to DPX/QPX/CPX, each guest sets itself to be in the SPX mode and xcp_mgr->mode is set to SPX. But I can use a new field in xcp_mgr to reflect the system mode set by the host and remove the MMIO access from the interrupt handler. Can you clarify what it means when the host and guest see a different partition mode? Is this the case, where the host partitions the device into several VFs, and the guest partitions those VFs further into smaller partitions? 
As far as I know, that finer partitioning in the guest is actually controlled by the host as well. If the guest sees SPX mode, it means it doesn't partition the VF into smaller pieces. Instead of looking at the partition mode, would it make more sense to just query the number of XCDs in the partition (from the xcc_mask)? That should give the right answer regardless of how the host partitioned the GPU. Regards, Felix Thanks, samir Regards, Felix + + if (mode == AMDGPU_CPX_PARTITION_MODE) { + if (adev->gfx.funcs->get_xcc_id(adev, 0) & 0x1) + sriov_cpx_odd = true; + } + } + switch (client_id) { case SOC15_IH_CLIENTID_SDMA0: return 0; case SOC15_IH_CLIENTID_SDMA1: return 1; case SOC15_IH_CLIENTID_SDMA2: - return 2; + if (sriov_cpx_odd) + return 0; + else + return 2; case SOC15_IH_CLIENTID_SDMA3: - return 3; + if (sriov_cpx_odd) + return 1; + else + return 3; default: return -EINVAL; } @@ -1541,7 +1561,7 @@ static int sdma_v4_4_2_process_trap_irq(struct amdgpu_device *adev, uint32_t instance, i; DRM_DEBUG("IH: SDMA trap\n"); - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); /* Client id gives the SDMA instance in AID. To know the exact SDMA * instance, interrupt entry gives the node id which corresponds to the AID instance. 
@@ -1584,7 +1604,7 @@ static int sdma_v4_4_2_process_ras_data_cb(struct amdgpu_device *adev, if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) goto out; - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0) goto out; @@ -1603,7 +1623,7 @@ static int sdma_v4_4_2_process_illegal_inst_irq(struct amdgpu_device *adev, DRM_ERROR("Illegal instruction in SDMA command stream\n"); - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0) return 0; @@ -1647,7 +1667,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev, struct amdgpu_task_info task_info; u64 addr; - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0 || instance >= adev->sdma.num_instances) { dev_err(adev->dev, "sdma instance invalid %d\n", instance); return -EINVAL;
Re: [PATCH] drm/amdkfd: make kfd_class constant
On 2024-03-05 7:15, Ricardo B. Marliere wrote: Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the kfd_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere The patch looks good to me. Do you want me to apply this to Alex's amd-staging-drm-next? Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 21 +++-- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f030cafc5a0a..dfa8c69532d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -63,8 +63,10 @@ static const struct file_operations kfd_fops = { }; static int kfd_char_dev_major = -1; -static struct class *kfd_class; struct device *kfd_device; +static const struct class kfd_class = { + .name = kfd_dev_name, +}; static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id) { @@ -94,14 +96,13 @@ int kfd_chardev_init(void) if (err < 0) goto err_register_chrdev; - kfd_class = class_create(kfd_dev_name); - err = PTR_ERR(kfd_class); - if (IS_ERR(kfd_class)) + err = class_register(_class); + if (err) goto err_class_create; - kfd_device = device_create(kfd_class, NULL, - MKDEV(kfd_char_dev_major, 0), - NULL, kfd_dev_name); + kfd_device = device_create(_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); err = PTR_ERR(kfd_device); if (IS_ERR(kfd_device)) goto err_device_create; @@ -109,7 +110,7 @@ int kfd_chardev_init(void) return 0; err_device_create: - class_destroy(kfd_class); + class_unregister(_class); err_class_create: unregister_chrdev(kfd_char_dev_major, kfd_dev_name); err_register_chrdev: @@ -118,8 +119,8 
@@ int kfd_chardev_init(void) void kfd_chardev_exit(void) { - device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); - class_destroy(kfd_class); + device_destroy(&kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_unregister(&kfd_class); unregister_chrdev(kfd_char_dev_major, kfd_dev_name); kfd_device = NULL; } --- base-commit: 8bc75586ea01f1c645063d3472c115ecab03e76c change-id: 20240305-class_cleanup-drm-amd-bdc7255b7540 Best regards,
Re: [PATCH] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload
On 2024-03-04 19:20, Rehman, Ahmad wrote: [AMD Official Use Only - General] Hey, Due to mode-1 reset (pending_reset), the amdgpu_amdkfd_device_init will not be called and hence adev->kfd.init_complete will not be set. The function amdgpu_amdkfd_drm_client_create has condition: if (!adev->kfd.init_complete) return 0; So, in probe function, when we return from device_init the KFD is not initialized and amdgpu_amdkfd_drm_client_create returns without doing anything. I think your change could result in calling amdgpu_amdkfd_drm_client_create multiple times. IIRC, one purpose of moving the call to amdgpu_pci_probe was to ensure that it is only called once, because it only gets unregistered once when the driver is unloaded. Maybe it would be better to remove the if (!adev->kfd.init_complete) condition from amdgpu_amdkfd_drm_client_create. That way we would always create the client at probe and it would be ready when it's needed after the GPU reset. There is a chance that the client would get created unnecessarily if KFD init never succeeds. But that should be rare, and it's not a big resource waste. There were some comments on a previous code review, that creating the DRM client too early could cause problems. But I don't understand what that problem could be. As I understand it, the adev->kfd.client is just a place to put GEM handles for KFD BOs that we don't want to expose to user mode. I see no harm in creating this client too early or when it's not needed. Regards, Felix Thanks, Ahmad *From:* Kuehling, Felix *Sent:* Monday, March 4, 2024 6:39 PM *To:* Rehman, Ahmad ; amd-gfx@lists.freedesktop.org *Cc:* Wan, Gavin *Subject:* Re: [PATCH] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload On 2024-03-04 17:05, Ahmad Rehman wrote: > In passthrough environment, when amdgpu is reloaded after unload, mode-1 > is triggered after initializing the necessary IPs, That init does not > include KFD, and KFD init waits until the reset is completed. 
KFD init > is called in the reset handler, but in this case, the zone device and > drm client is not initialized, causing app to create kernel panic. > > Signed-off-by: Ahmad Rehman > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 - > 1 file changed, 4 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 15b188aaf681..80b9642f2bc4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) > } > for (i = 0; i < mgpu_info.num_dgpu; i++) { > adev = mgpu_info.gpu_ins[i].adev; > - if (!adev->kfd.init_complete) > + if (!adev->kfd.init_complete) { > + kgd2kfd_init_zone_device(adev); > amdgpu_amdkfd_device_init(adev); > + amdgpu_amdkfd_drm_client_create(adev); I don't see what's preventing the DRM client initialization in the reset-on-driver-load case. It only needs to be created once and that happens in amdgpu_pci_probe. Am I missing anything? Regards, Felix > + } > amdgpu_ttm_set_buffer_funcs_status(adev, true); > } > }
Re: [PATCH 2/3] drm/amdgpu: sdma support for sriov cpx mode
On 2024-03-04 10:19, Samir Dhume wrote: Signed-off-by: Samir Dhume Please add a meaningful commit description to all the patches in the series. See one more comment below. --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 34 +++- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index fec5a3d1c4bc..f666ececbe7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -82,17 +82,37 @@ static unsigned sdma_v4_4_2_seq_to_irq_id(int seq_num) } } -static int sdma_v4_4_2_irq_id_to_seq(unsigned client_id) +static int sdma_v4_4_2_irq_id_to_seq(struct amdgpu_device *adev, unsigned client_id) { + + struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; + bool sriov_cpx_odd = false; + int mode; + + if (amdgpu_sriov_vf(adev)) { + mode = xcp_mgr->funcs->query_partition_mode(xcp_mgr); This queries an MMIO register for the current mode. Is that really necessary to do in the interrupt handler? Could we use the partition mode stored in xcp_mgr->mode instead? Regards, Felix + + if (mode == AMDGPU_CPX_PARTITION_MODE) { + if (adev->gfx.funcs->get_xcc_id(adev, 0) & 0x1) + sriov_cpx_odd = true; + } + } + switch (client_id) { case SOC15_IH_CLIENTID_SDMA0: return 0; case SOC15_IH_CLIENTID_SDMA1: return 1; case SOC15_IH_CLIENTID_SDMA2: - return 2; + if (sriov_cpx_odd) + return 0; + else + return 2; case SOC15_IH_CLIENTID_SDMA3: - return 3; + if (sriov_cpx_odd) + return 1; + else + return 3; default: return -EINVAL; } @@ -1541,7 +1561,7 @@ static int sdma_v4_4_2_process_trap_irq(struct amdgpu_device *adev, uint32_t instance, i; DRM_DEBUG("IH: SDMA trap\n"); - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); /* Client id gives the SDMA instance in AID. To know the exact SDMA * instance, interrupt entry gives the node id which corresponds to the AID instance. 
@@ -1584,7 +1604,7 @@ static int sdma_v4_4_2_process_ras_data_cb(struct amdgpu_device *adev, if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) goto out; - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0) goto out; @@ -1603,7 +1623,7 @@ static int sdma_v4_4_2_process_illegal_inst_irq(struct amdgpu_device *adev, DRM_ERROR("Illegal instruction in SDMA command stream\n"); - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0) return 0; @@ -1647,7 +1667,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev, struct amdgpu_task_info task_info; u64 addr; - instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); + instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id); if (instance < 0 || instance >= adev->sdma.num_instances) { dev_err(adev->dev, "sdma instance invalid %d\n", instance); return -EINVAL;
Re: [PATCH] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload
On 2024-03-04 17:05, Ahmad Rehman wrote: In passthrough environment, when amdgpu is reloaded after unload, mode-1 is triggered after initializing the necessary IPs. That init does not include KFD, and KFD init waits until the reset is completed. KFD init is called in the reset handler, but in this case, the zone device and drm client are not initialized, causing the application to trigger a kernel panic. Signed-off-by: Ahmad Rehman --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 15b188aaf681..80b9642f2bc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) } for (i = 0; i < mgpu_info.num_dgpu; i++) { adev = mgpu_info.gpu_ins[i].adev; - if (!adev->kfd.init_complete) + if (!adev->kfd.init_complete) { + kgd2kfd_init_zone_device(adev); amdgpu_amdkfd_device_init(adev); + amdgpu_amdkfd_drm_client_create(adev); I don't see what's preventing the DRM client initialization in the reset-on-driver-load case. It only needs to be created once and that happens in amdgpu_pci_probe. Am I missing anything? Regards, Felix + } amdgpu_ttm_set_buffer_funcs_status(adev, true); } }
Re: [PATCH V3] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven
On 2024-02-29 01:04, Jesse.Zhang wrote: fix the issue: "amdgpu: Failed to create process VM object". [Why]when amdgpu initialized, seq64 does mapping and update bo mapping in vm page table. But when clinfo runs, it also initializes a vm for a process device through the function kfd_process_device_init_vm and ensure the root PD is clean through the function amdgpu_vm_pt_is_root_clean. So they have a conflict, and clinfo always failed. v1: - remove all the pte_supports_ats stuff from the amdgpu_vm code (Felix) Signed-off-by: Jesse Zhang The headline should be updated. This is no longer a revert of the quoted patch. Other than that, this patch looks reasonable to me. One more comment inline. --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 23 -- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 3 -- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 56 +-- 3 files changed, 1 insertion(+), 81 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ed4a8c5d26d7..d004ace79536 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1385,10 +1385,6 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping, list); list_del(>list); - if (vm->pte_support_ats && - mapping->start < AMDGPU_GMC_HOLE_START) - init_pte_value = AMDGPU_PTE_DEFAULT_ATC; - r = amdgpu_vm_update_range(adev, vm, false, false, true, false, resv, mapping->start, mapping->last, init_pte_value, 0, 0, NULL, NULL, @@ -2264,7 +2260,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) return r; - vm->pte_support_ats = false; vm->is_compute_context = false; vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode & @@ -2350,30 +2345,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, */ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - bool pte_support_ats = (adev->asic_type == CHIP_RAVEN); int r; r = 
amdgpu_bo_reserve(vm->root.bo, true); if (r) return r; - /* Check if PD needs to be reinitialized and do it before -* changing any other state, in case it fails. -*/ - if (pte_support_ats != vm->pte_support_ats) { - /* Sanity checks */ - if (!amdgpu_vm_pt_is_root_clean(adev, vm)) { - r = -EINVAL; - goto unreserve_bo; - } - - vm->pte_support_ats = pte_support_ats; - r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo), - false); - if (r) - goto unreserve_bo; - } - /* Update VM state */ vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode & AMDGPU_VM_USE_CPU_FOR_COMPUTE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 42f6ddec50c1..9f6b5e1ccf34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -357,9 +357,6 @@ struct amdgpu_vm { /* Functions to use for VM table updates */ const struct amdgpu_vm_update_funcs *update_funcs; - /* Flag to indicate ATS support from PTE for GFX9 */ - boolpte_support_ats; - /* Up to 128 pending retry page faults */ DECLARE_KFIFO(faults, u64, 128); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a160265ddc07..2835cb3f76eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -89,22 +89,6 @@ static unsigned int amdgpu_vm_pt_num_entries(struct amdgpu_device *adev, return AMDGPU_VM_PTE_COUNT(adev); } -/** - * amdgpu_vm_pt_num_ats_entries - return the number of ATS entries in the root PD - * - * @adev: amdgpu_device pointer - * - * Returns: - * The number of entries in the root page directory which needs the ATS setting. 
- */ -static unsigned int amdgpu_vm_pt_num_ats_entries(struct amdgpu_device *adev) -{ - unsigned int shift; - - shift = amdgpu_vm_pt_level_shift(adev, adev->vm_manager.root_level); - return AMDGPU_GMC_HOLE_START >> (shift + AMDGPU_GPU_PAGE_SHIFT); -} - /** * amdgpu_vm_pt_entries_mask - the mask to get the entry number of a PD/PT * @@ -379,7 +363,7 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct ttm_operation_ctx ctx = { true, false }; struct amdgpu_vm_update_params params; struct amdgpu_bo *ancestor = >bo; - unsigned int entries,
Re: [PATCH v3] drm/amdgpu: change vm->task_info handling
On 2024-02-05 12:05, Shashank Sharma wrote: This patch changes the handling and lifecycle of vm->task_info object. The major changes are: - vm->task_info is a dynamically allocated ptr now, and its uasge is reference counted. - introducing two new helper funcs for task_info lifecycle management - amdgpu_vm_get_task_info: reference counts up task_info before returning this info - amdgpu_vm_put_task_info: reference counts down task_info - last put to task_info() frees task_info from the vm. This patch also does logistical changes required for existing usage of vm->task_info. V2: Do not block all the prints when task_info not found (Felix) V3: (Felix) - Fix wrong indentation - No debug message for -ENOMEM - Add NULL check for task_info - Do not duplicate the debug messages (ti vs no ti) - Get first reference of task_info in vm_init(), put last in vm_fini() Cc: Christian Koenig Cc: Alex Deucher Cc: Felix Kuehling Signed-off-by: Shashank Sharma One nit-pick and one bug inline. With those fixed, the patch Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 18 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 158 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 21 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 24 +-- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 23 +-- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 20 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 23 +-- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 23 +-- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c| 22 +-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 20 +-- 13 files changed, 251 insertions(+), 124 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 0e61ebdb3f3e..f9eb12697b95 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1775,9 
+1775,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused) list_for_each_entry(file, >filelist, lhead) { struct amdgpu_fpriv *fpriv = file->driver_priv; struct amdgpu_vm *vm = >vm; + struct amdgpu_task_info *ti; + + ti = amdgpu_vm_get_task_info_vm(vm); + if (ti) { + seq_printf(m, "pid:%d\tProcess:%s --\n", ti->pid, ti->process_name); + amdgpu_vm_put_task_info(ti); + } - seq_printf(m, "pid:%d\tProcess:%s --\n", - vm->task_info.pid, vm->task_info.process_name); r = amdgpu_bo_reserve(vm->root.bo, true); if (r) break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 1f357198533f..e6e6d56398f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) { struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_job *job = to_amdgpu_job(s_job); - struct amdgpu_task_info ti; + struct amdgpu_task_info *ti; struct amdgpu_device *adev = ring->adev; int idx; int r; @@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) return DRM_GPU_SCHED_STAT_ENODEV; } - memset(, 0, sizeof(struct amdgpu_task_info)); + adev->job_hang = true; if (amdgpu_gpu_recovery && @@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) goto exit; } - amdgpu_vm_get_task_info(ring->adev, job->pasid, ); DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", - job->base.sched->name, atomic_read(>fence_drv.last_seq), - ring->fence_drv.sync_seq); - DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n", - ti.process_name, ti.tgid, ti.task_name, ti.pid); + job->base.sched->name, atomic_read(>fence_drv.last_seq), + ring->fence_drv.sync_seq); + + ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); + if (ti) { + DRM_ERROR("Process information: process %s pid %d thread %s 
pid %d\n", + ti->process_name, ti->tgid, ti->task_name, ti->pid); + amdgpu_vm_put_task_info(ti); + } dma_fence_set_error(_job->s_fence->
Re: [PATCH] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven
On 2024-02-28 01:41, Christian König wrote: Am 28.02.24 um 06:04 schrieb Jesse.Zhang: fix the issue when run clinfo: "amdgpu: Failed to create process VM object". when amdgpu initialized, seq64 does mapping and update bo mapping in vm page table. But when clinfo runs, it also initializes a vm for a process device through the function kfd_process_device_init_vm and ensure the root PD is clean through the function amdgpu_vm_pt_is_root_clean. So they have a conflict, and clinfo always failed. Big NAK for this, you removed the check but didn't solve the problem in any way. When Raven still needs the ats feature then it is intentional that this fails. I agree. I think we should just remove all the pte_supports_ats stuff from the amdgpu_vm code. We no longer use IOMMUv2. So there is no point setting invalid PTEs to fail over to ATS any more. As far as I can see, this will require changes in amdgpu_vm_clear_freed, amdgpu_vm_init, amdgpu_vm_make_compute. Then you can remove amdgpu_vm.pte_support_ats from the struct and remove amdgpu_vm_pt_is_root_clean. Regards, Felix Regards, Christian. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ed4a8c5d26d7..0bc0bc75be15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2361,12 +2361,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) * changing any other state, in case it fails. */ if (pte_support_ats != vm->pte_support_ats) { - /* Sanity checks */ - if (!amdgpu_vm_pt_is_root_clean(adev, vm)) { - r = -EINVAL; - goto unreserve_bo; - } - vm->pte_support_ats = pte_support_ats; r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo), false);
Re: [PATCH] drm/amdkfd: Increase the size of the memory reserved for the TBA
On 2024-02-23 14:05, Laurent Morichetti wrote: In a future commit, the cwsr trap handler code size for gfx10.1 will increase to slightly above the one page mark. Since the TMA does not need to be page aligned, and only 2 pointers are stored in it, push the TMA offset by 2 KiB and keep the TBA+TMA reserved memory size to two pages. Signed-off-by: Laurent Morichetti Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 23 --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 4d399c0c8a57..041ec3de55e7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -466,34 +466,43 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) { if (cwsr_enable && kfd->device_info.supports_cwsr) { if (KFD_GC_VERSION(kfd) < IP_VERSION(9, 0, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx8_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_arcturus_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_arcturus_hex); } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 2)) { - BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_aldebaran_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_aldebaran_hex); } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_4_3_hex; kfd->cwsr_isa_size = 
sizeof(cwsr_trap_gfx9_4_3_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 1, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 3, 0)) { - BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_nv1x_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_nv1x_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(11, 0, 0)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) +> KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx10_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); } else { + /* The gfx11 cwsr trap handler must fit inside a single + page. */ BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx11_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 80320b8603fc..42d40560cd30 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -99,11 +99,11 @@ /* * Size of the per-process TBA+TMA buffer: 2 pages * - * The first page is the TBA used for the CWSR ISA code. The second - * page is used as TMA for user-mode trap handler setup in daisy-chain mode. + * The first chunk is the TBA used for the CWSR ISA code. The second + * chunk is used as TMA for user-mode trap handler setup in daisy-chain mode. */
Re: [PATCH] drm/amdkfd: fix process reference drop on debug ioctl
On 2024-02-21 05:54, Jonathan Kim wrote: Prevent dropping the KFD process reference at the end of a debug IOCTL call where the acquired process value is an error. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 80e90fdef291..824e660283b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2935,6 +2935,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v if (IS_ERR_OR_NULL(target)) { pr_debug("Cannot find process PID %i to debug\n", args->pid); r = target ? PTR_ERR(target) : -ESRCH; + target = NULL; goto out; }
Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro
On 2024-02-15 10:18, Philip Yang wrote: Document how to use SMI system management interface to receive SVM events. Define SVM events message string format macro that could use by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++--- include/uapi/linux/kfd_ioctl.h | 77 - 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d9953c2b2661..85465eb303a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + kfd_smi_event_add(0, dev, event, + KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", - throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, + KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask, + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (!task_info.pid) return; - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info.pid, task_info.task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, + KFD_EVENT_FMT_VMFAULT(task_info.pid, task_info.task_name)); } void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, @@ -255,16 +256,16 @@ void 
kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", - ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(), + pid, start, end - start, from, to, prefetch_loc, + preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, @@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, uint32_t from, uint32_t to, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", - ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); + KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid, + start, end - start, from, to, trigger)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, - "%lld -%d
[PATCH v3] drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole
The TBA and TMA, along with an unused IB allocation, reside at low addresses in the VM address space. A stray VM fault which hits these pages must be serviced by making their page table entries invalid. The scheduler depends upon these pages being resident and fails, preventing a debugger from inspecting the failure state. By relocating these pages above 47 bits in the VM address space they can only be reached when bits [63:48] are set to 1. This makes it much less likely for a misbehaving program to generate accesses to them. The current placement at VA (PAGE_SIZE*2) is readily hit by a NULL access with a small offset. v2: - Move it to the reserved space to avoid conflicts with Mesa - Add macros to make reserved space management easier v3: - Move VM max PFN calculation into AMDGPU_VA_RESERVED macros Cc: Arunpravin Paneer Selvam Cc: Christian Koenig Signed-off-by: Jay Cornwall Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c| 6 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 11 +++- drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 29 ++-- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c index 823d31f4a2a3..b0fb14a4b43c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c @@ -28,9 +28,8 @@ uint64_t amdgpu_csa_vaddr(struct amdgpu_device *adev) { - uint64_t addr = adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT; + uint64_t addr = AMDGPU_VA_RESERVED_CSA_START(adev); - addr -= AMDGPU_VA_RESERVED_CSA_SIZE; addr = amdgpu_gmc_sign_extend(addr); return addr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index 3d0d56087d41..4b9afc4df031 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -45,11 +45,7 @@ */ static inline u64 amdgpu_seq64_get_va_base(struct 
amdgpu_device *adev) { - u64 addr = adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT; - - addr -= AMDGPU_VA_RESERVED_TOP; - - return addr; + return AMDGPU_VA_RESERVED_SEQ64_START(adev); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 2c4053b29bb3..42f6ddec50c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -137,9 +137,18 @@ struct amdgpu_mem_stats; /* Reserve space at top/bottom of address space for kernel use */ #define AMDGPU_VA_RESERVED_CSA_SIZE(2ULL << 20) +#define AMDGPU_VA_RESERVED_CSA_START(adev) (((adev)->vm_manager.max_pfn \ + << AMDGPU_GPU_PAGE_SHIFT) \ +- AMDGPU_VA_RESERVED_CSA_SIZE) #define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_SEQ64_START(adev) (AMDGPU_VA_RESERVED_CSA_START(adev) \ +- AMDGPU_VA_RESERVED_SEQ64_SIZE) +#define AMDGPU_VA_RESERVED_TRAP_SIZE (2ULL << 12) +#define AMDGPU_VA_RESERVED_TRAP_START(adev) (AMDGPU_VA_RESERVED_SEQ64_START(adev) \ +- AMDGPU_VA_RESERVED_TRAP_SIZE) #define AMDGPU_VA_RESERVED_BOTTOM (1ULL << 16) -#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_SEQ64_SIZE + \ +#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_TRAP_SIZE + \ +AMDGPU_VA_RESERVED_SEQ64_SIZE + \ AMDGPU_VA_RESERVED_CSA_SIZE) /* See vm_update_mode */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 6604a3f99c5e..4a64307bc438 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -36,6 +36,7 @@ #include #include #include +#include "amdgpu_vm.h" /* * The primary memory I/O features being added for revisions of gfxip @@ -326,10 +327,16 @@ static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) * with small reserved space for kernel. * Set them to CANONICAL addresses. 
*/ - pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_base = max(SVM_USER_BASE, AMDGPU_VA_RESERVED_BOTTOM); pdd->gpuvm_limit = pdd->dev->kfd->shared_resources.gpuvm_size - 1; + /* dGPUs: the reserved space for kernel +* before SVM +*/ + pdd->qpd.cwsr_base = SVM_CWSR_BASE; + pdd->qpd.ib_base = SVM_IB_BASE; + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); pdd->scra
Re: [Patch v2 1/2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-13 16:39, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarios. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj Reviewed-by: Felix Kuehling --- * Change the enum bitfield to 4 to avoid ORing condition of previous member flags. * Incorporate review feedback from Felix from https://www.mail-archive.com/amd-gfx@lists.freedesktop.org/msg102840.html and split one of the suggested gfx11 changes as a separate patch. drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 9 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..697b6d530d12 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,15 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) { + if (minfo->update_flag & UPDATE_FLAG_IS_GWS) + m->compute_resource_limits |= + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; + else + m->compute_resource_limits &= + ~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; + } + q->is_active = QUEUE_IS_ACTIVE(*q); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 677281c0793e..80320b8603fc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -532,6 +532,7 @@ struct queue_properties { enum mqd_update_flag { UPDATE_FLAG_DBG_WA_ENABLE = 1, UPDATE_FLAG_DBG_WA_DISABLE = 2, + UPDATE_FLAG_IS_GWS = 4, /* quirk for gfx9 IP */ }; struct mqd_update_info { diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 43eff221eae5..4858112f9a53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws) { + struct mqd_update_info minfo = {0}; struct kfd_node *dev = NULL; struct process_queue_node *pqn; struct kfd_process_device *pdd; @@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, } pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0; + minfo.update_flag = gws ? UPDATE_FLAG_IS_GWS : 0; return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q, NULL); + pqn->q, ); } void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
Re: [PATCH 2/2] drm/amdgpu: Fix implicit assumtion in gfx11 debug flags
On 2024-02-09 20:49, Rajneesh Bhardwaj wrote: Gfx11 debug flags mask is currently set with an implicit assumption that no other mqd update flags exist. This needs to be fixed with newly introduced flag UPDATE_FLAG_IS_GWS by the previous patch. Signed-off-by: Rajneesh Bhardwaj Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index d722cbd31783..826bc4f6c8a7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -55,8 +55,8 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); if (has_wa_flag) { - uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ? - 0x : 0x; + uint32_t wa_mask = + (minfo->update_flag & UPDATE_FLAG_DBG_WA_ENABLE) ? 0x : 0x; m->compute_static_thread_mgmt_se0 = wa_mask; m->compute_static_thread_mgmt_se1 = wa_mask;
Re: [PATCH 1/2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-09 20:49, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarios. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj --- * Incorporate review feedback from Felix from https://www.mail-archive.com/amd-gfx@lists.freedesktop.org/msg102840.html and split one of the suggested gfx11 changes as a separate patch. drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 9 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..697b6d530d12 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,15 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) { + if (minfo->update_flag & UPDATE_FLAG_IS_GWS) + m->compute_resource_limits |= + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; + else + m->compute_resource_limits &= + ~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; + } + q->is_active = QUEUE_IS_ACTIVE(*q); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 677281c0793e..65b504813576 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -532,6 +532,7 @@ struct queue_properties { enum mqd_update_flag { UPDATE_FLAG_DBG_WA_ENABLE = 1, UPDATE_FLAG_DBG_WA_DISABLE = 2, + UPDATE_FLAG_IS_GWS = 3, /* quirk for gfx9 IP */ This flag needs to be a separate bit. So it should be defined as 4. Otherwise it looks just like UPDATE_FLAG_DBG_WA_ENABLE | UPDATE_FLAG_DBG_WA_DISABLE. 
I agree that defining bit-masks in an enum is not ideal, but I've seen the same in other places. Regards, Felix }; struct mqd_update_info { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 43eff221eae5..4858112f9a53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws) { + struct mqd_update_info minfo = {0}; struct kfd_node *dev = NULL; struct process_queue_node *pqn; struct kfd_process_device *pdd; @@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, } pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0; + minfo.update_flag = gws ? UPDATE_FLAG_IS_GWS : 0; return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q, NULL); + pqn->q, ); } void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
Re: [Patch v2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-08 15:01, Bhardwaj, Rajneesh wrote: On 2/8/2024 2:41 PM, Felix Kuehling wrote: On 2024-02-07 23:14, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarions. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj --- * Found a bug in the previous reviewed version https://lists.freedesktop.org/archives/amd-gfx/2024-February/104101.html since the q->is_gws is unset for keeping the count. * updated pqm_set_gws to pass minfo holding gws state for the active queues and use that to apply the FORCE_SIMD_DIST_MASK. drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..0b71db4c96b5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) + m->compute_resource_limits = minfo->gws ? + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0; + This looks OK because we don't set anything else in m->compute_resource_limits. If that ever changes, we have to be more careful here to not wipe out other fields in that register. Yes, Should I change it to below and send a v3? m->compute_resource_limits |= minfo->gws ? 
COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0; I think you need to do if (minfo->gws) m->compute_resource_limits |= COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; else m->compute_resource_limits &= ~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK; That way you can clear the resource limit when GWS is disable for the queue. q->is_active = QUEUE_IS_ACTIVE(*q); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 677281c0793e..f4b327a2d4a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -542,6 +542,7 @@ struct mqd_update_info { } cu_mask; }; enum mqd_update_flag update_flag; + bool gws; Instead of adding a new bool, can we add a flag to mqd_update_flag? Maybe, I initially thought about it but then I chose the bool approach since those debug flags are generic KFD non per-Asic flags while this bool is per-Asic request so I felt they didn't fit together. On the other hand, those flags and this bool are both quirks anyways so maybe they can be together. Please let me know your preference. I'd prefer to used the flags. They are currently used for a GFX11 quirk, now we can add another flag for a GFX9 quirk. The GFX11 code currently has an implicit assumption that no other flags exist. That would need to be fixed: if (has_wa_flag) { - uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ? + uint32_t wa_mask = (minfo->update_flag & UPDATE_FLAG_DBG_WA_ENABLE) ? 0x : 0x; Regards, Felix Looks good to me otherwise. 
Regards, Felix }; /** diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 43eff221eae5..5416a110ced9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws) { + struct mqd_update_info minfo = {0}; struct kfd_node *dev = NULL; struct process_queue_node *pqn; struct kfd_process_device *pdd; @@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, } pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0; + minfo.gws = !!gws; return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q, NULL); + pqn->q, ); } void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
Re: [Patch v2] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-07 23:14, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarions. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj --- * Found a bug in the previous reviewed version https://lists.freedesktop.org/archives/amd-gfx/2024-February/104101.html since the q->is_gws is unset for keeping the count. * updated pqm_set_gws to pass minfo holding gws state for the active queues and use that to apply the FORCE_SIMD_DIST_MASK. drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c| 4 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..0b71db4c96b5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) + m->compute_resource_limits = minfo->gws ? + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0; + This looks OK because we don't set anything else in m->compute_resource_limits. If that ever changes, we have to be more careful here to not wipe out other fields in that register. q->is_active = QUEUE_IS_ACTIVE(*q); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 677281c0793e..f4b327a2d4a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -542,6 +542,7 @@ struct mqd_update_info { } cu_mask; }; enum mqd_update_flag update_flag; + bool gws; Instead of adding a new bool, can we add a flag to mqd_update_flag? 
Looks good to me otherwise. Regards, Felix }; /** diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 43eff221eae5..5416a110ced9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -95,6 +95,7 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws) { + struct mqd_update_info minfo = {0}; struct kfd_node *dev = NULL; struct process_queue_node *pqn; struct kfd_process_device *pdd; @@ -146,9 +147,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, } pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0; + minfo.gws = !!gws; return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q, NULL); + pqn->q, ); } void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
Re: [PATCH v2] drm/amdkfd: Initialize kfd_gpu_cache_info for KFD topology
On 2024-02-07 0:32, Joseph Greathouse wrote: The current kfd_gpu_cache_info structure is only partially filled in for some architectures. This means that for devices where we do not fill in some fields, we can return uninitialized values through the KFD topology. Zero out the kfd_gpu_cache_info before asking the remaining fields to be filled in by lower-level functions. Fixes: 04756ac9a24c ("drm/amdkfd: Add cache line sizes to KFD topology") Signed-off-by: Joseph Greathouse Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 3df2a8ad86fb..5cb0465493b8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1707,6 +1707,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct gpu_processor_id = dev->node_props.simd_id_base; + memset(cache_info, 0, sizeof(cache_info)); pcache_info = cache_info; num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info); if (!num_of_cache_types) {
Re: [PATCH] drm/amdkfd: Don't divide L2 cache by partition mode
On 2024-02-06 16:24, Kent Russell wrote: Partition mode only affects L3 cache size. After removing the L2 check in the previous patch, make sure we aren't dividing all cache sizes by partition mode, just L3. Fixes: a75bfb3c4045 ("drm/amdkfd: Fix L2 cache size reporting in GFX9.4.3") The fixes tag looks wrong. I can't find the commit a75bfb3c4045 anywhere. Did your previous patch actually make it into the branch yet? Maybe you can still abandon it in Gerrit. Regards, Felix Signed-off-by: Kent Russell --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 64bf2a56f010..533b8292b136 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1640,10 +1640,10 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, else mode = UNKNOWN_MEMORY_PARTITION_MODE; - if (mode) - pcache->cache_size = pcache_info[cache_type].cache_size / mode; - else - pcache->cache_size = pcache_info[cache_type].cache_size; + pcache->cache_size = pcache_info[cache_type].cache_size; + /* Partition mode only affects L3 cache size */ + if (mode && pcache->cache_level == 3) + pcache->cache_size /= mode; if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE) pcache->cache_type |= HSA_CACHE_TYPE_DATA;
Re: [PATCH] drm/amdkfd: Initialize kfd_gpu_cache_info for KFD topology
On 2024-02-06 15:55, Joseph Greathouse wrote: The current kfd_gpu_cache_info structure is only partially filled in for some architectures. This means that for devices where we do not fill in some fields, we can return uninitialized values through the KFD topology. Zero out the kfd_gpu_cache_info before asking the remaining fields to be filled in by lower-level functions. Signed-off-by: Joseph Greathouse This fixes your previous patch "drm/amdkfd: Add cache line sizes to KFD topology". Alex, I think the previous patch hasn't gone upstream yet. Do you want a Fixes: tag or is it possible to squash this with Joe's previous patch before upstreaming? One nit-pick below. --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 3df2a8ad86fb..67c1e7f84750 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1707,6 +1707,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct gpu_processor_id = dev->node_props.simd_id_base; + memset(cache_info, 0, sizeof(struct kfd_gpu_cache_info) * KFD_MAX_CACHE_TYPES); Just use sizeof(cache_info). No need to calculate the size of the array and risk getting it wrong. Regards, Felix pcache_info = cache_info; num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info); if (!num_of_cache_types) {
Re: [PATCH 1/2] drm/amdgpu: Unmap only clear the page table leaves
On 2024-02-01 11:50, Philip Yang wrote: SVM migration unmap pages from GPU and then update mapping to GPU to recover page fault. Currently unmap clears the PDE entry for range length >= huge page and free PTB bo, update mapping to alloc new PT bo. There is race bug that the freed entry bo maybe still on the pt_free list, reused when updating mapping and then freed, leave invalid PDE entry and cause GPU page fault. By setting the update to clear only one PDE entry or clear PTB, to avoid unmap to free PTE bo. This fixes the race bug and improve the unmap and map to GPU performance. Update mapping to huge page will still free the PTB bo. With this change, the vm->pt_freed list and work is not needed. Add WARN_ON(unlocked) in amdgpu_vm_pt_free_dfs to catch if unmap to free the PTB. Signed-off-by: Philip Yang As we discussed offline, I think this is the wrong approach. This can lead to resource leaks when lots of virtual address space is released bug the page tables remain allocated indefinitely. I think we need some solution that either * prevents reuse of page tables that are about to be free * prevents reused page tables from being freed by the worker (e.g. 
a ref count or cancelling the work) Regards, Felix --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 4 --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 ++- 3 files changed, 10 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 82e5fd66a10d..3bde77dfc63f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2256,8 +2256,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, spin_lock_init(>status_lock); INIT_LIST_HEAD(>freed); INIT_LIST_HEAD(>done); - INIT_LIST_HEAD(>pt_freed); - INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work); INIT_KFIFO(vm->faults); r = amdgpu_vm_init_entities(adev, vm); @@ -2446,8 +2444,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm); - flush_work(>pt_free_work); - root = amdgpu_bo_ref(vm->root.bo); amdgpu_bo_reserve(root, true); amdgpu_vm_set_pasid(adev, vm, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index cdb61f1e7c35..74fe211b9ecd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -316,10 +316,6 @@ struct amdgpu_vm { /* BOs which are invalidated, has been updated in the PTs */ struct list_headdone; - /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */ - struct list_headpt_freed; - struct work_struct pt_free_work; - /* contains the page directory */ struct amdgpu_vm_bo_base root; struct dma_fence*last_update; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a160265ddc07..a3d609655ce3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -657,27 +657,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) amdgpu_bo_unref(>bo); } -void 
amdgpu_vm_pt_free_work(struct work_struct *work) -{ - struct amdgpu_vm_bo_base *entry, *next; - struct amdgpu_vm *vm; - LIST_HEAD(pt_freed); - - vm = container_of(work, struct amdgpu_vm, pt_free_work); - - spin_lock(>status_lock); - list_splice_init(>pt_freed, _freed); - spin_unlock(>status_lock); - - /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */ - amdgpu_bo_reserve(vm->root.bo, true); - - list_for_each_entry_safe(entry, next, _freed, vm_status) - amdgpu_vm_pt_free(entry); - - amdgpu_bo_unreserve(vm->root.bo); -} - /** * amdgpu_vm_pt_free_dfs - free PD/PT levels * @@ -696,17 +675,7 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, struct amdgpu_vm_pt_cursor cursor; struct amdgpu_vm_bo_base *entry; - if (unlocked) { - spin_lock(>status_lock); - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) - list_move(>vm_status, >pt_freed); - - if (start) - list_move(>entry->vm_status, >pt_freed); - spin_unlock(>status_lock); - schedule_work(>pt_free_work); - return; - } + WARN_ON(unlocked); for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) amdgpu_vm_pt_free(entry); @@ -1009,7 +978,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift; mask =
Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-02 17:24, Greathouse, Joseph wrote: [AMD Official Use Only - General] -Original Message- From: Kuehling, Felix Sent: Friday, February 2, 2024 10:21 AM To: Bhardwaj, Rajneesh ; amd-gfx@lists.freedesktop.org Cc: Greathouse, Joseph Subject: Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards On 2024-02-01 13:54, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarions. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..4b28e7dcb62f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) Are we sure this is only needed on GFX 9.4.2 and later GPUs? Does it affect older GFX 9.x GPUs as well? What about GFX 10 and 11? It seems the register bit exists for all those GPUs? On gfx9 devices, it is only necessary for GFX 9.4.2 and beyond. This was a side effect of the move from 10 wave-slots per SIMD to 8 wave-slots per SIMD. Checking with the hardware group (and running some basic tests against the problem we saw on gfx9 parts), this should not be necessary for gfx10 parts, either those with 20 wave-slots per SIMD or 16. Thanks for checking. The patch ls Reviewed-by: Felix Kuehling Thanks, -Joe Regards, Felix + m->compute_resource_limits = q->is_gws ? + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0; + q->is_active = QUEUE_IS_ACTIVE(*q); }
Re: [PATCH] drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
On 2024-02-01 13:54, Rajneesh Bhardwaj wrote: In certain cooperative group dispatch scenarios the default SPI resource allocation may cause reduced per-CU workgroup occupancy. Set COMPUTE_RESOURCE_LIMITS.FORCE_SIMD_DIST=1 to mitigate soft hang scenarios. Suggested-by: Joseph Greathouse Signed-off-by: Rajneesh Bhardwaj --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 42d881809dc7..4b28e7dcb62f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -303,6 +303,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); + if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) Are we sure this is only needed on GFX 9.4.2 and later GPUs? Does it affect older GFX 9.x GPUs as well? What about GFX 10 and 11? It seems the register bit exists for all those GPUs? Regards, Felix + m->compute_resource_limits = q->is_gws ? + COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK : 0; + q->is_active = QUEUE_IS_ACTIVE(*q); }
Re: [PATCH v3] drm/amdkfd: reserve the BO before validating it
On 2024-01-30 04:45, Lang Yu wrote: Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB flush") v2: Avoid unmapping attachment twice when ERESTARTSYS. v3: Lock the BO before accessing ttm->sg to avoid race conditions.(Felix) [ 41.708711] WARNING: CPU: 0 PID: 1463 at drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm] [ 41.708989] Call Trace: [ 41.708992] [ 41.708996] ? show_regs+0x6c/0x80 [ 41.709000] ? ttm_bo_validate+0x146/0x1b0 [ttm] [ 41.709008] ? __warn+0x93/0x190 [ 41.709014] ? ttm_bo_validate+0x146/0x1b0 [ttm] [ 41.709024] ? report_bug+0x1f9/0x210 [ 41.709035] ? handle_bug+0x46/0x80 [ 41.709041] ? exc_invalid_op+0x1d/0x80 [ 41.709048] ? asm_exc_invalid_op+0x1f/0x30 [ 41.709057] ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu] [ 41.709185] ? ttm_bo_validate+0x146/0x1b0 [ttm] [ 41.709197] ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu] [ 41.709337] ? srso_alias_return_thunk+0x5/0x7f [ 41.709346] kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu] [ 41.709467] amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80 [amdgpu] [ 41.709586] kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu] [ 41.709710] kfd_ioctl+0x1ec/0x650 [amdgpu] [ 41.709822] ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10 [amdgpu] [ 41.709945] ? srso_alias_return_thunk+0x5/0x7f [ 41.709949] ? 
tomoyo_file_ioctl+0x20/0x30 [ 41.709959] __x64_sys_ioctl+0x9c/0xd0 [ 41.709967] do_syscall_64+0x3f/0x90 [ 41.709973] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Signed-off-by: Lang Yu Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 +++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 298fc52a35bc..e60f63ccf79a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -313,7 +313,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv); int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv); -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv); +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv); int amdgpu_amdkfd_gpuvm_sync_memory( struct amdgpu_device *adev, struct kgd_mem *mem, bool intr); int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 6f3a4cb2a9ef..ef71b12062a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2088,21 +2088,35 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( return ret; } -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv) +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv) { struct kfd_mem_attachment *entry; struct amdgpu_vm *vm; + int ret; vm = drm_priv_to_vm(drm_priv); mutex_lock(>lock); + ret = amdgpu_bo_reserve(mem->bo, true); + if (ret) + goto out; + list_for_each_entry(entry, >attachments, list) { - if (entry->bo_va->base.vm == vm) - 
kfd_mem_dmaunmap_attachment(mem, entry); + if (entry->bo_va->base.vm != vm) + continue; + if (entry->bo_va->base.bo->tbo.ttm && + !entry->bo_va->base.bo->tbo.ttm->sg) + continue; + + kfd_mem_dmaunmap_attachment(mem, entry); } + amdgpu_bo_unreserve(mem->bo); +out: mutex_unlock(>lock); + + return ret; } int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ce4c52ec34d8..80e90fdef291 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1442,7 +1442,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */ - amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); + err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); + if (err) + goto sync_memory_failed; } mutex_unlock(>mutex);