RE: [PATCH 1/2] drm/amdkfd: fix trap handling work around for debugging

2023-07-18 Thread Ji, Ruili
[Public]

Hi Jonathan,

Your change looks fine.
Acknowledged-by: Ji, Ruili 

Thanks,
Ruili
-Original Message-
From: Kim, Jonathan 
Sent: Wednesday, July 19, 2023 6:13 AM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Ji, Ruili 
Subject: RE: [PATCH 1/2] drm/amdkfd: fix trap handling work around for debugging

[Public]

+ Ruili Ji, as this is a follow-up to

commit 52223c7e74d124bea47beec467e59fdfc77559fc
Author: Ruili Ji 
Date:   Tue Jun 6 14:06:01 2023 +0800

drm/amdkfd: To enable traps for GC_11_0_4 and up

Flag trap_en should be enabled for the trap handler.

Signed-off-by: Ruili Ji 
Signed-off-by: Aaron Liu 
Reviewed-by: Alex Deucher 

This ensures the debugger path stays consistent with the other version checks.
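
For reference, below is a minimal standalone sketch (userspace C, not the kernel code) of why funnelling every site through one kfd_dbg_has_cwsr_workaround()-style predicate keeps trap_en and the debug workaround flag from drifting apart; the IP_VERSION packing is an assumption made only for this example.

/*
 * Illustrative only: a single predicate drives both trap_en and the
 * debug workaround flag, so the two can never disagree per GC version.
 * The IP_VERSION encoding below is assumed for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

#define IP_VERSION(maj, min, rev) (((maj) << 16) | ((min) << 8) | (rev))

static bool has_cwsr_workaround(unsigned int gc_ver)
{
    /* GC 11.0.0 .. 11.0.3 need the CWSR trap-handler workaround. */
    return gc_ver >= IP_VERSION(11, 0, 0) && gc_ver <= IP_VERSION(11, 0, 3);
}

int main(void)
{
    unsigned int versions[] = {
        IP_VERSION(10, 3, 0), IP_VERSION(11, 0, 1),
        IP_VERSION(11, 0, 4), IP_VERSION(12, 0, 0),
    };

    for (int i = 0; i < 4; i++) {
        bool wa = has_cwsr_workaround(versions[i]);

        /* trap_en is simply the negation of the workaround;
         * the per-queue debug workaround flag (when debugging) is the
         * workaround itself. */
        printf("gc=0x%06x trap_en=%d dbg_wa=%d\n", versions[i], !wa, wa);
    }
    return 0;
}

With one predicate, GC 11.0.4 naturally falls outside the workaround range, so it keeps trap_en enabled, matching the GC_11_0_4 commit above.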

Thanks,

Jon

> -Original Message-
> From: Kim, Jonathan 
> Sent: Friday, July 14, 2023 5:38 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix ; Kim, Jonathan
> 
> Subject: [PATCH 1/2] drm/amdkfd: fix trap handling work around for
> debugging
>
> Update the list of devices that require the cwsr trap handling
> workaround for debugging use cases.
>
> Signed-off-by: Jonathan Kim 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_debug.c                | 5 ++---
>  drivers/gpu/drm/amd/amdkfd/kfd_debug.h                | 6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++----
>  3 files changed, 10 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 190b03efe5ff..ccfc81f085ce 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -302,8 +302,7 @@ static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
>   if (!q)
>   return 0;
>
> - if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
> - KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
> + if (!kfd_dbg_has_cwsr_workaround(q->device))
>   return 0;
>
>   if (enable && q->properties.is_user_cu_masked)
> @@ -349,7 +348,7 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>  {
>   uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
>   uint32_t flags = pdd->process->dbg_flags;
> - bool sq_trap_en = !!spi_dbg_cntl;
> + bool sq_trap_en = !!spi_dbg_cntl || !kfd_dbg_has_cwsr_workaround(pdd->dev);
>
>   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
>   return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index ba616ed17dee..586d7f886712 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -101,6 +101,12 @@ static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_node *dev)
>  KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
>  }
>
> +static inline bool kfd_dbg_has_cwsr_workaround(struct kfd_node *dev)
> +{
> + return KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
> +KFD_GC_VERSION(dev) <= IP_VERSION(11, 0, 3);
> +}
> +
>  static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev)
>  {
>   if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 31cac1fd0d58..761963ad6154 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -226,8 +226,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   queue_input.paging = false;
>   queue_input.tba_addr = qpd->tba_addr;
>   queue_input.tma_addr = qpd->tma_addr;
> - queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
> -   KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3);
> + queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
>   queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
>
>   queue_type = convert_to_mes_queue_type(q->properties.type);
> @@ -1827,8 +1826,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>*/
>   q->properties.is_evicted = !!qpd->evicted;
>   q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
> - KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
> - KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3);
> + kfd_dbg_has_cwsr_workaround(q->device);
>
>   if (qd)
>   mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
> --
> 2.25.1




[PATCH v2] drm/amdkfd: To enable traps for GC_11_0_4 and up

2023-06-08 Thread Ji, Ruili
From: Ruili Ji 

Flag trap_en should be enabled for the trap handler.

Signed-off-by: Ruili Ji 
Signed-off-by: Aaron Liu 
Reviewed-by: Alex Deucher 
---
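A standalone before/after check of the trap_en expression (sketch only; the IP_VERSION packing below is an assumption for illustration, not the kernel macros) shows that the change only affects GC versions above 11.0.3 and below 12.0.0, which now get traps enabled:

/* Sketch: old vs. new trap_en condition for a few sample GC versions. */
#include <stdbool.h>
#include <stdio.h>

#define IP_VERSION(maj, min, rev) (((maj) << 16) | ((min) << 8) | (rev))

int main(void)
{
    unsigned int versions[] = {
        IP_VERSION(10, 3, 0), IP_VERSION(11, 0, 3),
        IP_VERSION(11, 0, 4), IP_VERSION(12, 0, 0),
    };

    for (int i = 0; i < 4; i++) {
        unsigned int gc = versions[i];
        bool old_en = gc < IP_VERSION(11, 0, 0) || gc >= IP_VERSION(12, 0, 0);
        bool new_en = gc < IP_VERSION(11, 0, 0) || gc > IP_VERSION(11, 0, 3);

        /* Only GC 11.0.4 and later 11.x parts flip from 0 to 1. */
        printf("gc=0x%06x old trap_en=%d new trap_en=%d\n", gc, old_en, new_en);
    }
    return 0;
}
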
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d6b15493fffd..8a39a9e0ed5a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -227,7 +227,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
- KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0);
+ KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3);
queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
 
queue_type = convert_to_mes_queue_type(q->properties.type);
@@ -1807,7 +1807,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
q->properties.is_evicted = !!qpd->evicted;
q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
-   KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
+   KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3);
 
if (qd)
mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
-- 
2.40.1



[PATCH] drm/amdkfd: to fix cwsr hang issue

2023-06-07 Thread Ji, Ruili
From: Ruili Ji 

Starting from GC_11_0_4, flag trap_en should be enabled for the trap handler.

Signed-off-by: Ruili Ji 
Signed-off-by: Aaron Liu 
Reviewed-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0c1be91a87c6..b695d7a3058c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -227,7 +227,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
- KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) ||
+ KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3) ||
  q->properties.is_dbg_wa;
queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
 
@@ -1808,7 +1808,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
q->properties.is_evicted = !!qpd->evicted;
q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
-   KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
+   KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3);
 
if (qd)
mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, 
>gart_mqd_addr,
-- 
2.40.1



[PATCH v2] drm/amdkfd: To fix sdma page fault issue for GC 11.x

2023-02-08 Thread Ji, Ruili
From: Ruili Ji 

For the MQD memory, KMD always allocates 4K memory, and the MES scheduler
writes to the end of the MQD for the unmap flag.

Signed-off-by: Ruili Ji 
---
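As a rough userspace illustration of the allocation rule described above (toy struct and sizes, not the kernel code), the per-MQD size simply switches to a full page when MES is enabled, and the memset has to cover the same size it allocated:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

/* Stand-in for struct v11_sdma_mqd; much smaller than a page. */
struct toy_sdma_mqd { unsigned int regs[128]; };

static size_t mqd_alloc_size(bool enable_mes)
{
    /* With MES, the scheduler writes an unmap flag past the MQD struct,
     * so reserve (and clear) a whole page instead of just the struct. */
    return enable_mes ? TOY_PAGE_SIZE : sizeof(struct toy_sdma_mqd);
}

int main(void)
{
    for (int mes = 0; mes <= 1; mes++) {
        size_t size = mqd_alloc_size(mes);
        void *mqd = malloc(size);

        if (!mqd)
            return 1;
        /* Mirrors init_mqd_sdma(): clear exactly the size that was allocated. */
        memset(mqd, 0, size);
        printf("enable_mes=%d -> allocate and clear %zu bytes\n", mes, size);
        free(mqd);
    }
    return 0;
}
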
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 15 ++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..7a95698d83f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2373,7 +2373,7 @@ struct device_queue_manager 
*device_queue_manager_init(struct kfd_dev *dev)
if (init_mqd_managers(dqm))
goto out_free;
 
-   if (allocate_hiq_sdma_mqd(dqm)) {
+   if (!dev->shared_resources.enable_mes && allocate_hiq_sdma_mqd(dqm)) {
pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
goto out_free;
}
@@ -2397,7 +2397,8 @@ static void deallocate_hiq_sdma_mqd(struct kfd_dev *dev,
 void device_queue_manager_uninit(struct device_queue_manager *dqm)
 {
dqm->ops.uninitialize(dqm);
-   deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
+   if (!dqm->dev->shared_resources.enable_mes)
+   deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
kfree(dqm);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 4f6390f3236e..4a9af800b1f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -308,11 +308,16 @@ static void init_mqd_sdma(struct mqd_manager *mm, void 
**mqd,
struct queue_properties *q)
 {
struct v11_sdma_mqd *m;
+   int size;
 
m = (struct v11_sdma_mqd *) mqd_mem_obj->cpu_ptr;
 
-   memset(m, 0, sizeof(struct v11_sdma_mqd));
+   if (mm->dev->shared_resources.enable_mes)
+   size = PAGE_SIZE;
+   else
+   size = sizeof(struct v11_sdma_mqd);
 
+   memset(m, 0, size);
*mqd = m;
if (gart_addr)
*gart_addr = mqd_mem_obj->gpu_addr;
@@ -443,6 +448,14 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE 
type,
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
+   /*
+* Allocate SDMA MQDs with the generic functions
+* when MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes) {
+   mqd->allocate_mqd = allocate_mqd;
+   mqd->free_mqd = kfd_free_mqd_cp;
+   }
pr_debug("%s@%i\n", __func__, __LINE__);
break;
default:
-- 
2.25.1



[PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x

2023-02-06 Thread Ji, Ruili
From: Ruili Ji 

For the MQD memory, KMD always allocates 4K memory, and the MES scheduler
writes to the end of the MQD for the unmap flag.

Signed-off-by: Ruili Ji 
---
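A small userspace sketch of the size/offset arithmetic this patch touches (toy sizes and counts, not the real ones): the trunk allocation and the per-queue offset must use the same stride, which becomes PAGE_SIZE when MES is enabled so the scheduler's write past the struct stays inside that queue's slot.

#include <stdio.h>

#define TOY_PAGE_SIZE 4096u

int main(void)
{
    unsigned int num_engines = 2, queues_per_engine = 8;
    unsigned int hiq_mqd_size = 512;          /* example value */
    unsigned int sdma_mqd_struct_size = 384;  /* example stand-in for the v11 struct */

    for (int mes = 0; mes <= 1; mes++) {
        /* Per-queue stride: a full page when MES is enabled, the struct
         * size otherwise.  Allocation size and per-queue offsets must use
         * the same stride, or neighbouring queues overlap. */
        unsigned int stride = mes ? TOY_PAGE_SIZE : sdma_mqd_struct_size;
        unsigned int total = stride * num_engines * queues_per_engine + hiq_mqd_size;

        unsigned int engine_id = 1, queue_id = 3;
        unsigned int offset = (engine_id * queues_per_engine + queue_id) * stride
                            + hiq_mqd_size;

        printf("enable_mes=%d total=%u bytes, queue (%u,%u) at offset %u\n",
               mes, total, engine_id, queue_id, offset);
    }
    return 0;
}
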
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 12 +--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..d682e6921438 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct 
device_queue_manager *dqm)
int retval;
struct kfd_dev *dev = dqm->dev;
struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
-   uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
-   get_num_all_sdma_engines(dqm) *
-   dev->device_info.num_sdma_queues_per_engine +
-   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   uint32_t size;
+   /*
+* MES writes to areas beyond the MQD size, so allocate
+* PAGE_SIZE memory for the MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes) {
+   size = PAGE_SIZE *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   } else {
+   size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+   get_num_all_sdma_engines(dqm) *
+   dev->device_info.num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+   }
 
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,
&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 623ccd227b7d..ea176a515898 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
 {
struct kfd_mem_obj *mqd_mem_obj = NULL;
uint64_t offset;
+   uint32_t size;
 
mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
if (!mqd_mem_obj)
return NULL;
+   /*
+* MES writes to areas beyond the MQD size, so allocate
+* PAGE_SIZE memory for the MQD if MES is enabled.
+*/
+   if (dev->shared_resources.enable_mes)
+   size = PAGE_SIZE;
+   else
+   size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
 
offset = (q->sdma_engine_id *
dev->device_info.num_sdma_queues_per_engine +
-   q->sdma_queue_id) *
-   dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+   q->sdma_queue_id) * size;
 
offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
 
-- 
2.25.1



[PATCH] drm/amdgpu: Enable F32_WPTR_POLL_ENABLE in mqd

2022-09-30 Thread Ji, Ruili
From: Ruili Ji 

This patch fixes the missing SDMA user queue doorbell issue on SDMA 6.0.
F32_WPTR_POLL_ENABLE has to be set when doorbell mode is used; otherwise,
ringing the SDMA user queue doorbell cannot wake the system from gfxoff.

Signed-off-by: Yifan Zhang 
Signed-off-by: Ruili Ji 
Change-Id: Icfb97c3551509b4d7fb172ebc4200edf5844e5e1
---
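As an illustration of how the RB_CNTL word is built up (standalone sketch; the *_SHIFT positions below are placeholders, not the real SDMA 6.0 register layout), the fix simply ORs one more field into the value that the MQD init/update paths compose:

#include <stdio.h>

/* Placeholder field positions, for illustration only. */
#define RB_SIZE__SHIFT                1
#define RPTR_WRITEBACK_ENABLE__SHIFT  12
#define RPTR_WRITEBACK_TIMER__SHIFT   16
#define F32_WPTR_POLL_ENABLE__SHIFT   23

/* Toy order_base_2(): smallest order such that 1 << order >= x. */
static unsigned int order_base_2(unsigned int x)
{
    unsigned int order = 0;

    while ((1u << order) < x)
        order++;
    return order;
}

int main(void)
{
    unsigned int queue_size = 4096; /* bytes, example value */
    unsigned int rb_cntl =
        order_base_2(queue_size / 4) << RB_SIZE__SHIFT |
        1 << RPTR_WRITEBACK_ENABLE__SHIFT |
        4 << RPTR_WRITEBACK_TIMER__SHIFT;

    /* Per the commit message, without this bit ringing the doorbell
     * cannot wake the engine from gfxoff, so the fix ORs it in. */
    rb_cntl |= 1 << F32_WPTR_POLL_ENABLE__SHIFT;

    printf("rb_cntl = 0x%08x\n", rb_cntl);
    return 0;
}
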
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index db51230163c5..0150f66a5ae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -846,7 +846,8 @@ static int sdma_v6_0_mqd_init(struct amdgpu_device *adev, 
void *mqd,
m->sdmax_rlcx_rb_cntl =
order_base_2(prop->queue_size / 4) << 
SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
-   4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+   4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT |
+   1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT;
 
m->sdmax_rlcx_rb_base = lower_32_bits(prop->hqd_base_gpu_addr >> 8);
m->sdmax_rlcx_rb_base_hi = upper_32_bits(prop->hqd_base_gpu_addr >> 8);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 26b53b6d673e..4f6390f3236e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -333,7 +333,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void 
*mqd,
<< SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT |
1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
-   6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+   6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT |
+   1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT;
 
m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
-- 
2.25.1



[PATCH v3] drm/amdgpu: To flush tlb for MMHUB of RAVEN series

2022-06-22 Thread Ji, Ruili
From: Ruili Ji 

amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for 
process test_basic pid 3305 thread test_basic pid 3305)
amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC)
amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051
amdgpu: Faulty UTCL2 client ID: MP1 (0x0)
amdgpu: MORE_FAULTS: 0x1
amdgpu: WALKER_ERROR: 0x0
amdgpu: PERMISSION_FAULTS: 0x5
amdgpu: MAPPING_ERROR: 0x0
amdgpu: RW: 0x1

When memory is allocated by KFD, nothing triggers the TLB flush for MMHUB0,
so a page fault is raised from MMHUB0.

v2:fix indentation
v3:change subject and fix indentation

Signed-off-by: Ruili Ji 
Reviewed-by: Philip Yang 
Acked-by: Alex Deucher 
---
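The decision the patch changes can be sketched as a tiny predicate (standalone C; the family constants are placeholders for the real amd_shared.h values): per the commit message, KFD allocations on these GFX9-family parts are also mapped through MMHUB, so the PASID TLB flush has to cover every hub, not just GFXHUB.

#include <stdbool.h>
#include <stdio.h>

/* Placeholder family IDs, for illustration only. */
enum toy_family { TOY_FAMILY_AI, TOY_FAMILY_RV, TOY_FAMILY_NV };

static bool flush_all_hubs(enum toy_family family)
{
    /* AI (and, after this patch, RV) need the flush on all hubs so that
     * MMHUB0 does not keep stale translations for KFD allocations. */
    return family == TOY_FAMILY_AI || family == TOY_FAMILY_RV;
}

int main(void)
{
    printf("AI: all_hub=%d\n", flush_all_hubs(TOY_FAMILY_AI));
    printf("RV: all_hub=%d\n", flush_all_hubs(TOY_FAMILY_RV));
    printf("NV: all_hub=%d\n", flush_all_hubs(TOY_FAMILY_NV));
    return 0;
}
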
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1d0c9762ebfb..a6801df038a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device 
*adev,
 {
bool all_hub = false;
 
-   if (adev->family == AMDGPU_FAMILY_AI)
+   if (adev->family == AMDGPU_FAMILY_AI ||
+   adev->family == AMDGPU_FAMILY_RV)
all_hub = true;
 
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
-- 
2.25.1



[PATCH v2] mdkfd: To flush tlb for MMHUB of GFX9 series

2022-06-22 Thread Ji, Ruili
From: Ruili Ji 

amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for 
process test_basic pid 3305 thread test_basic pid 3305)
amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC)
amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051
amdgpu: Faulty UTCL2 client ID: MP1 (0x0)
amdgpu: MORE_FAULTS: 0x1
amdgpu: WALKER_ERROR: 0x0
amdgpu: PERMISSION_FAULTS: 0x5
amdgpu: MAPPING_ERROR: 0x0
amdgpu: RW: 0x1

When memory is allocated by KFD, nothing triggers the TLB flush for MMHUB0,
so a page fault is raised from MMHUB0.

v2:fix indentation

Signed-off-by: Ruili Ji 
Reviewed-by: Philip Yang 
Acked-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1d0c9762ebfb..1dfd82d5d379 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device 
*adev,
 {
bool all_hub = false;
 
-   if (adev->family == AMDGPU_FAMILY_AI)
+   if (adev->family == AMDGPU_FAMILY_AI ||
+ adev->family == AMDGPU_FAMILY_RV)
all_hub = true;
 
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
-- 
2.25.1



[PATCH] drm/amdkfd: To flush tlb for MMHUB of GFX9 series

2022-06-20 Thread Ji, Ruili
From: Ruili Ji 

amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for 
process test_basic pid 3305 thread test_basic pid 3305)
amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC)
amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051
amdgpu: Faulty UTCL2 client ID: MP1 (0x0)
amdgpu: MORE_FAULTS: 0x1
amdgpu: WALKER_ERROR: 0x0
amdgpu: PERMISSION_FAULTS: 0x5
amdgpu: MAPPING_ERROR: 0x0
amdgpu: RW: 0x1

When memory is allocated by KFD, nothing triggers the TLB flush for MMHUB0,
so a page fault is raised from MMHUB0.

Signed-off-by: Ruili Ji 
Change-Id: I97786f02849dd047703d6e8feff53916b307715c
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1d0c9762ebfb..12fc822c0a92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device 
*adev,
 {
bool all_hub = false;
 
-   if (adev->family == AMDGPU_FAMILY_AI)
+   if (adev->family == AMDGPU_FAMILY_AI
+   || adev->family == AMDGPU_FAMILY_RV)
all_hub = true;
 
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
-- 
2.25.1



RE: [PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address

2022-03-29 Thread Ji, Ruili
[AMD Official Use Only]

Hi Paul,

This is not related to any issue.

Kind regards,
Ruili

-Original Message-
From: Paul Menzel 
Sent: March 29, 2022 16:16
To: Ji, Ruili 
Cc: amd-gfx@lists.freedesktop.org; Zhang, Yifan ; Liu, 
Aaron ; Liang, Prike ; Huang, Ray 
; Deucher, Alexander ; Ji, Ruili 

Subject: Re: [PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address


Dear Ruili,


Thank you for your patch.

Am 28.03.22 um 06:58 schrieb Ji, Ruili:
> From: Ruili Ji 
>
> gfx10.3.3/gfx10.3.6/gfx10.3.7 shall use 0x1580 address for
> GCR_GENERAL_CNTL

Is any “user-visible” problem fixed by this?

Please add a Fixes tag.


Kind regards,

Paul


> Signed-off-by: Ruili Ji 
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 99df18ae7316..e4c9d92ac381 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden 
> golden_settings_gc_10_3_3[] =
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242),
> - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
> 0x0500),
> + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh,
> + 0x1ff1, 0x0500),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
> 0x32103210),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x,
> 0x32103210), @@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden 
> golden_settings_gc_10_3_6[] =
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042),
> - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
> 0x0500),
> + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh,
> + 0x1ff1, 0x0500),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
> 0x32103210),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x,
> 0x32103210), @@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden 
> golden_settings_gc_10_3_7[] = {
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041),
> - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
> 0x0500),
> + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh,
> + 0x1ff1, 0x0500),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
> 0x32103210),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x,
> 0x32103210),


[PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address

2022-03-27 Thread Ji, Ruili
From: Ruili Ji 

gfx10.3.3/gfx10.3.6/gfx10.3.7 shall use 0x1580 address for GCR_GENERAL_CNTL

Signed-off-by: Ruili Ji 
---
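For context, a golden-settings entry is just a {register, and-mask, or-value} triple, so keying it on the wrong symbolic name silently programs a different register. The sketch below is illustrative only: the 0x1490 offset is invented, 0x1580 is the address the commit message cites for these parts, and the mask/value mirror the (line-wrapped) table entry above.

#include <stdio.h>

/* Invented offsets purely to show that the two names are different registers. */
#define mmGCR_GENERAL_CNTL_example          0x1490
#define mmGCR_GENERAL_CNTL_Vangogh_example  0x1580

struct golden_entry {
    unsigned int reg;
    unsigned int and_mask;
    unsigned int or_val;
};

int main(void)
{
    /* Same mask and value, but the entry lands on a different register
     * depending on which symbolic name the golden table uses. */
    struct golden_entry wrong = { mmGCR_GENERAL_CNTL_example,         0x1ff1, 0x0500 };
    struct golden_entry fixed = { mmGCR_GENERAL_CNTL_Vangogh_example, 0x1ff1, 0x0500 };

    printf("wrong entry targets reg 0x%04x, fixed entry targets reg 0x%04x\n",
           wrong.reg, fixed.reg);
    return 0;
}
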
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 99df18ae7316..e4c9d92ac381 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_3[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
@@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_6[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
@@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_7[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
-- 
2.25.1



[PATCH] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address

2022-03-27 Thread Ji, Ruili
From: Ruili Ji 

RMB shall use 0x1580 address for GCR_GENERAL_CNTL

Signed-off-by: Ruili Ji 
Change-Id: I10a85891986f31411f85fa3db46970aaa8a5bd03
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 99df18ae7316..e4c9d92ac381 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_3[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
@@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_6[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
@@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_7[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 
0x0500),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 
0x0500),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 
0x32103210),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 
0x32103210),
-- 
2.25.1



RE: [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process exit

2022-01-20 Thread Ji, Ruili
[AMD Official Use Only]

sudo ./kfdtest --gtest_filter=KFDSVM*
sudo ./kfdtest
Test results are pass.
Tested-by: Ruili Ji 

-Original Message-
From: Yang, Philip 
Sent: January 20, 2022 0:23
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Ji, Ruili ; 
Yang, Philip 
Subject: [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process 
exit

kfd_process_notifier_release flushes svm_range_restore_work, which calls 
svm_range_list_lock_and_flush_work to flush the deferred_list work. But if the 
deferred_list work's mmput releases the last user, it calls exit_mmap -> 
notifier_release, which deadlocks as shown in the backtrace below.

Move the svm_range_restore_work flush to kfd_process_wq_release to avoid the 
deadlock. Then have svm_range_restore_work take a task->mm reference so the mm 
cannot go away while ranges are validated and mapped to the GPU.

Workqueue: events svm_range_deferred_list_work [amdgpu]
Call Trace:
 wait_for_completion+0x94/0x100
 __flush_work+0x12a/0x1e0
 __cancel_work_timer+0x10e/0x190
 cancel_delayed_work_sync+0x13/0x20
 kfd_process_notifier_release+0x98/0x2a0 [amdgpu]
 __mmu_notifier_release+0x74/0x1f0
 exit_mmap+0x170/0x200
 mmput+0x5d/0x130
 svm_range_deferred_list_work+0x104/0x230 [amdgpu]
 process_one_work+0x220/0x3c0

Signed-off-by: Philip Yang 
Reported-by: Ruili Ji 
Tested-by: Ruili Ji 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 -
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 15 +--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d1145da5348f..74f162887d3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1150,7 +1150,6 @@ static void kfd_process_notifier_release(struct 
mmu_notifier *mn,

cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
-   cancel_delayed_work_sync(&p->svms.restore_work);

mutex_lock(&p->mutex);

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 9ec195e1ef23..2d2cae05dbea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1643,13 +1643,14 @@ static void svm_range_restore_work(struct work_struct 
*work)

pr_debug("restore svm ranges\n");

-   /* kfd_process_notifier_release destroys this worker thread. So during
-* the lifetime of this thread, kfd_process and mm will be valid.
-*/
p = container_of(svms, struct kfd_process, svms);
-   mm = p->mm;
-   if (!mm)
+
+   /* Keep mm reference when svm_range_validate_and_map ranges */
+   mm = get_task_mm(p->lead_thread);
+   if (!mm) {
+   pr_debug("svms 0x%p process mm gone\n", svms);
return;
+   }

svm_range_list_lock_and_flush_work(svms, mm);
mutex_lock(&svms->lock);
@@ -1703,6 +1704,7 @@ static void svm_range_restore_work(struct work_struct 
*work)
 out_reschedule:
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
+   mmput(mm);

/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
@@ -2837,6 +2839,8 @@ void svm_range_list_fini(struct kfd_process *p)

pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);

+   cancel_delayed_work_sync(&p->svms.restore_work);
+
/* Ensure list work is finished before process is destroyed */
flush_work(&p->svms.deferred_list_work);

@@ -2847,7 +2851,6 @@ void svm_range_list_fini(struct kfd_process *p)
atomic_inc(&p->svms.drain_pagefaults);
svm_range_drain_retry_fault(&p->svms);

-
list_for_each_entry_safe(prange, next, &p->svms.list, list) {
svm_range_unlink(prange);
svm_range_remove_notifier(prange);
--
2.17.1



RE: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone

2022-01-20 Thread Ji, Ruili
[AMD Official Use Only]

sudo ./kfdtest --gtest_filter=KFDSVM*
sudo ./kfdtest
Test results are pass.
Tested-by: Ruili Ji 

-Original Message-
From: Yang, Philip 
Sent: January 20, 2022 0:23
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Ji, Ruili ; 
Yang, Philip 
Subject: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after 
mm gone

After mm is removed from task->mm, the deferred_list work should continue to 
handle the deferred_range_list, which may have been split into child ranges, to 
avoid leaking child ranges, and should remove the ranges' mmu interval notifiers 
to avoid leaking mm mm_count, but it should skip updating notifiers and 
inserting new notifiers.

Signed-off-by: Philip Yang 
Reported-by: Ruili Ji 
Tested-by: Ruili Ji 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 41 
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2805ba74c80..9ec195e1ef23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1985,10 +1985,9 @@ svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
 }

 static void
-svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
+svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
+struct mm_struct *mm)
 {
-   struct mm_struct *mm = prange->work_item.mm;
-
switch (prange->work_item.op) {
case SVM_OP_NULL:
pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
@@ -2004,25 +2003,29 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
case SVM_OP_UPDATE_RANGE_NOTIFIER:
pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 svms, prange, prange->start, prange->last);
-   svm_range_update_notifier_and_interval_tree(mm, prange);
+   if (mm)
+   svm_range_update_notifier_and_interval_tree(mm, prange);
break;
case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 svms, prange, prange->start, prange->last);
-   svm_range_update_notifier_and_interval_tree(mm, prange);
+   if (mm)
+   svm_range_update_notifier_and_interval_tree(mm, prange);
/* TODO: implement deferred validation and mapping */
break;
case SVM_OP_ADD_RANGE:
pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
 prange->start, prange->last);
svm_range_add_to_svms(prange);
-   svm_range_add_notifier_locked(mm, prange);
+   if (mm)
+   svm_range_add_notifier_locked(mm, prange);
break;
case SVM_OP_ADD_RANGE_AND_MAP:
pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
 prange, prange->start, prange->last);
svm_range_add_to_svms(prange);
-   svm_range_add_notifier_locked(mm, prange);
+   if (mm)
+   svm_range_add_notifier_locked(mm, prange);
/* TODO: implement deferred validation and mapping */
break;
default:
@@ -2071,20 +2074,22 @@ static void svm_range_deferred_list_work(struct 
work_struct *work)
pr_debug("enter svms 0x%p\n", svms);

p = container_of(svms, struct kfd_process, svms);
-   /* Avoid mm is gone when inserting mmu notifier */
+
+   /* If mm is gone, continue to clean up the deferred_range_list */
mm = get_task_mm(p->lead_thread);
-   if (!mm) {
+   if (!mm)
pr_debug("svms 0x%p process mm gone\n", svms);
-   return;
-   }
+
 retry:
-   mmap_write_lock(mm);
+   if (mm)
+   mmap_write_lock(mm);

/* Checking for the need to drain retry faults must be inside
 * mmap write lock to serialize with munmap notifiers.
 */
if (unlikely(atomic_read(&svms->drain_pagefaults))) {
-   mmap_write_unlock(mm);
+   if (mm)
+   mmap_write_unlock(mm);
svm_range_drain_retry_fault(svms);
goto retry;
}
@@ -2109,19 +2114,21 @@ static void svm_range_deferred_list_work(struct 
work_struct *work)
pr_debug("child prange 0x%p op %d\n", pchild,
 pchild->work_item.op);
list_del_init(&pchild->child_list);
-   svm_range_handle_list_op(svms, pchild);
+   svm_range_handle_list_op(svms, pchild, mm);
}
mutex_unlock(>mi