RE: [PATCH 1/2] drm/amdkfd: fix trap handling work around for debugging
[Public] Hi Jonathan, Your change looks fine. Acknowledged-by: Ji, Ruili Thanks, Ruili -Original Message- From: Kim, Jonathan Sent: Wednesday, July 19, 2023 6:13 AM To: amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Ji, Ruili Subject: RE: [PATCH 1/2] drm/amdkfd: fix trap handling work around for debugging [Public] + Ruili Ji as this is a follow up to commit 52223c7e74d124bea47beec467e59fdfc77559fc Author: Ruili Ji Date: Tue Jun 6 14:06:01 2023 +0800 drm/amdkfd: To enable traps for GC_11_0_4 and up Flag trap_en should be enabled for trap handler. Signed-off-by: Ruili Ji Signed-off-by: Aaron Liu Reviewed-by: Alex Deucher To ensure debugger is consistent with other checks. Thanks, Jon > -Original Message- > From: Kim, Jonathan > Sent: Friday, July 14, 2023 5:38 AM > To: amd-gfx@lists.freedesktop.org > Cc: Kuehling, Felix ; Kim, Jonathan > > Subject: [PATCH 1/2] drm/amdkfd: fix trap handling work around for > debugging > > Update the list of devices that require the cwsr trap handling > workaround for debugging use cases. 
> > Signed-off-by: Jonathan Kim > --- > drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 5 ++--- > drivers/gpu/drm/amd/amdkfd/kfd_debug.h| 6 ++ > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++ > 3 files changed, 10 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > index 190b03efe5ff..ccfc81f085ce 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > @@ -302,8 +302,7 @@ static int kfd_dbg_set_queue_workaround(struct > queue *q, bool enable) > if (!q) > return 0; > > - if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || > - KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0)) > + if (!kfd_dbg_has_cwsr_workaround(q->device)) > return 0; > > if (enable && q->properties.is_user_cu_masked) @@ -349,7 +348,7 > @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd) { > uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd- > >spi_dbg_launch_mode; > uint32_t flags = pdd->process->dbg_flags; > - bool sq_trap_en = !!spi_dbg_cntl; > + bool sq_trap_en = !!spi_dbg_cntl || > !kfd_dbg_has_cwsr_workaround(pdd->dev); > > if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) > return 0; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > index ba616ed17dee..586d7f886712 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > @@ -101,6 +101,12 @@ static inline bool > kfd_dbg_is_rlc_restore_supported(struct kfd_node *dev) >KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1)); } > > +static inline bool kfd_dbg_has_cwsr_workaround(struct kfd_node *dev) > +{ > + return KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) && > +KFD_GC_VERSION(dev) <= IP_VERSION(11, 0, 3); } > + > static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev) { > if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1) diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 31cac1fd0d58..761963ad6154 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -226,8 +226,7 @@ static int add_queue_mes(struct > device_queue_manager *dqm, struct queue *q, > queue_input.paging = false; > queue_input.tba_addr = qpd->tba_addr; > queue_input.tma_addr = qpd->tma_addr; > - queue_input.trap_en = KFD_GC_VERSION(q->device) < > IP_VERSION(11, 0, 0) || > - KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, > 3); > + queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device); > queue_input.skip_process_ctx_clear = qpd->pqm->process- > >debug_trap_enabled; > > queue_type = convert_to_mes_queue_type(q->properties.type); > @@ -1827,8 +1826,7 @@ static int create_queue_cpsch(struct > device_queue_manager *dqm, struct queue *q, >*/ > q->properties.is_evicted = !!qpd->evicted; > q->properties.is_dbg_wa = qpd->pqm->process- > >debug_trap_enabled && > - KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) > && > - KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3); > + > + kfd_dbg_has_cwsr_workaround(q->device); > > if (qd) > mqd_mgr->restore_mqd(mqd_mgr, >mqd, q- > >mqd_mem_obj, >gart_mqd_addr, > -- > 2.25.1
[PATCH v2] drm/amdkfd: To enable traps for GC_11_0_4 and up
From: Ruili Ji Flag trap_en should be enabled for trap handler. Signed-off-by: Ruili Ji Signed-off-by: Aaron Liu Reviewed-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index d6b15493fffd..8a39a9e0ed5a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -227,7 +227,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.tba_addr = qpd->tba_addr; queue_input.tma_addr = qpd->tma_addr; queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || - KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0); + KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3); queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled; queue_type = convert_to_mes_queue_type(q->properties.type); @@ -1807,7 +1807,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, q->properties.is_evicted = !!qpd->evicted; q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled && KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0); + KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3); if (qd) mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, >gart_mqd_addr, -- 2.40.1
[PATCH] drm/amdkfd: to fix cwsr hang issue
From: Ruili Ji Starting from GC_11_0_4, flag trap_en should be enabled for trap handler. Signed-off-by: Ruili Ji Signed-off-by: Aaron Liu Reviewed-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 0c1be91a87c6..b695d7a3058c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -227,7 +227,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.tba_addr = qpd->tba_addr; queue_input.tma_addr = qpd->tma_addr; queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || - KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) || + KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3) || q->properties.is_dbg_wa; queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled; @@ -1808,7 +1808,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, q->properties.is_evicted = !!qpd->evicted; q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled && KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0); + KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3); if (qd) mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, >gart_mqd_addr, -- 2.40.1
[PATCH v2] drm/amdkfd: To fix sdma page fault issue for GC 11.x
From: Ruili Ji For the MQD memory, KMD would always allocate 4K memory, and mes scheduler would write to the end of MQD for unmap flag. Signed-off-by: Ruili Ji --- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 15 ++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c06ada0844ba..7a95698d83f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2373,7 +2373,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) if (init_mqd_managers(dqm)) goto out_free; - if (allocate_hiq_sdma_mqd(dqm)) { + if (!dev->shared_resources.enable_mes && allocate_hiq_sdma_mqd(dqm)) { pr_err("Failed to allocate hiq sdma mqd trunk buffer\n"); goto out_free; } @@ -2397,7 +2397,8 @@ static void deallocate_hiq_sdma_mqd(struct kfd_dev *dev, void device_queue_manager_uninit(struct device_queue_manager *dqm) { dqm->ops.uninitialize(dqm); - deallocate_hiq_sdma_mqd(dqm->dev, >hiq_sdma_mqd); + if (!dqm->dev->shared_resources.enable_mes) + deallocate_hiq_sdma_mqd(dqm->dev, >hiq_sdma_mqd); kfree(dqm); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 4f6390f3236e..4a9af800b1f1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -308,11 +308,16 @@ static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, struct queue_properties *q) { struct v11_sdma_mqd *m; + int size; m = (struct v11_sdma_mqd *) mqd_mem_obj->cpu_ptr; - memset(m, 0, sizeof(struct v11_sdma_mqd)); + if (mm->dev->shared_resources.enable_mes) + size = PAGE_SIZE; + else + size = sizeof(struct v11_sdma_mqd); + memset(m, 0, size); *mqd = m; if (gart_addr) *gart_addr = mqd_mem_obj->gpu_addr; @@ -443,6 +448,14 
@@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif + /* +* To allocate SDMA MQDs by generic functions +* when MES is enabled. +*/ + if (dev->shared_resources.enable_mes) { + mqd->allocate_mqd = allocate_mqd; + mqd->free_mqd = kfd_free_mqd_cp; + } pr_debug("%s@%i\n", __func__, __LINE__); break; default: -- 2.25.1
[PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x
From: Ruili Ji For the MQD memory, KMD would always allocate 4K memory, and mes scheduler would write to the end of MQD for unmap flag. Signed-off-by: Ruili Ji --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 12 +-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c06ada0844ba..d682e6921438 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) int retval; struct kfd_dev *dev = dqm->dev; struct kfd_mem_obj *mem_obj = >hiq_sdma_mqd; - uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * - get_num_all_sdma_engines(dqm) * - dev->device_info.num_sdma_queues_per_engine + - dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + uint32_t size; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. 
+*/ + if (dev->shared_resources.enable_mes) { + size = PAGE_SIZE * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } else { + size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * + get_num_all_sdma_engines(dqm) * + dev->device_info.num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + } retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size, &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 623ccd227b7d..ea176a515898 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, { struct kfd_mem_obj *mqd_mem_obj = NULL; uint64_t offset; + uint32_t size; mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); if (!mqd_mem_obj) return NULL; + /* +* MES write to areas beyond MQD size. So allocate +* 1 PAGE_SIZE memory for MQD is MES is enabled. +*/ + if (dev->shared_resources.enable_mes) + size = PAGE_SIZE; + else + size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; offset = (q->sdma_engine_id * dev->device_info.num_sdma_queues_per_engine + - q->sdma_queue_id) * - dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; + q->sdma_queue_id) * size; offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; -- 2.25.1
[PATCH] drm/amdgpu: Enable F32_WPTR_POLL_ENABLE in mqd
From: Ruili Ji This patch is to fix the SDMA user queue doorbell missing issue on SDMA 6.0. F32_WPTR_POLL_ENABLE has to be set if doorbell mode is used. Otherwise ringing SDMA user queue doorbell can't wake up system from gfxoff. Signed-off-by: Yifan Zhang Signed-off-by: Ruili Ji Change-Id: Icfb97c3551509b4d7fb172ebc4200edf5844e5e1 --- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index db51230163c5..0150f66a5ae6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -846,7 +846,8 @@ static int sdma_v6_0_mqd_init(struct amdgpu_device *adev, void *mqd, m->sdmax_rlcx_rb_cntl = order_base_2(prop->queue_size / 4) << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + 4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT; m->sdmax_rlcx_rb_base = lower_32_bits(prop->hqd_base_gpu_addr >> 8); m->sdmax_rlcx_rb_base_hi = upper_32_bits(prop->hqd_base_gpu_addr >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 26b53b6d673e..4f6390f3236e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -333,7 +333,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT; m->sdmax_rlcx_rb_base = 
lower_32_bits(q->queue_address >> 8); m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); -- 2.25.1
[PATCH v3] drm/amdgpu: To flush tlb for MMHUB of RAVEN series
From: Ruili Ji amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for process test_basic pid 3305 thread test_basic pid 3305) amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC) amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051 amdgpu: Faulty UTCL2 client ID: MP1 (0x0) amdgpu: MORE_FAULTS: 0x1 amdgpu: WALKER_ERROR: 0x0 amdgpu: PERMISSION_FAULTS: 0x5 amdgpu: MAPPING_ERROR: 0x0 amdgpu: RW: 0x1 When memory is allocated by kfd, no one triggers the tlb flush for MMHUB0. There is page fault from MMHUB0. v2:fix indentation v3:change subject and fix indentation Signed-off-by: Ruili Ji Reviewed-by: Philip Yang Acked-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1d0c9762ebfb..a6801df038a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev, { bool all_hub = false; - if (adev->family == AMDGPU_FAMILY_AI) + if (adev->family == AMDGPU_FAMILY_AI || + adev->family == AMDGPU_FAMILY_RV) all_hub = true; return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); -- 2.25.1
[PATCH v2] drm/amdkfd: To flush tlb for MMHUB of GFX9 series
From: Ruili Ji amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for process test_basic pid 3305 thread test_basic pid 3305) amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC) amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051 amdgpu: Faulty UTCL2 client ID: MP1 (0x0) amdgpu: MORE_FAULTS: 0x1 amdgpu: WALKER_ERROR: 0x0 amdgpu: PERMISSION_FAULTS: 0x5 amdgpu: MAPPING_ERROR: 0x0 amdgpu: RW: 0x1 When memory is allocated by kfd, no one triggers the tlb flush for MMHUB0. There is page fault from MMHUB0. v2:fix indentation Signed-off-by: Ruili Ji Reviewed-by: Philip Yang Acked-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1d0c9762ebfb..1dfd82d5d379 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev, { bool all_hub = false; - if (adev->family == AMDGPU_FAMILY_AI) + if (adev->family == AMDGPU_FAMILY_AI || + adev->family == AMDGPU_FAMILY_RV) all_hub = true; return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); -- 2.25.1
[PATCH] drm/amdkfd: To flush tlb for MMHUB of GFX9 series
From: Ruili Ji amdgpu: [mmhub0] no-retry page fault (src_id:0 ring:40 vmid:8 pasid:32769, for process test_basic pid 3305 thread test_basic pid 3305) amdgpu: in page starting at address 0x7ff990003000 from IH client 0x12 (VMC) amdgpu: VM_L2_PROTECTION_FAULT_STATUS:0x00840051 amdgpu: Faulty UTCL2 client ID: MP1 (0x0) amdgpu: MORE_FAULTS: 0x1 amdgpu: WALKER_ERROR: 0x0 amdgpu: PERMISSION_FAULTS: 0x5 amdgpu: MAPPING_ERROR: 0x0 amdgpu: RW: 0x1 When memory is allocated by kfd, no one triggers the tlb flush for MMHUB0. There is page fault from MMHUB0. Signed-off-by: Ruili Ji Change-Id: I97786f02849dd047703d6e8feff53916b307715c --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1d0c9762ebfb..12fc822c0a92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -739,7 +739,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev, { bool all_hub = false; - if (adev->family == AMDGPU_FAMILY_AI) + if (adev->family == AMDGPU_FAMILY_AI + || adev->family == AMDGPU_FAMILY_RV) all_hub = true; return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); -- 2.25.1
RE: [PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address
[AMD Official Use Only] Hi Paul, This is not related to any issue. Kind regards, Ruili -Original Message- From: Paul Menzel Sent: 2022年3月29日 16:16 To: Ji, Ruili Cc: amd-gfx@lists.freedesktop.org; Zhang, Yifan ; Liu, Aaron ; Liang, Prike ; Huang, Ray ; Deucher, Alexander ; Ji, Ruili Subject: Re: [PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address [CAUTION: External Email] Dear Ruili, Thank you for your patch. Am 28.03.22 um 06:58 schrieb Ji, Ruili: > From: Ruili Ji > > gfx10.3.3/gfx10.3.6/gfx10.3.7 shall use 0x1580 address for > GCR_GENERAL_CNTL Is any “user-visible“ problem fixed by this? Please add a Fixes tag. Kind regards, Paul > Signed-off-by: Ruili Ji > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 99df18ae7316..e4c9d92ac381 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden > golden_settings_gc_10_3_3[] = > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242), > - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, > 0x0500), > + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, > + 0x1ff1, 0x0500), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, > 0x32103210), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, > 0x32103210), @@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden > golden_settings_gc_10_3_6[] = > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042), > - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 
0x1ff1, > 0x0500), > + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, > + 0x1ff1, 0x0500), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, > 0x32103210), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, > 0x32103210), @@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden > golden_settings_gc_10_3_7[] = { > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041), > - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, > 0x0500), > + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, > + 0x1ff1, 0x0500), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, > 0x32103210), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, > 0x32103210),
[PATCH V2] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address
From: Ruili Ji gfx10.3.3/gfx10.3.6/gfx10.3.7 shall use 0x1580 address for GCR_GENERAL_CNTL Signed-off-by: Ruili Ji --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 99df18ae7316..e4c9d92ac381 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_3[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), @@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_6[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), @@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_7[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 
0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), -- 2.25.1
[PATCH] drm/amdgpu: fix incorrect GCR_GENERAL_CNTL address
From: Ruili Ji RMB shall use 0x1580 address for GCR_GENERAL_CNTL Signed-off-by: Ruili Ji Change-Id: I10a85891986f31411f85fa3db46970aaa8a5bd03 --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 99df18ae7316..e4c9d92ac381 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3300,7 +3300,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_3[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0242), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), @@ -3436,7 +3436,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_6[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0042), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL, 0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x0044), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), @@ -3461,7 +3461,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_7[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGB_ADDR_CONFIG, 0x0c1807ff, 0x0041), - SOC15_REG_GOLDEN_VALUE(GC, 
0, mmGCR_GENERAL_CNTL, 0x1ff1, 0x0500), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Vangogh, 0x1ff1, 0x0500), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL1_PIPE_STEER, 0x00ff, 0x00e4), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_0, 0x, 0x32103210), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2_PIPE_STEER_1, 0x, 0x32103210), -- 2.25.1
RE: [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process exit
[AMD Official Use Only] sudo ./kfdtest --gtest_filter=KFDSVM* sudo ./kfdtest Test results are pass. Tested-by: Ruili Ji -Original Message- From: Yang, Philip Sent: 2022年1月20日 0:23 To: amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Ji, Ruili ; Yang, Philip Subject: [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process exit kfd_process_notifier_release flush svm_range_restore_work which calls svm_range_list_lock_and_flush_work to flush deferred_list work, but if deferred_list work mmput release the last user, it will call exit_mmap -> notifier_release, it is deadlock with below backtrace. Move flush svm_range_restore_work to kfd_process_wq_release to avoid deadlock. Then svm_range_restore_work take task->mm ref to avoid mm is gone while validating and mapping ranges to GPU. Workqueue: events svm_range_deferred_list_work [amdgpu] Call Trace: wait_for_completion+0x94/0x100 __flush_work+0x12a/0x1e0 __cancel_work_timer+0x10e/0x190 cancel_delayed_work_sync+0x13/0x20 kfd_process_notifier_release+0x98/0x2a0 [amdgpu] __mmu_notifier_release+0x74/0x1f0 exit_mmap+0x170/0x200 mmput+0x5d/0x130 svm_range_deferred_list_work+0x104/0x230 [amdgpu] process_one_work+0x220/0x3c0 Signed-off-by: Philip Yang Reported-by: Ruili Ji Tested-by: Ruili Ji --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 15 +-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d1145da5348f..74f162887d3b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1150,7 +1150,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, cancel_delayed_work_sync(>eviction_work); cancel_delayed_work_sync(>restore_work); - cancel_delayed_work_sync(>svms.restore_work); mutex_lock(>mutex); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 
9ec195e1ef23..2d2cae05dbea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1643,13 +1643,14 @@ static void svm_range_restore_work(struct work_struct *work) pr_debug("restore svm ranges\n"); - /* kfd_process_notifier_release destroys this worker thread. So during -* the lifetime of this thread, kfd_process and mm will be valid. -*/ p = container_of(svms, struct kfd_process, svms); - mm = p->mm; - if (!mm) + + /* Keep mm reference when svm_range_validate_and_map ranges */ + mm = get_task_mm(p->lead_thread); + if (!mm) { + pr_debug("svms 0x%p process mm gone\n", svms); return; + } svm_range_list_lock_and_flush_work(svms, mm); mutex_lock(>lock); @@ -1703,6 +1704,7 @@ static void svm_range_restore_work(struct work_struct *work) out_reschedule: mutex_unlock(>lock); mmap_write_unlock(mm); + mmput(mm); /* If validation failed, reschedule another attempt */ if (evicted_ranges) { @@ -2837,6 +2839,8 @@ void svm_range_list_fini(struct kfd_process *p) pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, >svms); + cancel_delayed_work_sync(>svms.restore_work); + /* Ensure list work is finished before process is destroyed */ flush_work(>svms.deferred_list_work); @@ -2847,7 +2851,6 @@ void svm_range_list_fini(struct kfd_process *p) atomic_inc(>svms.drain_pagefaults); svm_range_drain_retry_fault(>svms); - list_for_each_entry_safe(prange, next, >svms.list, list) { svm_range_unlink(prange); svm_range_remove_notifier(prange); -- 2.17.1
RE: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone
[AMD Official Use Only] sudo ./kfdtest --gtest_filter=KFDSVM* sudo ./kfdtest Test results are pass. Tested-by: Ruili Ji -Original Message- From: Yang, Philip Sent: 2022年1月20日 0:23 To: amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Ji, Ruili ; Yang, Philip Subject: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone After mm is removed from task->mm, deferred_list work should continue to handle deferred_range_list which maybe split to child range to avoid child range leak, and remove ranges mmu interval notifier to avoid mm mm_count leak, but skip updating notifier and inserting new notifier. Signed-off-by: Philip Yang Reported-by: Ruili Ji Tested-by: Ruili Ji --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 41 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f2805ba74c80..9ec195e1ef23 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1985,10 +1985,9 @@ svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, } static void -svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) +svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, +struct mm_struct *mm) { - struct mm_struct *mm = prange->work_item.mm; - switch (prange->work_item.op) { case SVM_OP_NULL: pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", @@ -2004,25 +2003,29 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) case SVM_OP_UPDATE_RANGE_NOTIFIER: pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last); - svm_range_update_notifier_and_interval_tree(mm, prange); + if (mm) + svm_range_update_notifier_and_interval_tree(mm, prange); break; case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last); - 
svm_range_update_notifier_and_interval_tree(mm, prange); + if (mm) + svm_range_update_notifier_and_interval_tree(mm, prange); /* TODO: implement deferred validation and mapping */ break; case SVM_OP_ADD_RANGE: pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last); svm_range_add_to_svms(prange); - svm_range_add_notifier_locked(mm, prange); + if (mm) + svm_range_add_notifier_locked(mm, prange); break; case SVM_OP_ADD_RANGE_AND_MAP: pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last); svm_range_add_to_svms(prange); - svm_range_add_notifier_locked(mm, prange); + if (mm) + svm_range_add_notifier_locked(mm, prange); /* TODO: implement deferred validation and mapping */ break; default: @@ -2071,20 +2074,22 @@ static void svm_range_deferred_list_work(struct work_struct *work) pr_debug("enter svms 0x%p\n", svms); p = container_of(svms, struct kfd_process, svms); - /* Avoid mm is gone when inserting mmu notifier */ + + /* If mm is gone, continue cleanup the deferred_range_list */ mm = get_task_mm(p->lead_thread); - if (!mm) { + if (!mm) pr_debug("svms 0x%p process mm gone\n", svms); - return; - } + retry: - mmap_write_lock(mm); + if (mm) + mmap_write_lock(mm); /* Checking for the need to drain retry faults must be inside * mmap write lock to serialize with munmap notifiers. */ if (unlikely(atomic_read(>drain_pagefaults))) { - mmap_write_unlock(mm); + if (mm) + mmap_write_unlock(mm); svm_range_drain_retry_fault(svms); goto retry; } @@ -2109,19 +2114,21 @@ static void svm_range_deferred_list_work(struct work_struct *work) pr_debug("child prange 0x%p op %d\n", pchild, pchild->work_item.op); list_del_init(>child_list); - svm_range_handle_list_op(svms, pchild); + svm_range_handle_list_op(svms, pchild, mm); } mutex_unlock(>mi