[PATCH] drm/amdgpu: GC_9.4.3 requires at least 280MB TMR
On GC_9.4.3, if atombios reports TMR size less than 280MB, firmware area will be overwritten by driver or user application use. Remove !adev->bios condition since reserve_size is initialized as 0, it'll fail into else if (!reserve_size) condition. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 7c6dd3de1867..fa5721b3139c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1718,7 +1718,7 @@ static int amdgpu_ttm_reserve_tmr(struct amdgpu_device *adev) reserve_size = amdgpu_atomfirmware_get_fw_reserved_fb_size(adev); - if (!adev->bios && adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3)) + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3)) reserve_size = max(reserve_size, (uint32_t)280 << 20); else if (!reserve_size) reserve_size = DISCOVERY_TMR_OFFSET; -- 2.25.1
[PATCH] drm/amdkfd: Remove unused entries in table
Remove unused entries in kfd_device_info table: num_xgmi_sdma_engines and num_sdma_queues_per_engine. They are calculated in kfd_get_num_sdma_engines and kfd_get_num_xgmi_sdma_engines instead. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 58 - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 - 2 files changed, 60 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 3fea47e37c17..e1294fba0c26 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -68,8 +68,6 @@ static const struct kfd_device_info kaveri_device_info = { .supports_cwsr = false, .needs_iommu_device = true, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -87,8 +85,6 @@ static const struct kfd_device_info carrizo_device_info = { .supports_cwsr = true, .needs_iommu_device = true, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -105,8 +101,6 @@ static const struct kfd_device_info raven_device_info = { .supports_cwsr = true, .needs_iommu_device = true, .needs_pci_atomics = true, - .num_sdma_engines = 1, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; #endif @@ -126,8 +120,6 @@ static const struct kfd_device_info hawaii_device_info = { .supports_cwsr = false, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; #endif @@ -145,8 +137,6 @@ static const struct kfd_device_info tonga_device_info = { .supports_cwsr = false, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -163,8 +153,6 @@ static const struct kfd_device_info fiji_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -181,8 +169,6 @@ static const struct kfd_device_info fiji_vf_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -200,8 +186,6 @@ static const struct kfd_device_info polaris10_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -218,8 +202,6 @@ static const struct kfd_device_info polaris10_vf_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -236,8 +218,6 @@ static const struct kfd_device_info polaris11_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -254,8 +234,6 @@ static const struct kfd_device_info polaris12_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -272,8 +250,6 @@ static const struct kfd_device_info vegam_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -290,8 +266,6 @@ static const struct kfd_device_info vega10_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -308,8 +282,6 @@ static const struct kfd_device_info vega10_vf_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2, - .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -326,8 +298,6 @@ static const struct kfd_device_info vega12_device_info = { .supports_cwsr = true, .needs_iommu_device = false, .needs_pci_atomics = false, - .num_sdma_engines = 2
[PATCH v2] drm/amdkfd: Retrieve SDMA numbers from amdgpu
Instead of hard coding the number of sdma engines and the number of sdma_xgmi engines in the device_info table, get the number of toal SDMA instances from amdgpu. The first two engines are sdma engines and the rest are sdma-xgmi engines unless the ASIC doesn't support XGMI. v2: add kfd_ prefix to non static function names Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 20 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 32 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 ++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 +-- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index ce9f4e562bac..3fea47e37c17 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1516,6 +1516,26 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); } +/* kfd_get_num_sdma_engines returns the number of PCIe optimized SDMA and + * kfd_get_num_xgmi_sdma_engines returns the number of XGMI SDMA. + * When the device has more than two engines, we reserve two for PCIe to enable + * full-duplex and the rest are used as XGMI. + */ +unsigned int kfd_get_num_sdma_engines(struct kfd_dev *kdev) +{ + /* If XGMI is not supported, all SDMA engines are PCIe */ + if (!kdev->adev->gmc.xgmi.supported) + return kdev->adev->sdma.num_instances; + + return min(kdev->adev->sdma.num_instances, 2); +} + +unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_dev *kdev) +{ + /* After reserved for PCIe, the rest of engines are XGMI */ + return kdev->adev->sdma.num_instances - kfd_get_num_sdma_engines(kdev); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 62fe28244a80..2af2b3268171 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -99,31 +99,22 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) return dqm->dev->shared_resources.num_pipe_per_mec; } -static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) -{ - return dqm->dev->device_info->num_sdma_engines; -} - -static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm) -{ - return dqm->dev->device_info->num_xgmi_sdma_engines; -} - static unsigned int get_num_all_sdma_engines(struct device_queue_manager *dqm) { - return get_num_sdma_engines(dqm) + get_num_xgmi_sdma_engines(dqm); + return kfd_get_num_sdma_engines(dqm->dev) + + kfd_get_num_xgmi_sdma_engines(dqm->dev); } unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { - return dqm->dev->device_info->num_sdma_engines - * dqm->dev->device_info->num_sdma_queues_per_engine; + return kfd_get_num_sdma_engines(dqm->dev) * + dqm->dev->device_info->num_sdma_queues_per_engine; } unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm) { - return dqm->dev->device_info->num_xgmi_sdma_engines - * dqm->dev->device_info->num_sdma_queues_per_engine; + return kfd_get_num_xgmi_sdma_engines(dqm->dev) * + dqm->dev->device_info->num_sdma_queues_per_engine; } void program_sh_mem_settings(struct device_queue_manager *dqm, @@ -1054,9 +1045,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, dqm->sdma_bitmap &= ~(1ULL << bit); q->sdma_id = bit; q->properties.sdma_engine_id = q->sdma_id % - get_num_sdma_engines(dqm); + kfd_get_num_sdma_engines(dqm->dev); q->properties.sdma_queue_id = q->sdma_id / - get_num_sdma_engines(dqm); + kfd_get_num_sdma_engines(dqm->dev); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { if (dqm->xgmi_sdma_bitmap == 0) { pr_err("No more XGMI SDMA queue to allocate\n"); @@ -1071,10 +1062,11 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, * assumes the first N engines are always * PCIe-optimized ones */ - q->properties.sdma_engine_id = get_num_sdma_engines(dqm) + - q->sdma_id % get_num_xgmi_sdma_engines(dqm); + q->properties.
[PATCH] drm/amdkfd: Retrieve SDMA numbers from amdgpu
Instead of hard coding the number of sdma engines and the number of sdma_xgmi engines in the device_info table, get the number of toal SDMA instances from amdgpu. The first two engines are sdma engines and the rest are sdma-xgmi engines unless the ASIC doesn't support XGMI. v2: Move get_num_*_sdma_engines to global and shared by queues manager and topology. v3: Use gmc.xgmi.supported to justify the SDMA PCIe/XGMI assignment Signed-off-by: Amber Lin Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 20 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 ++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 ++- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index ce9f4e562bac..ec1f6bacb61e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1516,6 +1516,26 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); } +/* get_num_sdma_engines returns the number of PCIe optimized SDMA and + * get_num_xgmi_sdma_engines returns the number of XGMI SDMA. + * When the device has more than two engines, we reserve two for PCIe to enable + * full-duplex and the rest are used as XGMI. + */ +unsigned int get_num_sdma_engines(struct kfd_dev *kdev) +{ + /* If XGMI is not supported, all SDMA engines are PCIe */ + if (!kdev->adev->gmc.xgmi.supported) + return kdev->adev->sdma.num_instances; + + return min(kdev->adev->sdma.num_instances, 2); +} + +unsigned int get_num_xgmi_sdma_engines(struct kfd_dev *kdev) +{ + /* After reserved for PCIe, the rest of engines are XGMI */ + return kdev->adev->sdma.num_instances - get_num_sdma_engines(kdev); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 62fe28244a80..5f2886cf4d7e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -99,31 +99,22 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) return dqm->dev->shared_resources.num_pipe_per_mec; } -static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) -{ - return dqm->dev->device_info->num_sdma_engines; -} - -static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm) -{ - return dqm->dev->device_info->num_xgmi_sdma_engines; -} - static unsigned int get_num_all_sdma_engines(struct device_queue_manager *dqm) { - return get_num_sdma_engines(dqm) + get_num_xgmi_sdma_engines(dqm); + return get_num_sdma_engines(dqm->dev) + + get_num_xgmi_sdma_engines(dqm->dev); } unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { - return dqm->dev->device_info->num_sdma_engines - * dqm->dev->device_info->num_sdma_queues_per_engine; + return get_num_sdma_engines(dqm->dev) * + dqm->dev->device_info->num_sdma_queues_per_engine; } unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm) { - return dqm->dev->device_info->num_xgmi_sdma_engines - * dqm->dev->device_info->num_sdma_queues_per_engine; + return get_num_xgmi_sdma_engines(dqm->dev) * + dqm->dev->device_info->num_sdma_queues_per_engine; } void program_sh_mem_settings(struct device_queue_manager *dqm, @@ -1054,9 +1045,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, dqm->sdma_bitmap &= ~(1ULL << bit); q->sdma_id = bit; q->properties.sdma_engine_id = q->sdma_id % - get_num_sdma_engines(dqm); + get_num_sdma_engines(dqm->dev); q->properties.sdma_queue_id = q->sdma_id / - get_num_sdma_engines(dqm); + get_num_sdma_engines(dqm->dev); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { if (dqm->xgmi_sdma_bitmap == 0) { pr_err("No more XGMI SDMA queue to allocate\n"); @@ -1071,10 +1062,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, * assumes the first N engines are always * PCIe-optimized ones */ - q->properties.sdma_engine_id = get_num_sdma_engines(dqm) + -
Re: [PATCH] drm/amdkfd: CWSR with sw scheduler on Aldebaran and Arcturus
Reviewed-by: Amber Lin On 8/20/21 3:11 PM, Mukul Joshi wrote: Program trap handler settings to enable CWSR with software scheduler on Aldebaran and Arcturus. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h| 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c index a5434b713856..46cd4ee6bafb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c @@ -44,4 +44,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { .get_atc_vmid_pasid_mapping_info = kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, + .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index 6409d6b1b2df..5a7f680bcb3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -305,5 +305,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, - .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy + .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, + .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 154244916727..bcc1cbeb8799 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -882,7 +882,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid, adev->gfx.cu_info.max_waves_per_simd; } -static void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd, +void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd, uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index e64deba8900f..c63591106879 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -65,3 +65,5 @@ void kgd_gfx_v9_set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, uint64_t page_table_base); void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid, int *pasid_wave_cnt, int *max_waves_per_cu); +void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd, + uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
[PATCH] drm/amdkfd: Fix circular lock in nocpsch path
Calling free_mqd inside of destroy_queue_nocpsch_locked can cause a circular lock. destroy_queue_nocpsch_locked is called under a DQM lock, which is taken in MMU notifiers, potentially in FS reclaim context. Taking another lock, which is BO reservation lock from free_mqd, while causing an FS reclaim inside the DQM lock creates a problematic circular lock dependency. Therefore move free_mqd out of destroy_queue_nocpsch_locked and call it after unlocking DQM. Signed-off-by: Amber Lin Reviewed-by: Felix Kuehling --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 72bea5278add..c069fa259b30 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; - - mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); - list_del(>list); if (list_empty(>queues_list)) { if (qpd->reset_wavefronts) { @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, int retval; uint64_t sdma_val = 0; struct kfd_process_device *pdd = qpd_to_pdd(qpd); + struct mqd_manager *mqd_mgr = + dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)]; /* Get the SDMA queue stats */ if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, pdd->sdma_past_activity_counter += sdma_val; dqm_unlock(dqm); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + return retval; } @@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, static int process_termination_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { - struct queue *q, *next; + struct queue *q; struct device_process_node *cur, *next_dpn; int retval = 0; bool found = false; @@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, dqm_lock(dqm); /* Clear all user mode queues */ - list_for_each_entry_safe(q, next, >queues_list, list) { + while (!list_empty(>queues_list)) { + struct mqd_manager *mqd_mgr; int ret; + q = list_first_entry(>queues_list, struct queue, list); + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; ret = destroy_queue_nocpsch_locked(dqm, qpd, q); if (ret) retval = ret; + dqm_unlock(dqm); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + dqm_lock(dqm); } /* Unregister process */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: Avoid null pointer in SMI event
Power Management IP is initialized/enabled before KFD init. When a thermal throttling happens before kfd_smi_init is done, calling the KFD SMI update function causes a stack dump by referring a NULL pointer ( smi_clients list). Check if kfd_init is completed before calling the function. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 72c893fff61a..3cd46d7190b3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1297,7 +1297,7 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) { - if (kfd) + if (kfd && kfd->init_complete) kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
On 2020-07-23 5:41 p.m., Joshi, Mukul wrote: [AMD Official Use Only - Internal Distribution Only] -Original Message- From: Lin, Amber Sent: Thursday, July 23, 2020 5:27 PM To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event On 2020-07-22 12:08 p.m., Mukul Joshi wrote: Add support for reporting thermal throttling events through SMI. Also, add a counter to count the number of throttling interrupts observed and report the count in the SMI event message. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 4 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 1 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 68 ++- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 2 + drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 1 + drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 1 + .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h| 1 + drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 5 ++ include/uapi/linux/kfd_ioctl.h| 3 +- 10 files changed, 75 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..19e4658756d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) { } + +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t +throttle_bitmask) { } #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3f2b695cf19e..e8b0258aae24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t +throttle_bitmask); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 4bfedaab183f..d5e790f046b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -29,6 +29,7 @@ #include "cwsr_trap_handler.h" #include "kfd_iommu.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" #define MQD_SIZE_ALIGNED 768 @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t +throttle_bitmask) { + if (kfd) + kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); } + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 7b348bf9df21..00c90b47155b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -24,6 +24,7 @@ #include #include #include +#include "amdgpu.h" #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) return 0; } +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event, + char *event_msg, int len) +{ + struct kfd_smi_client *client; + + rcu_read_lock(); + + list_for_each_entry_rcu(client, >smi_clients, list) { + if (!(READ_ONCE(client->events) & smi_event)) + continue; + spin_lock(>lock); + if (kfifo_avail(>fifo) >= len) { + kfifo_in(>fifo, event_msg, len); + wake_up_all(>wait_queue); + } else { + pr_debug("smi_event(EventID: %llu): no space left\n", + smi_event); + } + spin_unlock(>lock); + } + + rcu_read_unlock(); +} + +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, +uint32_t throttle_bitmask) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; + /* +* ThermalThrottle msg = gpu_id(4):throttle_bitmask(4): gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU. I agree with you. But I
Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
On 2020-07-22 12:08 p.m., Mukul Joshi wrote: Add support for reporting thermal throttling events through SMI. Also, add a counter to count the number of throttling interrupts observed and report the count in the SMI event message. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 4 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 1 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 68 ++- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 2 + drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 1 + drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 1 + .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h| 1 + drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 5 ++ include/uapi/linux/kfd_ioctl.h| 3 +- 10 files changed, 75 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..19e4658756d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) { } + +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) +{ +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3f2b695cf19e..e8b0258aae24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 4bfedaab183f..d5e790f046b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -29,6 +29,7 @@ #include "cwsr_trap_handler.h" #include "kfd_iommu.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" #define MQD_SIZE_ALIGNED 768 @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) +{ + if (kfd) + kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 7b348bf9df21..00c90b47155b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -24,6 +24,7 @@ #include #include #include +#include "amdgpu.h" #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) return 0; } +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event, + char *event_msg, int len) +{ + struct kfd_smi_client *client; + + rcu_read_lock(); + + list_for_each_entry_rcu(client, >smi_clients, list) { + if (!(READ_ONCE(client->events) & smi_event)) + continue; + spin_lock(>lock); + if (kfifo_avail(>fifo) >= len) { + kfifo_in(>fifo, event_msg, len); + wake_up_all(>wait_queue); + } else { + pr_debug("smi_event(EventID: %llu): no space left\n", + smi_event); + } + spin_unlock(>lock); + } + + rcu_read_unlock(); +} + +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, +uint32_t throttle_bitmask) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; + /* +* ThermalThrottle msg = gpu_id(4):throttle_bitmask(4): gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU. +* thermal_interrupt_count(8): +* 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : + +* 4 byte throttle_bitmask + 1 byte : + +* 8 byte thermal_interupt_counter + 1 byte \n = 36 +*/ + char fifo_in[36]; + int len; + + if (list_empty(>smi_clients)) + return; + + len =
[PATCH] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 214 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 +++ include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index cf0017f..e9b96ad 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, >anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..2c030c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(>smi_
[PATCH v7 1/2] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 210 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..2baaaec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, >anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..2c030c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(>smi_
[PATCH v7 2/2] include/uapi/linux: Update KFD ioctl version
Bump KFD ioctl after adding SMI events support Signed-off-by: Amber Lin --- include/uapi/linux/kfd_ioctl.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index ad33c18..46adbcc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -26,8 +26,12 @@ #include #include +/* + * - 1.1 - initial version + * - 1.3 - Add SMI events support + */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 1 +#define KFD_IOCTL_MINOR_VERSION 3 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v6] drm/amdkfd: Provide SMI events watch
On 2020-04-17 6:31 p.m., Felix Kuehling wrote: Am 2020-04-17 um 4:07 p.m. schrieb Amber Lin: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 215 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 +++ include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..2baaaec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, >anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..2c030c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret;
[PATCH v6] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 215 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 +++ include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..2baaaec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, >anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..2c030c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(>smi_
[PATCH v5] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 +++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 191 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..2baaaec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, >anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..2c030c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(>smi_
Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
Thank you Felix. Now I understand the problem of global client ID is leaking a hole for potential attackers. I didn't take that into consideration. I'll change that following your advice below. Hi Alex, Thank you for the link. It's helpful. I have a question regarding the versioning. One topic in the article talks about how the userspace can figure out if the new ioctl is supported in a given kernel. Is it correct that with dkms driver, we use the driver version coming from AMDGPU_VERSION in amdgpu_drv.c, and in upstream kernel we use the kernel version? Thanks. Amber On 2020-04-14 11:03 p.m., Deucher, Alexander wrote: [AMD Public Use] Some good advice on getting ioctls right: https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html Alex *From:* amd-gfx on behalf of Felix Kuehling *Sent:* Tuesday, April 14, 2020 10:40 PM *To:* Lin, Amber ; amd-gfx@lists.freedesktop.org *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch Hi Amber, I understand that different processes can get the same FD. My statement about FD being unique is relative to one process. The main problem with the global client ID is, that it allows process A to change the event mask of process B just by specifying process B's client ID. That can lead to denial of service attacks where process A can cause events not to be delivered to B or can flood process B with frequent events that it's not prepared to handle. Therefore you must make the lookup of the client from the client ID not from a global list, but from a per-process list. That way process A can only change event masks of process A clients, and not those of any other process. But if the client list is process-specific, you can use the FD as a unique identifier of the client within the process, so you don't need a separate client ID. Regards,  Felix Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber: [AMD Official Use Only - Internal Distribution Only] Hi Felix, That was my assumption too that each registration will get different file descriptor, but it turns out not. When I started two process and both register gpu0 and gpu1, they both got fd=15. If I have process A register gpu0+gpu1, and process B only register gpu0, process A gets fd=15 and process B gets fd=9. That’s why I added client ID. By multiple clients, I mean multiple processes. The ask is users want to have multiple tools and those different tools can use rsmi lib to watch events at the same time. Due to the reason above that two processes can actually get the same fd and I need to add client ID to distinguish the registration, I don’t see the point of limiting one registration per process unless I use pid to distinguish the client instead, which was in my consideration too when I was writing the code. But second thought is why adding this restriction when client ID can allow the tool to watch different events on different devices if they want to. Maybe client ID is a bad term and it misleads you. I should call it register ID. Regards, Amber *From:* Kuehling, Felix <mailto:felix.kuehl...@amd.com> *Sent:* Tuesday, April 14, 2020 7:04 PM *To:* Lin, Amber <mailto:amber@amd.com>; amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch Hi Amber, Some general remarks about the multi-client support. You added a global client id that's separate from the file descriptor. That's problematic for two reasons: 1. A process could change a different process' event mask 2. The FD should already be unique per process, no need to invent another ID If we want to allow one process to register for events multiple times (multiple FDs per process), then the list of clients should be per process. Each process should only be allowed to change the event masks of its own clients. The client could be identified by its FD. No need for another client ID. But you could also simplify it further by allowing only one event client per process. Then you don't need the client ID lookup at all. Just have a single event client in the kfd_process. Another approach would be to make enable/disable functions of the event FD, rather than the KFD FD ioctl. It could be an ioctl of the event FD, or even simpler, you could use the write file-operation to write an event mask (of arbitrary length if you want to enable growth in the future). That way everything would be neatly encapsulated in the event FD private data. Two more comments inline ... Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices an
[PATCH v4] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 43 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 3 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 235 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 33 include/uapi/linux/kfd_ioctl.h | 35 +++- 9 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..f13fde59 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + uint32_t *ids_array; + int ret = 0; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t), + GFP_KERNEL); + if (!ids_array) + return -ENOMEM; + if (copy_from_user(ids_array, + (void __user *)args->gpuids_array_ptr, + args->num_gpuids * sizeof(uint32_t))) { + kfree(ids_array); + return -EFAULT; + } + + ret = kfd_smi_event_register(args->num_gpuids, ids_array, +>anon_fd, >client_id); + if (ret) + kfree(ids_array); + + return ret; + + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events */ + return kfd_smi_event_enable(args->client_id, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(args->client_id, args-&
[PATCH v4] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 43 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 3 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 235 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 33 include/uapi/linux/kfd_ioctl.h | 35 +++- 9 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..f13fde59 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + uint32_t *ids_array; + int ret = 0; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t), + GFP_KERNEL); + if (!ids_array) + return -ENOMEM; + if (copy_from_user(ids_array, + (void __user *)args->gpuids_array_ptr, + args->num_gpuids * sizeof(uint32_t))) { + kfree(ids_array); + return -EFAULT; + } + + ret = kfd_smi_event_register(args->num_gpuids, ids_array, +>anon_fd, >client_id); + if (ret) + kfree(ids_array); + + return ret; + + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events */ + return kfd_smi_event_enable(args->client_id, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(args->client_id, args-&
[PATCH v3] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 13 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 174 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 42 ++ include/uapi/linux/kfd_ioctl.h | 32 - 9 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..e1e4115 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + /* register the device */ + return kfd_smi_event_register(dev, >data); + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events to the device */ + return kfd_smi_event_enable(dev, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(dev, args->events); + } + + return -EINVAL; +} + bool kfd_dev_is_large_bar(struct kfd_dev *dev) { struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_devic
Re: [PATCH v2] drm/amdkfd: Provide SMI events watch
Further thinking about it, I'll use struct kfd_smi_msg_header. Instead of using struct kfd_smi_msg_vmfault, it's a description about the event. This way we make it generic to all events. On 2020-04-03 9:38 a.m., Amber Lin wrote: Thanks Felix. I'll make changes accordingly but please pay attention to my last reply inline. On 2020-04-02 7:51 p.m., Felix Kuehling wrote: On 2020-04-02 4:46 p.m., Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h Signed-off-by: Amber Lin ---  drivers/gpu/drm/amd/amdkfd/Makefile |  3 +-  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  1 +  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 ++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31  include/uapi/linux/kfd_ioctl.h  | 30 +++-  9 files changed, 286 insertions(+), 2 deletions(-)  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES   := $(AMDKFD_PATH)/kfd_module.o \  $(AMDKFD_PATH)/kfd_int_process_v9.o \  $(AMDKFD_PATH)/kfd_dbgdev.o \  $(AMDKFD_PATH)/kfd_dbgmgr.o \ -   $(AMDKFD_PATH)/kfd_crat.o +   $(AMDKFD_PATH)/kfd_crat.o \ +   $(AMDKFD_PATH)/kfd_smi_events.o   ifneq ($(CONFIG_AMD_IOMMU_V2),)  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@  #include "kfd_events.h"  #include "cik_int.h"  #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h"   static bool cik_event_interrupt_isr(struct kfd_dev *dev,  const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {  struct kfd_vm_fault_info info;  +   kfd_smi_event_update_vmfault(dev, pasid);  kfd_process_vm_fault(dev->dqm, pasid);   memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@  #include "kfd_device_queue_manager.h"  #include "kfd_dbgmgr.h"  #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h"   static long kfd_ioctl(struct file *, unsigned int, unsigned long);  static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,  return ret;  }  +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, +   struct kfd_process *p, void *data) +{ +   struct kfd_ioctl_smi_events_args *args = data; +   struct kfd_dev *dev; + +   dev = kfd_device_by_id(args->gpu_id); +   if (!dev) +   return -EINVAL; + +   switch (args->op) { +   case KFD_SMI_EVENTS_REGISTER: +   /* register the device */ +   return kfd_smi_event_register(dev, >data); +   case KFD_SMI_EVENTS_ENABLE: +   /* subscribe events to the device */ +   return kfd_smi_event_enable(dev, args->events); +   case KFD_SMI_EVENTS_DISABLE: +   /* unsubscribe events */ +   return kfd_smi_event_disable(dev, args->events); +   } + +   return -EINVAL; +} +  bool kfd_dev_is_large_bar(struct kfd_dev *dev)  {  struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {   AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_
Re: [PATCH v2] drm/amdkfd: Provide SMI events watch
Thanks Felix. I'll make changes accordingly but please pay attention to my last reply inline. On 2020-04-02 7:51 p.m., Felix Kuehling wrote: On 2020-04-02 4:46 p.m., Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h Signed-off-by: Amber Lin ---  drivers/gpu/drm/amd/amdkfd/Makefile |  3 +-  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  1 +  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 ++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31  include/uapi/linux/kfd_ioctl.h  | 30 +++-  9 files changed, 286 insertions(+), 2 deletions(-)  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES   := $(AMDKFD_PATH)/kfd_module.o \  $(AMDKFD_PATH)/kfd_int_process_v9.o \  $(AMDKFD_PATH)/kfd_dbgdev.o \  $(AMDKFD_PATH)/kfd_dbgmgr.o \ -   $(AMDKFD_PATH)/kfd_crat.o +   $(AMDKFD_PATH)/kfd_crat.o \ +   $(AMDKFD_PATH)/kfd_smi_events.o   ifneq ($(CONFIG_AMD_IOMMU_V2),)  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@  #include "kfd_events.h"  #include "cik_int.h"  #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h"   static bool cik_event_interrupt_isr(struct kfd_dev *dev,  const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {  struct kfd_vm_fault_info info;  +   kfd_smi_event_update_vmfault(dev, pasid);  kfd_process_vm_fault(dev->dqm, pasid);   memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@  #include "kfd_device_queue_manager.h"  #include "kfd_dbgmgr.h"  #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h"   static long kfd_ioctl(struct file *, unsigned int, unsigned long);  static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,  return ret;  }  +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, +   struct kfd_process *p, void *data) +{ +   struct kfd_ioctl_smi_events_args *args = data; +   struct kfd_dev *dev; + +   dev = kfd_device_by_id(args->gpu_id); +   if (!dev) +   return -EINVAL; + +   switch (args->op) { +   case KFD_SMI_EVENTS_REGISTER: +   /* register the device */ +   return kfd_smi_event_register(dev, >data); +   case KFD_SMI_EVENTS_ENABLE: +   /* subscribe events to the device */ +   return kfd_smi_event_enable(dev, args->events); +   case KFD_SMI_EVENTS_DISABLE: +   /* unsubscribe events */ +   return kfd_smi_event_disable(dev, args->events); +   } + +   return -EINVAL; +} +  bool kfd_dev_is_large_bar(struct kfd_dev *dev)  {  struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {   AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,  kfd_ioctl_alloc_queue_gws, 0), + +   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, +   kfd_ioctl_smi_events, 0),  };   #define AMDKFD_CORE_IOCTL_COUNT   ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/
[PATCH v2] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 12 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 include/uapi/linux/kfd_ioctl.h | 30 +++- 9 files changed, 286 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + /* register the device */ + return kfd_smi_event_register(dev, >data); + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events to the device */ + return kfd_smi_event_enable(dev, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(dev, args->events); + } + + return -EINVAL; +} + bool kfd_dev_is_large_bar(struct kfd_dev *dev) { struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device
Re: [PATCH] drm/amdkfd: Provide SMI events watch
Thanks Felix for the review. I have a better understanding of how kfifo works now and have changed my code quite a bit. Couple of questions below inline regarding the gpu_id and data arguments. Thanks. Amber On 2020-03-26 4:53 p.m., Felix Kuehling wrote: Hi Amber, I see that this is based on the debugger event code. Jon and I are just working through some issues with that code. The lessons from that will need to be applied to this as well. But I think we can define your API to simplify this a bit. The basic problem is, that we have one Fifo in the kfd_device, but potentially multiple file descriptors referring to it. For the event interface I think we can enforce only a single file descriptor per device. If there is already one, your register call can fail. See more comments inline. On 2020-03-17 13:57, Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's pull function pull -> poll Thank you for spotting the typo. I’ll change that. with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 38 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 10 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 143 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 41 +++ include/uapi/linux/kfd_ioctl.h | 27 - 9 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..8e92956 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + int ret = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + ret = kfd_smi_event_register(dev, args->events); + if (ret >= 0) { +
Re: [PATCH] drm/amdkfd: Provide SMI events watch
Sorry for the messed-up link. This is the link (rocm-smi-lib) which makes use of the interface https://github.com/RadeonOpenCompute/rocm_smi_lib On 2020-03-23 2:19 p.m., Amber Lin wrote: Somehow my reply didn't seem to reach the mailing list... Hi Alex, https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FRadeonOpenCompute%2Frocm_smi_libdata=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423sdata=I%2BVkN3VKYFUiZ0xGW0Yst70rcqrMRXUTcd995RgfRa4%3Dreserved=0 will use this interface. Those functions will be added to this library: /* Get a handler for watching events */ rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle); /* Register events for the device using the handler from init */ rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,    rsmi_event_handle_t *handle); /* Wait for events. If one of the events happens, a success is returned with  * with details in data.  */ rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,    rsmi_event_data_t *data); /* Stop watching events */ rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle); I add the ioctl to /dev/kfd with a debate if it should be in /dev/dri/card* or /dev/dri/renderD* instead. The first event to report is VM fault in this patch. Other events like RAS errors, PCIe errors, GPU reset… etc will be added for the system admin to diagnose the system health. I see this as a system feature so I use /dev/kfd. I’ll like to hear if people think differently. Thanks. Thanks. Amber On 2020-03-17 3:03 p.m., Alex Deucher wrote: On Tue, Mar 17, 2020 at 1:57 PM Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's pull function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. Signed-off-by: Amber Lin Can you provide a link to the userspace tools that make use of this interface? Thanks, Alex ---  drivers/gpu/drm/amd/amdkfd/Makefile |  3 +-  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 38 ++  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  1 +  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  2 +  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 10 ++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 143 +++  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 41 +++  include/uapi/linux/kfd_ioctl.h  | 27 -  9 files changed, 265 insertions(+), 2 deletions(-)  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ -  $(AMDKFD_PATH)/kfd_crat.o +  $(AMDKFD_PATH)/kfd_crat.o \ +  $(AMDKFD_PATH)/kfd_smi_events.o  ifneq ($(CONFIG_AMD_IOMMU_V2),)  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@  #include "kfd_events.h"  #include "cik_int.h"  #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h"  static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; +  kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..8e92956 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +
Re: [PATCH] drm/amdkfd: Provide SMI events watch
Somehow my reply didn't seem to reach the mailing list... Hi Alex, https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. Those functions will be added to this library: /* Get a handler for watching events */ rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle); /* Register events for the device using the handler from init */ rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,    rsmi_event_handle_t *handle); /* Wait for events. If one of the events happens, a success is returned with  * with details in data.  */ rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,    rsmi_event_data_t *data); /* Stop watching events */ rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle); I add the ioctl to /dev/kfd with a debate if it should be in /dev/dri/card* or /dev/dri/renderD* instead. The first event to report is VM fault in this patch. Other events like RAS errors, PCIe errors, GPU reset… etc will be added for the system admin to diagnose the system health. I see this as a system feature so I use /dev/kfd. I’ll like to hear if people think differently. Thanks. Thanks. Amber On 2020-03-17 3:03 p.m., Alex Deucher wrote: On Tue, Mar 17, 2020 at 1:57 PM Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's pull function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. Signed-off-by: Amber Lin Can you provide a link to the userspace tools that make use of this interface? Thanks, Alex --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 38 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 10 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 143 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 41 +++ include/uapi/linux/kfd_ioctl.h | 27 - 9 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..8e92956 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle r
[PATCH] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's pull function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 38 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 10 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 143 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 41 +++ include/uapi/linux/kfd_ioctl.h | 27 - 9 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..8e92956 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + int ret = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + ret = kfd_smi_event_register(dev, args->events); + if (ret >= 0) { + /* When the registration is successful, it returns the +* annoymous inode. Pass it to the user in data1 +*/ + args->data1 = ret; + ret = 0; + } + break; + case KFD_SMI_EVENTS_UNREGISTER: + kfd_smi_event_unregister(dev, args->events); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + bool kfd_dev_is_large_bar(struct kfd_dev *dev) { struct kfd_local_mem_info mem_info; @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCT
[PATCH v3] drm/amdkfd: Add queue information to sysfs
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc. The format is /sys/class/kfd/kfd/proc//queues//XX where XX are size, type, and gpuid three files to represent queue size, queue type, and the GPU this queue uses. folder and files underneath are generated when a queue is created. They are removed when the queue is destroyed. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 90 ++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 + 3 files changed, 99 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c0b0def..f805f55 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -41,6 +41,7 @@ #include #include #include +#include #include "amd_shared.h" @@ -503,6 +504,9 @@ struct queue { struct kfd_process *process; struct kfd_dev *device; void *gws; + + /* procfs */ + struct kobject kobj; }; /* @@ -730,6 +734,7 @@ struct kfd_process { /* Kobj for our procfs */ struct kobject *kobj; + struct kobject *kobj_queues; struct attribute attr_pasid; }; @@ -836,6 +841,8 @@ extern struct device *kfd_device; /* KFD's procfs */ void kfd_procfs_init(void); void kfd_procfs_shutdown(void); +int kfd_procfs_add_queue(struct queue *q); +void kfd_procfs_del_queue(struct queue *q); /* Topology */ int kfd_topology_init(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 25b90f7..98dcbb9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -132,6 +132,88 @@ void kfd_procfs_shutdown(void) } } +static ssize_t kfd_procfs_queue_show(struct kobject *kobj, +struct attribute *attr, char *buffer) +{ + struct queue *q = container_of(kobj, struct queue, kobj); + + if (!strcmp(attr->name, "size")) + return snprintf(buffer, PAGE_SIZE, "%llu", + q->properties.queue_size); + else if (!strcmp(attr->name, "type")) + return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type); + else if (!strcmp(attr->name, "gpuid")) + return snprintf(buffer, PAGE_SIZE, "%u", q->device->id); + else + pr_err("Invalid attribute"); + + return 0; +} + +static struct attribute attr_queue_size = { + .name = "size", + .mode = KFD_SYSFS_FILE_MODE +}; + +static struct attribute attr_queue_type = { + .name = "type", + .mode = KFD_SYSFS_FILE_MODE +}; + +static struct attribute attr_queue_gpuid = { + .name = "gpuid", + .mode = KFD_SYSFS_FILE_MODE +}; + +static struct attribute *procfs_queue_attrs[] = { + _queue_size, + _queue_type, + _queue_gpuid, + NULL +}; + +static const struct sysfs_ops procfs_queue_ops = { + .show = kfd_procfs_queue_show, +}; + +static struct kobj_type procfs_queue_type = { + .sysfs_ops = _queue_ops, + .default_attrs = procfs_queue_attrs, +}; + +int kfd_procfs_add_queue(struct queue *q) +{ + struct kfd_process *proc; + int ret; + + if (!q || !q->process) + return -EINVAL; + proc = q->process; + + /* Create proc//queues/ folder */ + if (!proc->kobj_queues) + return -EFAULT; + ret = kobject_init_and_add(>kobj, _queue_type, + proc->kobj_queues, "%u", q->properties.queue_id); + if (ret < 0) { + pr_warn("Creating proc//queues/%u failed", + q->properties.queue_id); + kobject_put(>kobj); + return ret; + } + + return 0; +} + +void kfd_procfs_del_queue(struct queue *q) +{ + if (!q) + return; + + kobject_del(>kobj); + kobject_put(>kobj); +} + int kfd_process_create_wq(void) { if (!kfd_process_wq) @@ -323,6 +405,11 @@ struct kfd_process *kfd_create_process(struct file *filep) if (ret) pr_warn("Creating pasid for pid %d failed", (int)process->lead_thread->pid); + + process->kobj_queues = kobject_create_and_add("queues", + process->kobj); + if (!process->kobj_queues) + pr_warn("Creating KFD proc/queues folder failed"); } out: if (!IS_ERR(process)) @@ -457,6 +544,9 @@ static void kfd_process_wq_release(struct work_struct *wo
[PATCH v2] drm/amdkfd: Add queue information to sysfs
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc. The format is /sys/class/kfd/kfd/proc//queues//XX where XX are size, type, and gpuid three files to represent queue size, queue type, and the GPU this queue uses. folder and files underneath are generated when a queue is created. They are removed when the queue is destroyed. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 9 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 96 ++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 + 3 files changed, 107 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c0b0def..cb2d2d7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -503,6 +503,12 @@ struct queue { struct kfd_process *process; struct kfd_dev *device; void *gws; + + /* procfs */ + struct kobject *kobj_qid; + struct attribute attr_size; + struct attribute attr_type; + struct attribute attr_gpuid; }; /* @@ -730,6 +736,7 @@ struct kfd_process { /* Kobj for our procfs */ struct kobject *kobj; + struct kobject *kobj_queues; struct attribute attr_pasid; }; @@ -836,6 +843,8 @@ extern struct device *kfd_device; /* KFD's procfs */ void kfd_procfs_init(void); void kfd_procfs_shutdown(void); +int kfd_procfs_add_queue(struct queue *q); +void kfd_procfs_del_queue(struct queue *q); /* Topology */ int kfd_topology_init(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 25b90f7..78ca037 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -132,6 +132,94 @@ void kfd_procfs_shutdown(void) } } +static int kfd_procfs_add_file(const char *name, struct kobject *kobj, + struct attribute *attr) +{ + int ret; + + attr->name = name; + attr->mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(attr); + ret = sysfs_create_file(kobj, attr); + if (ret) + pr_warn("Creating %s file failed", name); + return ret; +} + +static ssize_t kfd_procfs_queue_show(struct kobject *kobj, +struct attribute *attr, char *buffer) +{ + if (!strcmp(attr->name, "size")) { + struct queue *q = container_of(attr, struct queue, attr_size); + return snprintf(buffer, PAGE_SIZE, "%llu", + q->properties.queue_size); + } else if (!strcmp(attr->name, "type")) { + struct queue *q = container_of(attr, struct queue, attr_type); + return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type); + } else if (!strcmp(attr->name, "gpuid")) { + struct queue *q = container_of(attr, struct queue, attr_gpuid); + return snprintf(buffer, PAGE_SIZE, "%u", q->device->id); + } else + pr_err("Invalid attribute"); + + return 0; +} + +static const struct sysfs_ops procfs_queue_ops = { + .show = kfd_procfs_queue_show, +}; + +static struct kobj_type procfs_queue_type = { + .release = kfd_procfs_kobj_release, + .sysfs_ops = _queue_ops, +}; + +int kfd_procfs_add_queue(struct queue *q) +{ + struct kfd_process *proc; + int ret; + + if (!q || !q->process) + return -EINVAL; + proc = q->process; + + /* Create proc//queues/ folder*/ + if (!proc->kobj_queues) + return -EFAULT; + if (q->kobj_qid) + return -EEXIST; + q->kobj_qid = kfd_alloc_struct(q->kobj_qid); + if (!q->kobj_qid) + return -ENOMEM; + ret = kobject_init_and_add(q->kobj_qid, _queue_type, + proc->kobj_queues, "%u", q->properties.queue_id); + if (ret < 0) { + pr_warn("Creating proc//queues/%u failed", + q->properties.queue_id); + return ret; + } + + /* Create proc//queues//XX files */ + kfd_procfs_add_file("size", q->kobj_qid, >attr_size); + kfd_procfs_add_file("type", q->kobj_qid, >attr_type); + kfd_procfs_add_file("gpuid", q->kobj_qid, >attr_gpuid); + + return 0; +} + +void kfd_procfs_del_queue(struct queue *q) +{ + if (!q || !q->process) + return; + + sysfs_remove_file(q->kobj_qid, >attr_size); + sysfs_remove_file(q->kobj_qid, >attr_type); + sysfs_remove_file(q->kobj_qid, >attr_gpuid); + kobject_del(q->kobj_qid); + kobject_put(q->kobj_qid); + q-&g
[PATCH] drm/amdkfd: Add queue information to sysfs
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc. The format is /sys/class/kfd/kfd/proc//queues//XX where XX are size, type, and gpuid three files to represent queue size, queue type, and the GPU this queue uses. folder and files underneath are generated when a queue is created. They are removed when the queue is destroyed. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 9 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 99 ++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 + 3 files changed, 110 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c0b0def..cb2d2d7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -503,6 +503,12 @@ struct queue { struct kfd_process *process; struct kfd_dev *device; void *gws; + + /* procfs */ + struct kobject *kobj_qid; + struct attribute attr_size; + struct attribute attr_type; + struct attribute attr_gpuid; }; /* @@ -730,6 +736,7 @@ struct kfd_process { /* Kobj for our procfs */ struct kobject *kobj; + struct kobject *kobj_queues; struct attribute attr_pasid; }; @@ -836,6 +843,8 @@ extern struct device *kfd_device; /* KFD's procfs */ void kfd_procfs_init(void); void kfd_procfs_shutdown(void); +int kfd_procfs_add_queue(struct queue *q); +void kfd_procfs_del_queue(struct queue *q); /* Topology */ int kfd_topology_init(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 25b90f7..0220651 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -132,6 +132,97 @@ void kfd_procfs_shutdown(void) } } +static int kfd_procfs_add_file(const char *name, struct kobject *kobj, + struct attribute *attr) +{ + int ret; + + attr->name = name; + attr->mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(attr); + ret = sysfs_create_file(kobj, attr); + if (ret) + pr_warn("Creating %s file failed", name); + return ret; +} + +static ssize_t kfd_procfs_queue_show(struct kobject *kobj, +struct attribute *attr, char *buffer) +{ + if (!strcmp(attr->name, "size")) { + struct queue *q = container_of(attr, struct queue, attr_size); + return snprintf(buffer, PAGE_SIZE, "%llu", + q->properties.queue_size); + } else if (!strcmp(attr->name, "type")) { + struct queue *q = container_of(attr, struct queue, attr_type); + return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type); + } else if (!strcmp(attr->name, "gpuid")) { + struct queue *q = container_of(attr, struct queue, attr_gpuid); + return snprintf(buffer, PAGE_SIZE, "%u", q->device->id); + } else + pr_err("Invalid attribute"); + + return 0; +} + +static const struct sysfs_ops procfs_queue_ops = { + .show = kfd_procfs_queue_show, +}; + +static struct kobj_type procfs_queue_type = { + .release = kfd_procfs_kobj_release, + .sysfs_ops = _queue_ops, +}; + +int kfd_procfs_add_queue(struct queue *q) +{ + struct kfd_process *proc; + int ret; + + if (!q || !q->process) + return -EINVAL; + proc = q->process; + + /* Create proc//queues/ folder*/ + if (!proc->kobj_queues) + return -EFAULT; + if (q->kobj_qid) + return -EEXIST; + q->kobj_qid = kfd_alloc_struct(q->kobj_qid); + if (!q->kobj_qid) + return -ENOMEM; + ret = kobject_init_and_add(q->kobj_qid, _queue_type, + proc->kobj_queues, "%u", q->properties.queue_id); + if (ret < 0) { + pr_warn("Creating proc//queues/%u failed", + q->properties.queue_id); + return ret; + } + + /* Create proc//queues//XX files */ + kfd_procfs_add_file("size", q->kobj_qid, >attr_size); + kfd_procfs_add_file("type", q->kobj_qid, >attr_type); + kfd_procfs_add_file("gpuid", q->kobj_qid, >attr_gpuid); + + return 0; +} + +void kfd_procfs_del_queue(struct queue *q) +{ + struct kfd_process *proc; + + if (!q || !q->process) + return; + proc = q->process; + + sysfs_remove_file(q->kobj_qid, >attr_size); + sysfs_remove_file(q->kobj_qid, >attr_type); + sysfs_remove_file(q->kobj_qid, >attr_gpuid); + k
[PATCH] drm/amdgpu: Relocate some definitions
Move some KFD-related (but used in amdgpu_drv.c) definitions from kfd_priv.h to kgd_kfd_interface.h so we don't need to include kfd_priv.h in amdgpu_drv.c. This fixes a build failure when AMDGPU is enabled but MMU_NOTIFIER is not. This patch also disables KFD-related module options when HSA_AMD is not enabled. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 20 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 28 - drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 28 + 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index dd6d8b1..e4d0d72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -39,7 +39,6 @@ #include "amdgpu_gem.h" #include "amdgpu_amdkfd.h" -#include "kfd_priv.h" /* * KMS wrapper. @@ -128,15 +127,6 @@ int amdgpu_compute_multipipe = -1; int amdgpu_gpu_recovery = -1; /* auto */ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; -/* KFD parameters */ -int sched_policy = KFD_SCHED_POLICY_HWS; -int hws_max_conc_proc = 8; -int cwsr_enable = 1; -int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; -int send_sigterm; -int debug_largebar; -int ignore_crat; -int vega10_noretry; /** * DOC: vramlimit (int) @@ -542,12 +532,14 @@ MODULE_PARM_DESC(smu_memory_pool_size, "0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte"); module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444); +#ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) * Set scheduling policy. Default is HWS(hardware scheduling) with over-subscription. * Setting 1 disables over-subscription. Setting 2 disables HWS and statically * assigns queues to HQDs. */ +int sched_policy = KFD_SCHED_POLICY_HWS; module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); @@ -557,6 +549,7 @@ MODULE_PARM_DESC(sched_policy, * Maximum number of processes that HWS can schedule concurrently. The maximum is the * number of VMIDs assigned to the HWS, which is also the default. */ +int hws_max_conc_proc = 8; module_param(hws_max_conc_proc, int, 0444); MODULE_PARM_DESC(hws_max_conc_proc, "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); @@ -567,6 +560,7 @@ MODULE_PARM_DESC(hws_max_conc_proc, * the middle of a compute wave. Default is 1 to enable this feature. Setting 0 * disables it. */ +int cwsr_enable = 1; module_param(cwsr_enable, int, 0444); MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); @@ -575,6 +569,7 @@ MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); * Maximum number of queues per device. Valid setting is between 1 and 4096. Default * is 4096. */ +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); @@ -584,6 +579,7 @@ MODULE_PARM_DESC(max_num_of_queues_per_device, * Send sigterm to HSA process on unhandled exceptions. Default is not to send sigterm * but just print errors on dmesg. Setting 1 enables sending sigterm. */ +int send_sigterm; module_param(send_sigterm, int, 0444); MODULE_PARM_DESC(send_sigterm, "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); @@ -595,6 +591,7 @@ MODULE_PARM_DESC(send_sigterm, * size, usually 256MB. * Default value is 0, diabled. */ +int debug_largebar; module_param(debug_largebar, int, 0444); MODULE_PARM_DESC(debug_largebar, "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); @@ -605,6 +602,7 @@ MODULE_PARM_DESC(debug_largebar, * table to get information about AMD APUs. This option can serve as a workaround on * systems with a broken CRAT table. */ +int ignore_crat; module_param(ignore_crat, int, 0444); MODULE_PARM_DESC(ignore_crat, "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); @@ -615,9 +613,11 @@ MODULE_PARM_DESC(ignore_crat, * Setting 1 disables retry. * Retry is needed for recoverable page faults. */ +int vega10_noretry; module_param_named(noretry, vega10_noretry, int, 0644); MODULE_PARM_DESC(noretry, "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); +#endif static const st
[PATCH] drm/amdgpu: Relocate some definitions
Move some KFD-related (but used in amdgpu_drv.c) definitions from kfd_priv.h to kgd_kfd_interface.h so we don't need to include kfd_priv.h in amdgpu_drv.c. This fixes a build failure when AMDGPU is enabled but MMU_NOTIFIER is not. This patch also disables KFD-related module options when HSA_AMD is not enabled. Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 20 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 28 - drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 28 + 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index dd6d8b1..e4d0d72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -39,7 +39,6 @@ #include "amdgpu_gem.h" #include "amdgpu_amdkfd.h" -#include "kfd_priv.h" /* * KMS wrapper. @@ -128,15 +127,6 @@ int amdgpu_compute_multipipe = -1; int amdgpu_gpu_recovery = -1; /* auto */ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; -/* KFD parameters */ -int sched_policy = KFD_SCHED_POLICY_HWS; -int hws_max_conc_proc = 8; -int cwsr_enable = 1; -int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; -int send_sigterm; -int debug_largebar; -int ignore_crat; -int vega10_noretry; /** * DOC: vramlimit (int) @@ -542,12 +532,14 @@ MODULE_PARM_DESC(smu_memory_pool_size, "0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte"); module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444); +#ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) * Set scheduling policy. Default is HWS(hardware scheduling) with over-subscription. * Setting 1 disables over-subscription. Setting 2 disables HWS and statically * assigns queues to HQDs. */ +int sched_policy = KFD_SCHED_POLICY_HWS; module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); @@ -557,6 +549,7 @@ MODULE_PARM_DESC(sched_policy, * Maximum number of processes that HWS can schedule concurrently. The maximum is the * number of VMIDs assigned to the HWS, which is also the default. */ +int hws_max_conc_proc = 8; module_param(hws_max_conc_proc, int, 0444); MODULE_PARM_DESC(hws_max_conc_proc, "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); @@ -567,6 +560,7 @@ MODULE_PARM_DESC(hws_max_conc_proc, * the middle of a compute wave. Default is 1 to enable this feature. Setting 0 * disables it. */ +int cwsr_enable = 1; module_param(cwsr_enable, int, 0444); MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); @@ -575,6 +569,7 @@ MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); * Maximum number of queues per device. Valid setting is between 1 and 4096. Default * is 4096. */ +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); @@ -584,6 +579,7 @@ MODULE_PARM_DESC(max_num_of_queues_per_device, * Send sigterm to HSA process on unhandled exceptions. Default is not to send sigterm * but just print errors on dmesg. Setting 1 enables sending sigterm. */ +int send_sigterm; module_param(send_sigterm, int, 0444); MODULE_PARM_DESC(send_sigterm, "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); @@ -595,6 +591,7 @@ MODULE_PARM_DESC(send_sigterm, * size, usually 256MB. * Default value is 0, diabled. */ +int debug_largebar; module_param(debug_largebar, int, 0444); MODULE_PARM_DESC(debug_largebar, "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); @@ -605,6 +602,7 @@ MODULE_PARM_DESC(debug_largebar, * table to get information about AMD APUs. This option can serve as a workaround on * systems with a broken CRAT table. */ +int ignore_crat; module_param(ignore_crat, int, 0444); MODULE_PARM_DESC(ignore_crat, "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); @@ -615,9 +613,11 @@ MODULE_PARM_DESC(ignore_crat, * Setting 1 disables retry. * Retry is needed for recoverable page faults. */ +int vega10_noretry; module_param_named(noretry, vega10_noretry, int, 0644); MODULE_PARM_DESC(noretry, "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); +#endif static const st
[PATCH v2] drm/amdgpu: Move KFD parameters to amdgpu
After merging KFD into amdgpu, move module parameters defined in KFD to amdgpu_drv.c, where other module parameters are declared. v2: add kernel-doc comments Change-Id: I2de8d6c96bb49554c028bbc84bdb194f974c9278 Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 87 + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 40 --- 2 files changed, 87 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2221f6b..dd6d8b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -39,6 +39,7 @@ #include "amdgpu_gem.h" #include "amdgpu_amdkfd.h" +#include "kfd_priv.h" /* * KMS wrapper. @@ -127,6 +128,15 @@ int amdgpu_compute_multipipe = -1; int amdgpu_gpu_recovery = -1; /* auto */ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; +/* KFD parameters */ +int sched_policy = KFD_SCHED_POLICY_HWS; +int hws_max_conc_proc = 8; +int cwsr_enable = 1; +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; +int send_sigterm; +int debug_largebar; +int ignore_crat; +int vega10_noretry; /** * DOC: vramlimit (int) @@ -532,6 +542,83 @@ MODULE_PARM_DESC(smu_memory_pool_size, "0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte"); module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444); +/** + * DOC: sched_policy (int) + * Set scheduling policy. Default is HWS(hardware scheduling) with over-subscription. + * Setting 1 disables over-subscription. Setting 2 disables HWS and statically + * assigns queues to HQDs. + */ +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +/** + * DOC: hws_max_conc_proc (int) + * Maximum number of processes that HWS can schedule concurrently. The maximum is the + * number of VMIDs assigned to the HWS, which is also the default. + */ +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +/** + * DOC: cwsr_enable (int) + * CWSR(compute wave store and resume) allows the GPU to preempt shader execution in + * the middle of a compute wave. Default is 1 to enable this feature. Setting 0 + * disables it. + */ +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); + +/** + * DOC: max_num_of_queues_per_device (int) + * Maximum number of queues per device. Valid setting is between 1 and 4096. Default + * is 4096. + */ +module_param(max_num_of_queues_per_device, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_device, + "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); + +/** + * DOC: send_sigterm (int) + * Send sigterm to HSA process on unhandled exceptions. Default is not to send sigterm + * but just print errors on dmesg. Setting 1 enables sending sigterm. + */ +module_param(send_sigterm, int, 0444); +MODULE_PARM_DESC(send_sigterm, + "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); + +/** + * DOC: debug_largebar (int) + * Set debug_largebar as 1 to enable simulating large-bar capability on non-large bar + * system. This limits the VRAM size reported to ROCm applications to the visible + * size, usually 256MB. + * Default value is 0, diabled. + */ +module_param(debug_largebar, int, 0444); +MODULE_PARM_DESC(debug_largebar, + "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); + +/** + * DOC: ignore_crat (int) + * Ignore CRAT table during KFD initialization. By default, KFD uses the ACPI CRAT + * table to get information about AMD APUs. This option can serve as a workaround on + * systems with a broken CRAT table. + */ +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + +/** + * DOC: noretry (int) + * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry. + * Setting 1 disables retry. + * Retry is needed for recoverable page faults. + */ +module_param_named(noretry, vega10_noretry, int, 0644); +MODULE_PARM_DESC(noretry, + "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/a
[PATCH v2] drm/amdgpu: Merge amdkfd into amdgpu
Since KFD is only supported by single GPU driver, it makes sense to merge amdgpu and amdkfd into one module. This patch is the initial step: merge Kconfig and Makefile. v2: also remove kfd from drm Kconfig Change-Id: I21c996ba29d393c1bf8064bdb2f5d89541159649 Signed-off-by: Amber Lin --- drivers/gpu/drm/Kconfig | 2 - drivers/gpu/drm/amd/amdgpu/Kconfig | 1 + drivers/gpu/drm/amd/amdgpu/Makefile | 6 ++- drivers/gpu/drm/amd/amdkfd/Kconfig | 2 +- drivers/gpu/drm/amd/amdkfd/Makefile | 53 ++- drivers/gpu/drm/amd/amdkfd/kfd_module.c | 76 ++--- 6 files changed, 63 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 2a72d2f..5ea1ac3 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -264,8 +264,6 @@ source "drivers/gpu/drm/bridge/Kconfig" source "drivers/gpu/drm/sti/Kconfig" -source "drivers/gpu/drm/amd/amdkfd/Kconfig" - source "drivers/gpu/drm/imx/Kconfig" source "drivers/gpu/drm/v3d/Kconfig" diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig index e8af1f5..9221e54 100644 --- a/drivers/gpu/drm/amd/amdgpu/Kconfig +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig @@ -42,3 +42,4 @@ config DRM_AMDGPU_GART_DEBUGFS source "drivers/gpu/drm/amd/acp/Kconfig" source "drivers/gpu/drm/amd/display/Kconfig" +source "drivers/gpu/drm/amd/amdkfd/Kconfig" diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index d2bafab..847536b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -35,7 +35,8 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \ -I$(FULL_AMD_DISPLAY_PATH) \ -I$(FULL_AMD_DISPLAY_PATH)/include \ -I$(FULL_AMD_DISPLAY_PATH)/dc \ - -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm + -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm \ + -I$(FULL_AMD_PATH)/amdkfd amdgpu-y := amdgpu_drv.o @@ -136,6 +137,9 @@ amdgpu-y += \ amdgpu-y += amdgpu_amdkfd.o ifneq ($(CONFIG_HSA_AMD),) +AMDKFD_PATH := ../amdkfd +include $(FULL_AMD_PATH)/amdkfd/Makefile +amdgpu-y += $(AMDKFD_FILES) amdgpu-y += \ amdgpu_amdkfd_fence.o \ amdgpu_amdkfd_gpuvm.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index 3858820..fbf0ee5 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -3,7 +3,7 @@ # config HSA_AMD - tristate "HSA kernel driver for AMD GPU devices" + bool "HSA kernel driver for AMD GPU devices" depends on DRM_AMDGPU && X86_64 imply AMD_IOMMU_V2 select MMU_NOTIFIER diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index ffd096f..69ec969 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -23,26 +23,41 @@ # Makefile for Heterogenous System Architecture support for AMD GPU devices # -ccflags-y := -Idrivers/gpu/drm/amd/include/ \ - -Idrivers/gpu/drm/amd/include/asic_reg - -amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ - kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ - kfd_process.o kfd_queue.o kfd_mqd_manager.o \ - kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ - kfd_mqd_manager_v9.o \ - kfd_kernel_queue.o kfd_kernel_queue_cik.o \ - kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ - kfd_packet_manager.o kfd_process_queue_manager.o \ - kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ - kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ - kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ - kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o +AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ + $(AMDKFD_PATH)/kfd_device.o \ + $(AMDKFD_PATH)/kfd_chardev.o \ + $(AMDKFD_PATH)/kfd_topology.o \ + $(AMDKFD_PATH)/kfd_pasid.o \ + $(AMDKFD_PATH)/kfd_doorbell.o \ + $(AMDKFD_PATH)/kfd_flat_memory.o \ + $(AMDKFD_PATH)/kfd_process.o \ + $(AMDKFD_PATH)/kfd_queue.o \ + $(AMDKFD_PATH)/kfd_mqd_manager.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ + $(AMDKFD_PATH)/kfd_kernel_queue.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \ + $(AMDKFD_PATH)/kfd_packet_manager.o \ + $(AMDKFD_PATH)/kfd_process
[PATCH 1/3] drm/amdgpu: Merge amdkfd into amdgpu
Since KFD is only supported by single GPU driver, it makes sense to merge amdgpu and amdkfd into one module. This patch is the initial step: merge Kconfig and Makefile. Change-Id: I21c996ba29d393c1bf8064bdb2f5d89541159649 Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/Kconfig | 1 + drivers/gpu/drm/amd/amdgpu/Makefile | 6 ++- drivers/gpu/drm/amd/amdkfd/Kconfig | 2 +- drivers/gpu/drm/amd/amdkfd/Makefile | 53 ++- drivers/gpu/drm/amd/amdkfd/kfd_module.c | 76 ++--- 5 files changed, 63 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig index e8af1f5..9221e54 100644 --- a/drivers/gpu/drm/amd/amdgpu/Kconfig +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig @@ -42,3 +42,4 @@ config DRM_AMDGPU_GART_DEBUGFS source "drivers/gpu/drm/amd/acp/Kconfig" source "drivers/gpu/drm/amd/display/Kconfig" +source "drivers/gpu/drm/amd/amdkfd/Kconfig" diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index d2bafab..847536b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -35,7 +35,8 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \ -I$(FULL_AMD_DISPLAY_PATH) \ -I$(FULL_AMD_DISPLAY_PATH)/include \ -I$(FULL_AMD_DISPLAY_PATH)/dc \ - -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm + -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm \ + -I$(FULL_AMD_PATH)/amdkfd amdgpu-y := amdgpu_drv.o @@ -136,6 +137,9 @@ amdgpu-y += \ amdgpu-y += amdgpu_amdkfd.o ifneq ($(CONFIG_HSA_AMD),) +AMDKFD_PATH := ../amdkfd +include $(FULL_AMD_PATH)/amdkfd/Makefile +amdgpu-y += $(AMDKFD_FILES) amdgpu-y += \ amdgpu_amdkfd_fence.o \ amdgpu_amdkfd_gpuvm.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index 3858820..fbf0ee5 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -3,7 +3,7 @@ # config HSA_AMD - tristate "HSA kernel driver for AMD GPU devices" + bool "HSA kernel driver for AMD GPU devices" depends on DRM_AMDGPU && X86_64 imply AMD_IOMMU_V2 select MMU_NOTIFIER diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index ffd096f..69ec969 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -23,26 +23,41 @@ # Makefile for Heterogenous System Architecture support for AMD GPU devices # -ccflags-y := -Idrivers/gpu/drm/amd/include/ \ - -Idrivers/gpu/drm/amd/include/asic_reg - -amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ - kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ - kfd_process.o kfd_queue.o kfd_mqd_manager.o \ - kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ - kfd_mqd_manager_v9.o \ - kfd_kernel_queue.o kfd_kernel_queue_cik.o \ - kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ - kfd_packet_manager.o kfd_process_queue_manager.o \ - kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ - kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ - kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ - kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o +AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ + $(AMDKFD_PATH)/kfd_device.o \ + $(AMDKFD_PATH)/kfd_chardev.o \ + $(AMDKFD_PATH)/kfd_topology.o \ + $(AMDKFD_PATH)/kfd_pasid.o \ + $(AMDKFD_PATH)/kfd_doorbell.o \ + $(AMDKFD_PATH)/kfd_flat_memory.o \ + $(AMDKFD_PATH)/kfd_process.o \ + $(AMDKFD_PATH)/kfd_queue.o \ + $(AMDKFD_PATH)/kfd_mqd_manager.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ + $(AMDKFD_PATH)/kfd_kernel_queue.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \ + $(AMDKFD_PATH)/kfd_packet_manager.o \ + $(AMDKFD_PATH)/kfd_process_queue_manager.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ + $(AMDKFD_PATH)/kfd_interrupt.o \ + $(AMDKFD_PATH)/kfd_events.o \ + $(AMDKFD_PATH)/cik_event_interrupt.o \ + $(AMDKFD_PATH)/kfd_int_process_v9.o \ + $(AMDKFD_PATH)/kfd_dbgdev.o \ + $
[PATCH 2/3] drm/amdgpu: Remove CONFIG_HSA_AMD_MODULE
After amdkfd is merged to amdgpu, CONFIG_HSA_AMD_MODULE no longer exists. Change-Id: I42096cdf887e0d776075f3dd3e8d3f153aff4e85 Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 26 +++--- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index e3ed08d..8c652ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -36,36 +36,16 @@ int amdgpu_amdkfd_init(void) { int ret; -#if defined(CONFIG_HSA_AMD_MODULE) - int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); - - kgd2kfd_init_p = symbol_request(kgd2kfd_init); - - if (kgd2kfd_init_p == NULL) - return -ENOENT; - - ret = kgd2kfd_init_p(KFD_INTERFACE_VERSION, ); - if (ret) { - symbol_put(kgd2kfd_init); - kgd2kfd = NULL; - } - - -#elif defined(CONFIG_HSA_AMD) - +#ifdef CONFIG_HSA_AMD ret = kgd2kfd_init(KFD_INTERFACE_VERSION, ); if (ret) kgd2kfd = NULL; - + amdgpu_amdkfd_gpuvm_init_mem_limits(); #else kgd2kfd = NULL; ret = -ENOENT; #endif -#if defined(CONFIG_HSA_AMD_MODULE) || defined(CONFIG_HSA_AMD) - amdgpu_amdkfd_gpuvm_init_mem_limits(); -#endif - return ret; } @@ -471,7 +451,7 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid) return false; } -#if !defined(CONFIG_HSA_AMD_MODULE) && !defined(CONFIG_HSA_AMD) +#ifndef CONFIG_HSA_AMD bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) { return false; -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/3] drm/amdgpu: Move KFD parameters to amdgpu
After merging KFD into amdgpu, move module parameters defined in KFD to amdgpu_drv.c, where other module parameters are declared. Change-Id: I2de8d6c96bb49554c028bbc84bdb194f974c9278 Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 41 + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 40 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2221f6b..af9a766 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -39,6 +39,7 @@ #include "amdgpu_gem.h" #include "amdgpu_amdkfd.h" +#include "kfd_priv.h" /* * KMS wrapper. @@ -127,6 +128,15 @@ int amdgpu_compute_multipipe = -1; int amdgpu_gpu_recovery = -1; /* auto */ int amdgpu_emu_mode = 0; uint amdgpu_smu_memory_pool_size = 0; +/* KFD parameters */ +int sched_policy = KFD_SCHED_POLICY_HWS; +int hws_max_conc_proc = 8; +int cwsr_enable = 1; +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; +int send_sigterm; +int debug_largebar; +int ignore_crat; +int vega10_noretry; /** * DOC: vramlimit (int) @@ -532,6 +542,37 @@ MODULE_PARM_DESC(smu_memory_pool_size, "0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte"); module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 0444); +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); + +module_param(max_num_of_queues_per_device, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_device, + "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); + +module_param(send_sigterm, int, 0444); +MODULE_PARM_DESC(send_sigterm, + "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); + +module_param(debug_largebar, int, 0444); +MODULE_PARM_DESC(debug_largebar, + "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); + +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + +module_param_named(noretry, vega10_noretry, int, 0644); +MODULE_PARM_DESC(noretry, + "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 8847514..5f4977b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -21,7 +21,6 @@ */ #include -#include #include #include "kfd_priv.h" @@ -39,45 +38,6 @@ static const struct kgd2kfd_calls kgd2kfd = { kgd2kfd_schedule_evict_and_restore_process, }; -int sched_policy = KFD_SCHED_POLICY_HWS; -module_param(sched_policy, int, 0444); -MODULE_PARM_DESC(sched_policy, - "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); - -int hws_max_conc_proc = 8; -module_param(hws_max_conc_proc, int, 0444); -MODULE_PARM_DESC(hws_max_conc_proc, - "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); - -int cwsr_enable = 1; -module_param(cwsr_enable, int, 0444); -MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); - -int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; -module_param(max_num_of_queues_per_device, int, 0444); -MODULE_PARM_DESC(max_num_of_queues_per_device, - "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); - -int send_sigterm; -module_param(send_sigterm, int, 0444); -MODULE_PARM_DESC(send_sigterm, - "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); - -int debug_largebar; -module_param(debug_largebar, int, 0444); -MODULE_PARM_DESC(debug_largebar, - "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); - -in
[PATCH v2] drm/amdgpu: Map all visible VRAM at startup
When using CPU to update page table, we need to kmap all the PDs/PTs after they are allocated and that requires a TLB shot down on each CPU, which is quite heavy. Instead, we map the whole visible VRAM to a kernel address at once. Pages can be obtained from the offset. v2: move the mapping base from gmc to amdgpu_mman structure, and the implementation in amdgpu_ttm_* functions Change-Id: I56574bd544dae273da50e8b5dd6894cd5d9454bd Signed-off-by: Amber Lin <amber@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 17 + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 + 2 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e38e6db..f126a5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -621,6 +621,7 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_ { struct ttm_mem_type_manager *man = >man[mem->mem_type]; struct amdgpu_device *adev = amdgpu_ttm_adev(bdev); + struct drm_mm_node *mm_node = mem->mm_node; mem->bus.addr = NULL; mem->bus.offset = 0; @@ -640,6 +641,15 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_ /* check if it's visible */ if ((mem->bus.offset + mem->bus.size) > adev->gmc.visible_vram_size) return -EINVAL; + /* Only physically contiguous buffers apply. In a contiguous +* buffer, size of the first mm_node would match the number of +* pages in ttm_mem_reg. +*/ + if (adev->mman.aper_base_kaddr && + (mm_node->size == mem->num_pages)) + mem->bus.addr = (u8 *)adev->mman.aper_base_kaddr + + mem->bus.offset; + mem->bus.base = adev->gmc.aper_base; mem->bus.is_iomem = true; break; @@ -1402,6 +1412,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) /* Change the size here instead of the init above so only lpfn is affected */ amdgpu_ttm_set_active_vram_size(adev, adev->gmc.visible_vram_size); +#ifdef CONFIG_64BIT + adev->mman.aper_base_kaddr = ioremap_wc(adev->gmc.aper_base, + adev->gmc.visible_vram_size); +#endif /* *The reserved vram for firmware must be pinned to the specified @@ -1494,6 +1508,9 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev) amdgpu_ttm_debugfs_fini(adev); amdgpu_bo_free_kernel(>stolen_vga_memory, NULL, NULL); amdgpu_ttm_fw_reserve_vram_fini(adev); + if (adev->mman.aper_base_kaddr) + iounmap(adev->mman.aper_base_kaddr); + adev->mman.aper_base_kaddr = NULL; ttm_bo_clean_mm(>mman.bdev, TTM_PL_VRAM); ttm_bo_clean_mm(>mman.bdev, TTM_PL_TT); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 1e275c7..d314910 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -44,6 +44,7 @@ struct amdgpu_mman { struct ttm_bo_devicebdev; boolmem_global_referenced; boolinitialized; + void __iomem*aper_base_kaddr; #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_entries[8]; -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx