Support ras critical address check. Signed-off-by: YiPeng Chai <yipeng.c...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 ++++ 2 files changed, 103 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 234f0de31917..0ad3a9eedfd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -143,6 +143,10 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr); static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev); +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); + #ifdef CONFIG_X86_MCE_AMD static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); struct mce_notifier_adev_list { @@ -3709,6 +3713,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) amdgpu_register_bad_pages_mca_notifier(adev); #endif + amdgpu_ras_critical_region_init(adev); + return 0; free: @@ -4198,6 +4204,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) con->init_task_pid = task_pid_nr(current); get_task_comm(con->init_task_comm, current); + mutex_init(&con->critical_region_lock); + INIT_LIST_HEAD(&con->critical_region_head); + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", adev->ras_hw_enabled, adev->ras_enabled); @@ -4477,6 +4486,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (!adev->ras_enabled || !con) return 0; + amdgpu_ras_critical_region_fini(adev); + mutex_destroy(&con->critical_region_lock); + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { if (ras_node->ras_obj) { obj = ras_node->ras_obj; @@ -5433,3 +5445,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev) return con->is_rma; } + +int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, + struct amdgpu_bo *bo) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_vram_mgr_resource *vres; + struct ras_critical_region *region; + struct drm_buddy_block *block; + int ret = 0; + + if (!bo || !bo->tbo.resource) + return -EINVAL; + + vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); + + mutex_lock(&con->critical_region_lock); + + /* Check if the bo had been recorded */ + list_for_each_entry(region, &con->critical_region_head, node) + if (region->bo == bo) + goto out; + + /* Record new critical amdgpu bo */ + list_for_each_entry(block, &vres->blocks, link) { + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + ret = -ENOMEM; + goto out; + } + region->bo = bo; + region->start = amdgpu_vram_mgr_block_start(block); + region->size = amdgpu_vram_mgr_block_size(block); + list_add_tail(®ion->node, &con->critical_region_head); + } + +out: + mutex_unlock(&con->critical_region_lock); + + return ret; +} + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) +{ + amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory); +} + +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region, *tmp; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { + list_del(®ion->node); + kfree(region); + } + mutex_unlock(&con->critical_region_lock); +} + +bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region; + bool ret = false; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry(region, &con->critical_region_head, node) { + if ((region->start <= addr) && + (addr < (region->start + region->size))) { + ret = true; + break; + } + } + mutex_unlock(&con->critical_region_lock); + + return ret; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 9faedfc494af..9614cec7c3c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -496,6 +496,13 @@ struct ras_ecc_log_info { uint64_t prev_de_queried_count; }; +struct ras_critical_region { + struct list_head node; + struct amdgpu_bo *bo; + uint64_t start; + uint64_t size; +}; + struct amdgpu_ras { void *ras_mgr; /* ras infrastructure */ @@ -574,6 +581,10 @@ struct amdgpu_ras { pid_t init_task_pid; char init_task_comm[TASK_COMM_LEN]; + + struct list_head critical_region_head; + struct mutex critical_region_lock; + }; struct ras_fs_data { @@ -977,6 +988,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); +int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo); +bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr); + int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); -- 2.34.1