[PATCH 4/4] drm/amdgpu: Using uninitialized value *size when calling amdgpu_vce_cs_reloc
From: Jesse Zhang Initialize *size before calling amdgpu_vce_cs_reloc, e.g. in case 0x0301. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 59acf424a078..60d97cd14855 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -742,7 +742,7 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t destroyed = 0; uint32_t created = 0; uint32_t allocated = 0; - uint32_t tmp, handle = 0; + uint32_t tmp = 0, handle = 0; uint32_t *size = &tmp; unsigned int idx; int i, r = 0; -- 2.25.1
[PATCH 3/4] drm/amdgpu: Using uninitialized value new_state.jpeg when calling adev->vcn.pause_dpg_mode
From: Jesse Zhang Initialize new_state.jpeg before it is used. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 677eb141554e..13125ddd5e86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -410,6 +410,11 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work) else new_state.fw_based = VCN_DPG_STATE__UNPAUSE; + if (amdgpu_fence_count_emitted(adev->jpeg.inst->ring_dec)) + new_state.jpeg = VCN_DPG_STATE__PAUSE; + else + new_state.jpeg = VCN_DPG_STATE__UNPAUSE; + adev->vcn.pause_dpg_mode(adev, j, &new_state); } -- 2.25.1
[PATCH 2/4] Initialize the last_jump_jiffies in atom_exec_context before it is used
From: Jesse Zhang The parameter "last_jump_jiffies" should be initialized before being used in the function atom_op_jump. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/atom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index 72362df352f6..d552e013354c 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -1243,6 +1243,7 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index, ectx.ps_size = params_size; ectx.abort = false; ectx.last_jump = 0; + ectx.last_jump_jiffies = 0; if (ws) { ectx.ws = kcalloc(4, ws, GFP_KERNEL); ectx.ws_size = ws; -- 2.25.1
[PATCH 1/4] drm/amdgpu: add check before free wb entry
From: Jesse Zhang check if ring is not mes queue before free wb entry. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 3 ++- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 45a2d0a5a2d7..b7d33d78bce0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -999,7 +999,8 @@ static int sdma_v5_0_ring_test_ring(struct amdgpu_ring *ring) r = amdgpu_ring_alloc(ring, 20); if (r) { DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); - amdgpu_device_wb_free(adev, index); + if (!ring->is_mes_queue) + amdgpu_device_wb_free(adev, index); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 43e64b2da575..cc9e961f0078 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -839,7 +839,8 @@ static int sdma_v5_2_ring_test_ring(struct amdgpu_ring *ring) r = amdgpu_ring_alloc(ring, 20); if (r) { DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); - amdgpu_device_wb_free(adev, index); + if (!ring->is_mes_queue) + amdgpu_device_wb_free(adev, index); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 1f4877195213..c833b6b8373b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -861,7 +861,8 @@ static int sdma_v6_0_ring_test_ring(struct amdgpu_ring *ring) r = amdgpu_ring_alloc(ring, 5); if (r) { DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); - amdgpu_device_wb_free(adev, index); + if (!ring->is_mes_queue) + amdgpu_device_wb_free(adev, index); return r; } -- 2.25.1
[PATCH V2] drm/ttm: remove unused parameter
From: Jesse Zhang Remove the unused parameter from the functions ttm_bo_bounce_temp_buffer and ttm_bo_add_move_fence. V2: rebase the patch on top of drm-misc-next (Christian) Signed-off-by: Jesse Zhang Reviewed-by: Christian König --- drivers/gpu/drm/ttm/ttm_bo.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index e059b1e1b13b..6396dece0db1 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -402,7 +402,6 @@ void ttm_bo_put(struct ttm_buffer_object *bo) EXPORT_SYMBOL(ttm_bo_put); static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo, -struct ttm_resource **mem, struct ttm_operation_ctx *ctx, struct ttm_place *hop) { @@ -469,7 +468,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, if (ret != -EMULTIHOP) break; - ret = ttm_bo_bounce_temp_buffer(bo, &evict_mem, ctx, &hop); + ret = ttm_bo_bounce_temp_buffer(bo, ctx, &hop); } while (!ret); if (ret) { @@ -698,7 +697,6 @@ EXPORT_SYMBOL(ttm_bo_unpin); */ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo, struct ttm_resource_manager *man, -struct ttm_resource *mem, bool no_wait_gpu) { struct dma_fence *fence; @@ -787,7 +785,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, if (ret) continue; - ret = ttm_bo_add_move_fence(bo, man, *res, ctx->no_wait_gpu); + ret = ttm_bo_add_move_fence(bo, man, ctx->no_wait_gpu); if (unlikely(ret)) { ttm_resource_free(bo, res); if (ret == -EBUSY) @@ -894,7 +892,7 @@ int ttm_bo_validate(struct ttm_buffer_object *bo, bounce: ret = ttm_bo_handle_move_mem(bo, res, false, ctx, &hop); if (ret == -EMULTIHOP) { - ret = ttm_bo_bounce_temp_buffer(bo, &res, ctx, &hop); + ret = ttm_bo_bounce_temp_buffer(bo, ctx, &hop); /* try and move to final place now. */ if (!ret) goto bounce; -- 2.25.1
[PATCH] drm/amdgpu: remove unused code
From: Jesse Zhang Remove the unused function - amdgpu_vm_pt_is_root_clean and remove the impossible condition v1: entries == 0 is not possible any more, so this condition could probably be removed (Felix) Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 61 ++- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 7f95039bb37d..047ec1930d12 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -537,8 +537,6 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, int level, bool immediate, struct amdgpu_bo_vm **vmbo, int32_t xcp_id); void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm); -bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, - struct amdgpu_vm *vm); int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, struct amdgpu_vm_bo_base *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 8bce4da67131..7ecddb77b3ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -367,6 +367,7 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo *bo = >bo; uint64_t addr; int r, idx; + uint64_t value = 0, flags = 0; /* Figure out our place in the hierarchy */ if (ancestor->parent) { @@ -409,27 +410,24 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, addr = 0; - if (entries) { - uint64_t value = 0, flags = 0; - - if (adev->asic_type >= CHIP_VEGA10) { - if (level != AMDGPU_VM_PTB) { - /* Handle leaf PDEs as PTEs */ - flags |= AMDGPU_PDE_PTE; - amdgpu_gmc_get_vm_pde(adev, level, - , ); - } else { - /* Workaround for fault priority problem on GMC9 */ - flags = AMDGPU_PTE_EXECUTABLE; - } - } - r = vm->update_funcs->update(, vmbo, 
addr, 0, entries, -value, flags); - if (r) - goto exit; + if (adev->asic_type >= CHIP_VEGA10) { + if (level != AMDGPU_VM_PTB) { + /* Handle leaf PDEs as PTEs */ + flags |= AMDGPU_PDE_PTE; + amdgpu_gmc_get_vm_pde(adev, level, + , ); + } else { + /* Workaround for fault priority problem on GMC9 */ + flags = AMDGPU_PTE_EXECUTABLE; + } } + r = vm->update_funcs->update(, vmbo, addr, 0, entries, +value, flags); + if (r) + goto exit; + r = vm->update_funcs->commit(, NULL); exit: drm_dev_exit(idx); @@ -673,33 +671,6 @@ void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm) amdgpu_vm_pt_free_dfs(adev, vm, NULL, false); } -/** - * amdgpu_vm_pt_is_root_clean - check if a root PD is clean - * - * @adev: amdgpu_device pointer - * @vm: the VM to check - * - * Check all entries of the root PD, if any subsequent PDs are allocated, - * it means there are page table creating and filling, and is no a clean - * VM - * - * Returns: - * 0 if this VM is clean - */ -bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, - struct amdgpu_vm *vm) -{ - enum amdgpu_vm_level root = adev->vm_manager.root_level; - unsigned int entries = amdgpu_vm_pt_num_entries(adev, root); - unsigned int i = 0; - - for (i = 0; i < entries; i++) { - if (to_amdgpu_bo_vm(vm->root.bo)->entries[i].bo) - return false; - } - return true; -} - /** * amdgpu_vm_pde_update - update a single level in the hierarchy * -- 2.25.1
[PATCH V2] drm/amdkfd: fix shift out of bounds about gpu debug
From: Jesse Zhang [ 3810.410040] UBSAN: shift-out-of-bounds in drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_int_process_v10.c:345:5 [ 3810.410044] shift exponent 4294967295 is too large for 64-bit type 'long long unsigned int' [ 3810.410047] CPU: 6 PID: 331 Comm: kworker/6:1H Not tainted 6.5.0+ #508 [ 3810.410050] Hardware name: AMD Splinter/Splinter-GNR, BIOS WS54117N_140 01/16/2024 [ 3810.410052] Workqueue: KFD IH interrupt_wq [amdgpu] [ 3810.410273] Call Trace: [ 3810.410274] [ 3810.410277] dump_stack_lvl+0x4c/0x70 [ 3810.410283] dump_stack+0x14/0x20 [ 3810.410285] ubsan_epilogue+0x9/0x40 [ 3810.410290] __ubsan_handle_shift_out_of_bounds+0x113/0x170 [ 3810.410292] ? ZSTD_decompressSequencesSplitLitBuffer_default.isra.0+0x1389/0x1b50 [ 3810.410296] event_interrupt_wq_v10.cold+0x16/0x1e [amdgpu] [ 3810.410523] ? raw_spin_rq_unlock+0x14/0x40 [ 3810.410526] ? finish_task_switch+0x85/0x2b0 [ 3810.410528] interrupt_wq+0xb2/0x120 [amdgpu] [ 3810.410692] ? interrupt_wq+0xb2/0x120 [amdgpu] [ 3810.410806] process_one_work+0x229/0x430 [ 3810.410810] worker_thread+0x4e/0x3c0 [ 3810.410811] ? __pfx_worker_thread+0x10/0x10 [ 3810.410813] kthread+0xfb/0x130 [ 3810.410815] ? __pfx_kthread+0x10/0x10 [ 3810.410816] ret_from_fork+0x3d/0x60 [ 3810.410819] ? __pfx_kthread+0x10/0x10 [ 3810.410820] ret_from_fork_asm+0x1b/0x30 [ 3810.410823] -v2: define a macro. 
KFD process interrupts v9, v10, v11 can use that check prior to mask conversion and user space may find it useful as well.(Jon) Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 +++ include/uapi/linux/kfd_ioctl.h | 6 ++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 9a06c6fb6605..110ec5f71056 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -340,6 +340,9 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, } kfd_signal_event_interrupt(pasid, context_id0 & 0x7f, 23); } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + /* filter out the invalidate context_id0 */ + if (KFD_DBG_EC_RANGE_CHECK(context_id0)) + return; kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 7e2859736a55..c28cafa4b902 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -328,11 +328,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, context_id0, 32); - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + /* filter out the invalidate context_id0 */ + if (KFD_DBG_EC_RANGE_CHECK(context_id0)) + return; kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_CTXID0_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), NULL, 0); + } /* SDMA */ else if (source_id == SOC21_INTSRC_SDMA_TRAP) diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 91dd5e045b51..89dbefbd3081 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -389,6 +389,9 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, } kfd_signal_event_interrupt(pasid, sq_int_data, 24); } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { +/* filter out the invalidate context_id0 */ + if (KFD_DBG_EC_RANGE_CHECK(context_id0)) + return; kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 9ce46edc62a5..9cd3aa83aac3 100644 ---
[PATCH V2] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven
From: "Jesse.Zhang" Fix the issue: "amdgpu: Failed to create process VM object". [Why] When amdgpu is initialized, seq64 does its mapping and updates the BO mapping in the VM page table. But when clinfo runs, it also initializes a VM for a process device through the function kfd_process_device_init_vm and ensures the root PD is clean through the function amdgpu_vm_pt_is_root_clean. So the two conflict, and clinfo always fails. [How] Skip the seq64 entry check in the VM page table. Signed-off-by: Jesse Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a160265ddc07..bdae5381887e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -746,8 +746,21 @@ bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, enum amdgpu_vm_level root = adev->vm_manager.root_level; unsigned int entries = amdgpu_vm_pt_num_entries(adev, root); unsigned int i = 0; + u64 seq64_addr = (adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT) - AMDGPU_VA_RESERVED_TOP; + + seq64_addr /= AMDGPU_GPU_PAGE_SIZE; + mask = amdgpu_vm_pt_entries_mask(adev, adev->vm_manager.root_level); + shift = amdgpu_vm_pt_level_shift(adev, adev->vm_manager.root_level); + seq64_entry = (seq64_addr >> shift) & mask; for (i = 0; i < entries; i++) { + /* seq64 reserve 2M memory from top of address space. +* Then do the mapping and update the vm page table at amdgpu initialize. +* So skip the know result. +*/ + + if(i == seq64_entry) + continue; if (to_amdgpu_bo_vm(vm->root.bo)->entries[i].bo) return false; } -- 2.34.1