From: "Jiang, Sonny" <sonny.ji...@amd.com> and add radeonsi support. This will be used by radeonsi internally.
Signed-off-by: Sonny Jiang <sonny.ji...@amd.com> --- src/gallium/drivers/radeonsi/si_compute.c | 33 +++++++++++++++++++---- src/gallium/include/pipe/p_state.h | 21 +++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index cbcd8e79c7b..4d844e9f4e3 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -790,32 +790,55 @@ static void si_emit_dispatch_packets(struct si_context *sctx, /* SI */ if (sctx->cs_max_waves_per_sh) { unsigned limit_div16 = DIV_ROUND_UP(sctx->cs_max_waves_per_sh, 16); compute_resource_limits |= S_00B854_WAVES_PER_SH_SI(limit_div16); } } radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, compute_resource_limits); - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); - unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) | /* If the KMD allows it (there is a KMD hw register for it), * allow launching waves out-of-order. (same as Vulkan) */ S_00B800_ORDER_MODE(sctx->chip_class >= CIK); + bool partial_block_en = info->last_block[0] || + info->last_block[1] || + info->last_block[2]; + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + + if (partial_block_en) { + unsigned partial[3]; + + /* If no partial_block, these should be an entire block size, not 0. */ + partial[0] = info->last_block[0] ? info->last_block[0] : info->block[0]; + partial[1] = info->last_block[1] ? info->last_block[1] : info->block[1]; + partial[2] = info->last_block[2] ? info->last_block[2] : info->block[2]; + + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | + S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | + S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | + S_00B824_NUM_THREAD_PARTIAL(partial[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } else { + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + } + if (info->indirect) { uint64_t base_va = r600_resource(info->indirect)->gpu_address; radeon_add_to_buffer_list(sctx, sctx->gfx_cs, r600_resource(info->indirect), RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); radeon_emit(cs, 1); diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 38052e5fd3d..0960577e61a 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -831,20 +831,41 @@ struct pipe_grid_info * clEnqueueNDRangeKernel. Note block[] and grid[] must be padded with * 1 for non-used dimensions. */ uint work_dim; /** * Determine the layout of the working block (in thread units) to be used. */ uint block[3]; + /** + * last_block allows disabling threads at the farthermost grid boundary. + * Full blocks as specified by "block" are launched, but the threads + * outside of "last_block" dimensions are disabled. + * + * If a block touches the grid boundary in the i-th axis, threads with + * THREAD_ID[i] >= last_block[i] are disabled. + * + * If last_block[i] is 0, it has the same behavior as last_block[i] = block[i], + * meaning no effect. + * + * It's equivalent to doing this at the beginning of the compute shader: + * + * for (i = 0; i < 3; i++) { + * if (block_id[i] == grid[i] - 1 && + * last_block[i] && last_block[i] >= thread_id[i]) + * return; + * } + */ + uint last_block[3]; + /** * Determine the layout of the grid (in block units) to be used. */ uint grid[3]; /* Indirect compute parameters resource: If not NULL, block sizes are taken * from this buffer instead, which is laid out as follows: * * struct { * uint32_t num_blocks_x; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev