From: "Jiang, Sonny" <sonny.ji...@amd.com>

and add radeonsi support. This will be used by radeonsi internally.

Signed-off-by: Sonny Jiang <sonny.ji...@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 33 +++++++++++++++++++----
 src/gallium/include/pipe/p_state.h        | 21 +++++++++++++++
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index cbcd8e79c7b..4d844e9f4e3 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -790,32 +790,55 @@ static void si_emit_dispatch_packets(struct si_context 
*sctx,
                /* SI */
                if (sctx->cs_max_waves_per_sh) {
                        unsigned limit_div16 = 
DIV_ROUND_UP(sctx->cs_max_waves_per_sh, 16);
                        compute_resource_limits |= 
S_00B854_WAVES_PER_SH_SI(limit_div16);
                }
        }
 
        radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                          compute_resource_limits);
 
-       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-       radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
-       radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
-       radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
-
        unsigned dispatch_initiator =
                S_00B800_COMPUTE_SHADER_EN(1) |
                S_00B800_FORCE_START_AT_000(1) |
                /* If the KMD allows it (there is a KMD hw register for it),
                 * allow launching waves out-of-order. (same as Vulkan) */
                S_00B800_ORDER_MODE(sctx->chip_class >= CIK);
 
+       bool partial_block_en = info->last_block[0] ||
+                               info->last_block[1] ||
+                               info->last_block[2];
+
+       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+
+       if (partial_block_en) {
+               unsigned partial[3];
+
+               /* If no partial_block, these should be an entire block size, 
not 0. */
+               partial[0] = info->last_block[0] ? info->last_block[0] : 
info->block[0];
+               partial[1] = info->last_block[1] ? info->last_block[1] : 
info->block[1];
+               partial[2] = info->last_block[2] ? info->last_block[2] : 
info->block[2];
+
+               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
+                               S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
+                               S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
+                               S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+
+               dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+       } else {
+               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+       }
+
        if (info->indirect) {
                uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
                radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                 r600_resource(info->indirect),
                                 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
                radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
                                PKT3_SHADER_TYPE_S(1));
                radeon_emit(cs, 1);
diff --git a/src/gallium/include/pipe/p_state.h 
b/src/gallium/include/pipe/p_state.h
index 38052e5fd3d..0960577e61a 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -831,20 +831,41 @@ struct pipe_grid_info
     * clEnqueueNDRangeKernel. Note block[] and grid[] must be padded with
     * 1 for non-used dimensions.
     */
    uint work_dim;
 
    /**
     * Determine the layout of the working block (in thread units) to be used.
     */
    uint block[3];
 
+   /**
+    * last_block allows disabling threads at the farthermost grid boundary.
+    * Full blocks as specified by "block" are launched, but the threads
+    * outside of "last_block" dimensions are disabled.
+    *
+    * If a block touches the grid boundary in the i-th axis, threads with
+    * THREAD_ID[i] >= last_block[i] are disabled.
+    *
+    * If last_block[i] is 0, it has the same behavior as last_block[i] = 
block[i],
+    * meaning no effect.
+    *
+    * It's equivalent to doing this at the beginning of the compute shader:
+    *
+    *   for (i = 0; i < 3; i++) {
+    *      if (block_id[i] == grid[i] - 1 &&
+    *          last_block[i] && last_block[i] >= thread_id[i])
+    *         return;
+    *   }
+    */
+   uint last_block[3];
+
    /**
     * Determine the layout of the grid (in block units) to be used.
     */
    uint grid[3];
 
    /* Indirect compute parameters resource: If not NULL, block sizes are taken
     * from this buffer instead, which is laid out as follows:
     *
     *  struct {
     *     uint32_t num_blocks_x;
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to