On 26.07.2016 01:11, Tom Stellard wrote:
This patche switches non-TGSI compute shaders over to using the HSA

Typo: patch :)

ABI described here:

https://github.com/RadeonOpenCompute/ROCm-Docs/blob/master/AMDGPU-ABI.md

The HSA ABI provides a much cleaner interface for compute shaders and allows
us to share more code in the compiler with the HSA stack.

The main changes in this patch are:
  - We now pass the scratch buffer resource into the shader via user sgprs
    rather than using relocations.
  - Grid/Block sizes are now passed to the shader via the dispatch packet
    rather than at the beginning of the kernel arguments.

Typically for HSA, the CP firmware will create the dispatch packet and set
up the user sgprs automatically.  However, in Mesa we let the driver do
this work.  The main reason for this is that I haven't researched how to
get the CP to do all these things, and I'm not sure if it is supported
for all GPUs.
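
As an aside for readers less familiar with this ABI: with the dispatch
packet approach, the kernel recovers its launch dimensions by loading
them from the packet that the dispatch-pointer user SGPRs point at,
instead of reading implicit values prepended to the kernel arguments.
Very roughly -- this sketch is not part of the patch, and
__dispatch_ptr() below is just a made-up stand-in for whatever
dispatch-pointer intrinsic the compiler exposes:

        static uint32_t get_grid_size_x(void)
        {
                /* The dispatch-pointer user SGPRs hold the address of the
                 * dispatch packet that the driver uploads below. */
                const uint32_t *packet = __dispatch_ptr(); /* hypothetical */
                /* dword 0: header/setup, dword 1: workgroup_size_x/y,
                 * dword 2: workgroup_size_z/reserved, dword 3: grid_size_x */
                return packet[3];
        }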
---
 src/gallium/drivers/radeon/r600_pipe_common.c    |   6 +-
 src/gallium/drivers/radeonsi/amd_kernel_code_t.h | 534 +++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_compute.c        | 234 +++++++++-
 3 files changed, 756 insertions(+), 18 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/amd_kernel_code_t.h

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index cd4908f..9ecf666 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -784,7 +784,11 @@ static int r600_get_compute_param(struct pipe_screen *screen,
                if (rscreen->family <= CHIP_ARUBA) {
                        triple = "r600--";
                } else {
-                       triple = "amdgcn--";
+                       if (HAVE_LLVM < 0x0400) {
+                               triple = "amdgcn--";
+                       } else {
+                               triple = "amdgcn--mesa3d";
+                       }
                }
                switch(rscreen->family) {
                /* Clang < 3.6 is missing Hainan in its list of
[snip]
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 949ab1a..1aced60 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -28,6 +28,7 @@
 #include "radeon/r600_pipe_common.h"
 #include "radeon/radeon_elf_util.h"

+#include "amd_kernel_code_t.h"
 #include "radeon/r600_cs.h"
 #include "si_pipe.h"
 #include "si_shader.h"
@@ -43,8 +44,52 @@ struct si_compute {
        struct si_shader shader;

        struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
+       bool use_code_object_v2;
 };

+struct dispatch_packet {
+       uint16_t header;
+       uint16_t setup;
+       uint16_t workgroup_size_x;
+       uint16_t workgroup_size_y;
+       uint16_t workgroup_size_z;
+       uint16_t reserved0;
+       uint32_t grid_size_x;
+       uint32_t grid_size_y;
+       uint32_t grid_size_z;
+       uint32_t private_segment_size;
+       uint32_t group_segment_size;
+       uint64_t kernel_object;
+       uint64_t kernarg_address;
+       uint64_t reserved2;
+};
+
+static const amd_kernel_code_t *si_compute_get_code_object(
+       const struct si_compute *program,
+       uint64_t symbol_offset)
+{
+       if (!program->use_code_object_v2) {
+               return NULL;
+       }
+       return (const amd_kernel_code_t*)
+               (program->shader.binary.code + symbol_offset);
+}
+
+static void code_object_to_config(const amd_kernel_code_t *code_object,
+                                 struct si_shader_config *out_config) {
+
+       uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
+       uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
+       out_config->num_sgprs = code_object->wavefront_sgpr_count;
+       out_config->num_vgprs = code_object->workitem_vgpr_count;
+       out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
+       out_config->rsrc1 = rsrc1;
+       out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
+       out_config->rsrc2 = rsrc2;
+       out_config->scratch_bytes_per_wave =
+               align(code_object->workitem_private_segment_byte_size * 64, 1024);
+}
+
 static void *si_create_compute_state(
        struct pipe_context *ctx,
        const struct pipe_compute_state *cso)
@@ -59,6 +104,8 @@ static void *si_create_compute_state(
        program->local_size = cso->req_local_mem;
        program->private_size = cso->req_private_mem;
        program->input_size = cso->req_input_mem;
+       program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
+                                       cso->ir_type == PIPE_SHADER_IR_NATIVE;


        if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
@@ -110,8 +157,14 @@ static void *si_create_compute_state(
                code = cso->prog + sizeof(struct pipe_llvm_program_header);

                radeon_elf_read(code, header->num_bytes, &program->shader.binary);
-               si_shader_binary_read_config(&program->shader.binary,
-                            &program->shader.config, 0);
+               if (program->use_code_object_v2) {
+                       const amd_kernel_code_t *code_object =
+                               si_compute_get_code_object(program, 0);
+                       code_object_to_config(code_object, &program->shader.config);
+               } else {
+                       si_shader_binary_read_config(&program->shader.binary,
+                                    &program->shader.config, 0);
+               }
                si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
                               PIPE_SHADER_COMPUTE, stderr);
                si_shader_binary_upload(sctx->screen, &program->shader);
@@ -234,7 +287,9 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,

 static bool si_switch_compute_shader(struct si_context *sctx,
                                      struct si_compute *program,
-                                     struct si_shader *shader, unsigned offset)
+                                    struct si_shader *shader,
+                                    const amd_kernel_code_t *code_object,
+                                    unsigned offset)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_shader_config inline_config = {0};
@@ -251,7 +306,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
                unsigned lds_blocks;

                config = &inline_config;
-               si_shader_binary_read_config(&shader->binary, config, offset);
+               if (code_object) {
+                       code_object_to_config(code_object, config);
+               } else {
+                       si_shader_binary_read_config(&shader->binary, config, offset);
+               }

                lds_blocks = config->lds_size;
                /* XXX: We are over allocating LDS.  For SI, the shader reports
@@ -287,6 +346,11 @@ static bool si_switch_compute_shader(struct si_context *sctx,
        }

        shader_va = shader->bo->gpu_address + offset;
+       if (program->use_code_object_v2) {
+               /* Shader code is placed after the amd_kernel_code_t
+                * struct. */
+               shader_va += sizeof(amd_kernel_code_t);
+       }

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
                                  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -314,14 +378,140 @@ static bool si_switch_compute_shader(struct si_context *sctx,
        return true;
 }

+static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
+                                         const amd_kernel_code_t *code_object,
+                                         unsigned user_sgpr)
+{
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+       unsigned max_private_element_size = AMD_HSA_BITS_GET(
+                       code_object->code_properties,
+                       AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
+
+       uint32_t scratch_dword0 = scratch_va & 0xffffffff;
+       uint32_t scratch_dword1 =
+               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+               S_008F04_SWIZZLE_ENABLE(1);
+       uint32_t scratch_dword2 = 0xffffffff; //sctx->compute_scratch_buffer->b.b.width0;

What's the reason for not setting that?
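
Presumably the eventual intent is the commented-out value, i.e.
something like (untested, just illustrating the question):

        /* Bound NUM_RECORDS to the actual scratch allocation instead of
         * leaving it at 0xffffffff: */
        uint32_t scratch_dword2 = sctx->compute_scratch_buffer->b.b.width0;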

+       uint32_t scratch_dword3 =
+               S_008F0C_ELEMENT_SIZE(max_private_element_size) |
+               S_008F0C_INDEX_STRIDE(3) |
+               S_008F0C_ADD_TID_ENABLE(1);
+
+
+       if (sctx->screen->b.family < CHIP_TONGA) {
+               /* XXX: I have no idea why we use NUM_FORMAT_FLOAT. */
+               scratch_dword3 |=
+                       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);

chip_class < VI? Or is this really about Tonga specifically?

And yes, those settings are peculiar. I don't think LLVM emits buffer_*_format instructions -- the non-format instructions should ignore those fields anyway. I'm also not aware of any quirks needed related to this...
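
If the intent really is "all pre-VI parts" rather than Tonga
specifically, the condition would presumably become something like
(untested sketch):

        if (sctx->b.chip_class < VI) {
                scratch_dword3 |=
                        S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
        }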

Nicolai

+       }
+
+       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+                                                       (user_sgpr * 4), 4);
+       radeon_emit(cs, scratch_dword0);
+       radeon_emit(cs, scratch_dword1);
+       radeon_emit(cs, scratch_dword2);
+       radeon_emit(cs, scratch_dword3);
+}
+
+static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
+                                      const amd_kernel_code_t *code_object,
+                                     const struct pipe_grid_info *info,
+                                     uint64_t kernel_args_va)
+{
+       struct si_compute *program = sctx->cs_shader_state.program;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+
+       static const enum amd_code_property_mask_t workgroup_count_masks [] = {
+               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
+               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
+               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
+       };
+
+       unsigned i, user_sgpr = 0;
+       if (AMD_HSA_BITS_GET(code_object->code_properties,
+                       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+               if (code_object->workitem_private_segment_byte_size > 0) {
+                       setup_scratch_rsrc_user_sgprs(sctx, code_object,
+                                                               user_sgpr);
+               }
+               user_sgpr += 4;
+       }
+
+       if (AMD_HSA_BITS_GET(code_object->code_properties,
+                       AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
+               struct dispatch_packet dispatch;
+               unsigned dispatch_offset;
+               struct r600_resource *dispatch_buf = NULL;
+               uint64_t dispatch_va;
+
+               /* Upload dispatch ptr */
+               memset(&dispatch, 0, sizeof(dispatch));
+
+               dispatch.workgroup_size_x = info->block[0];
+               dispatch.workgroup_size_y = info->block[1];
+               dispatch.workgroup_size_z = info->block[2];
+
+               dispatch.grid_size_x = info->grid[0] * info->block[0];
+               dispatch.grid_size_y = info->grid[1] * info->block[1];
+               dispatch.grid_size_z = info->grid[2] * info->block[2];
+
+               dispatch.private_segment_size = program->private_size;
+               dispatch.group_segment_size = program->local_size;
+
+               dispatch.kernarg_address = kernel_args_va;
+
+               u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
+                               &dispatch, &dispatch_offset,
+                               (struct pipe_resource**)&dispatch_buf);
+
+               assert(dispatch_buf);
+               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
+                                 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+
+               dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
+
+               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+                                                       (user_sgpr * 4), 2);
+               radeon_emit(cs, dispatch_va);
+               radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
+                                S_008F04_STRIDE(0));
+
+               r600_resource_reference(&dispatch_buf, NULL);
+               user_sgpr += 2;
+       }
+
+       if (AMD_HSA_BITS_GET(code_object->code_properties,
+                       AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+                                                       (user_sgpr * 4), 2);
+               radeon_emit(cs, kernel_args_va);
+               radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
+                               S_008F04_STRIDE(0));
+               user_sgpr += 2;
+       }
+
+       for (i = 0; i < 3 && user_sgpr < 16; i++) {
+               if (code_object->code_properties & workgroup_count_masks[i]) {
+                       radeon_set_sh_reg_seq(cs,
+                               R_00B900_COMPUTE_USER_DATA_0 +
+                               (user_sgpr * 4), 1);
+                       radeon_emit(cs, info->grid[i]);
+                       user_sgpr += 1;
+               }
+       }
+}
+
 static void si_upload_compute_input(struct si_context *sctx,
-                                  const struct pipe_grid_info *info)
+                                   const amd_kernel_code_t *code_object,
+                                   const struct pipe_grid_info *info)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_compute *program = sctx->cs_shader_state.program;
        struct r600_resource *input_buffer = NULL;
        unsigned kernel_args_size;
-       unsigned num_work_size_bytes = 36;
+       unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
        uint32_t kernel_args_offset = 0;
        uint32_t *kernel_args;
        void *kernel_args_ptr;
@@ -336,10 +526,14 @@ static void si_upload_compute_input(struct si_context *sctx,
                       (struct pipe_resource**)&input_buffer, &kernel_args_ptr);

        kernel_args = (uint32_t*)kernel_args_ptr;
-       for (i = 0; i < 3; i++) {
-               kernel_args[i] = info->grid[i];
-               kernel_args[i + 3] = info->grid[i] * info->block[i];
-               kernel_args[i + 6] = info->block[i];
+       kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
+
+       if (!code_object) {
+               for (i = 0; i < 3; i++) {
+                       kernel_args[i] = info->grid[i];
+                       kernel_args[i + 3] = info->grid[i] * info->block[i];
+                       kernel_args[i + 6] = info->block[i];
+               }
        }

        memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
@@ -351,15 +545,18 @@ static void si_upload_compute_input(struct si_context *sctx,
                        kernel_args[i]);
        }

-       kernel_args_va = input_buffer->gpu_address + kernel_args_offset;

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
                                  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);

-       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-       radeon_emit(cs, kernel_args_va);
-       radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
-                       S_008F04_STRIDE(0));
+       if (code_object) {
+               si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
+       } else {
+               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+               radeon_emit(cs, kernel_args_va);
+               radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
+                               S_008F04_STRIDE(0));
+       }

        r600_resource_reference(&input_buffer, NULL);
 }
@@ -442,6 +639,8 @@ static void si_launch_grid(
 {
        struct si_context *sctx = (struct si_context*)ctx;
        struct si_compute *program = sctx->cs_shader_state.program;
+       const amd_kernel_code_t *code_object =
+               si_compute_get_code_object(program, info->pc);
        int i;
        /* HW bug workaround when CS threadgroups > 256 threads and async
         * compute isn't used, i.e. only one compute job can run at a time.
@@ -469,7 +668,8 @@ static void si_launch_grid(
        if (sctx->b.flags)
                si_emit_cache_flush(sctx, NULL);

-       if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
+       if (!si_switch_compute_shader(sctx, program, &program->shader,
+                                       code_object, info->pc))
                return;

        si_upload_compute_shader_descriptors(sctx);
@@ -482,7 +682,7 @@ static void si_launch_grid(
        }

        if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
-               si_upload_compute_input(sctx, info);
+               si_upload_compute_input(sctx, code_object, info);

        /* Global buffers */
        for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
