From: Marek Olšák <marek.ol...@amd.com>

---
 src/amd/common/ac_llvm_build.c           | 50 ++++++++++++++++++++++++--------
 src/amd/common/ac_llvm_build.h           |  3 +-
 src/amd/common/ac_nir_to_llvm.c          |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 23 +++++++--------
 4 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 87a1fb7..4b048ba 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -626,47 +626,73 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 LLVMValueRef
 ac_build_buffer_load(struct ac_llvm_context *ctx,
                     LLVMValueRef rsrc,
                     int num_channels,
                     LLVMValueRef vindex,
                     LLVMValueRef voffset,
                     LLVMValueRef soffset,
                     unsigned inst_offset,
                     unsigned glc,
                     unsigned slc,
-                    bool readonly_memory)
+                    bool readonly_memory,
+                    bool coherent)
 {
+       LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
+       if (voffset)
+               offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
+       if (soffset)
+               offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
+
+       /* Loads from a shader buffer that has no stores in the same shader
+        * and is non-coherent with other shader invocations can use SMEM.
+        */
+       if (readonly_memory && !coherent) {
+               assert(vindex == NULL);
+               assert(glc == 0);
+               assert(slc == 0);
+
+               LLVMValueRef result[4];
+
+               for (int i = 0; i < num_channels; i++) {
+                       if (i) {
+                               offset = LLVMBuildAdd(ctx->builder, offset,
+                                                     LLVMConstInt(ctx->i32, 4, 0), "");
+                       }
+                       LLVMValueRef args[2] = {rsrc, offset};
+                       result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
+                                                      ctx->f32, args, 2,
+                                                      AC_FUNC_ATTR_READNONE |
+                                                      AC_FUNC_ATTR_LEGACY);
+               }
+               if (num_channels == 1)
+                       return result[0];
+
+               if (num_channels == 3)
+                       result[num_channels++] = LLVMGetUndef(ctx->f32);
+               return ac_build_gather_values(ctx, result, num_channels);
+       }
+
        unsigned func = CLAMP(num_channels, 1, 3) - 1;
 
        LLVMValueRef args[] = {
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
                vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
-               LLVMConstInt(ctx->i32, inst_offset, 0),
+               offset,
                LLVMConstInt(ctx->i1, glc, 0),
                LLVMConstInt(ctx->i1, slc, 0)
        };
 
        LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
                               ctx->v4f32};
        const char *type_names[] = {"f32", "v2f32", "v4f32"};
        char name[256];
 
-       if (voffset) {
-               args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset,
-                               "");
-       }
-
-       if (soffset) {
-               args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset,
-                               "");
-       }
-
        snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
                 type_names[func]);
 
        return ac_build_intrinsic(ctx, name, types[func], args,
                                  ARRAY_SIZE(args),
                                  /* READNONE means writes can't affect it, while
                                   * READONLY means that writes can affect it. */
                                  readonly_memory && HAVE_LLVM >= 0x0400 ?
                                          AC_FUNC_ATTR_READNONE :
                                          AC_FUNC_ATTR_READONLY);
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 0ecbc4a..754461b 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -136,21 +136,22 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 LLVMValueRef
 ac_build_buffer_load(struct ac_llvm_context *ctx,
                     LLVMValueRef rsrc,
                     int num_channels,
                     LLVMValueRef vindex,
                     LLVMValueRef voffset,
                     LLVMValueRef soffset,
                     unsigned inst_offset,
                     unsigned glc,
                     unsigned slc,
-                    bool readonly_memory);
+                    bool readonly_memory,
+                    bool coherent);
 
 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                         LLVMValueRef rsrc,
                                         LLVMValueRef vindex,
                                         LLVMValueRef voffset,
                                         bool readonly_memory);
 
 LLVMValueRef
 ac_get_thread_id(struct ac_llvm_context *ctx);
 
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 8ae0a75..9794c70 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2816,21 +2816,21 @@ load_tes_input(struct nir_to_llvm_context *ctx,
        param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
        if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
            is_compact && const_index > 3) {
                const_index -= 3;
                param++;
        }
        buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
                                                     is_compact, vertex_index, indir_index);
 
        result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, instr->num_components, NULL,
-                                     buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true);
+                                     buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, true);
        result = trim_vector(ctx, result, instr->num_components);
        result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx, &instr->dest.ssa), "");
        return result;
 }
 
 static LLVMValueRef
 load_gs_input(struct nir_to_llvm_context *ctx,
              nir_intrinsic_instr *instr)
 {
        LLVMValueRef indir_index, vtx_offset;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 61f1384..7dd121b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -826,39 +826,39 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
                                 LLVMValueRef base, bool readonly_memory)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMValueRef value, value2;
        LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
        LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 
        if (swizzle == ~0) {
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, 1, 0, readonly_memory);
+                                            0, 1, 0, readonly_memory, true);
 
                return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
        }
 
        if (!tgsi_type_is_64bit(type)) {
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, 1, 0, readonly_memory);
+                                            0, 1, 0, readonly_memory, true);
 
                value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
                return LLVMBuildExtractElement(gallivm->builder, value,
                                    LLVMConstInt(ctx->i32, swizzle, 0), "");
        }
 
        value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                 swizzle * 4, 1, 0, readonly_memory);
+                                 swizzle * 4, 1, 0, readonly_memory, true);
 
        value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                  swizzle * 4 + 4, 1, 0, readonly_memory);
+                                  swizzle * 4 + 4, 1, 0, readonly_memory, true);
 
        return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 }
 
 /**
  * Load from LDS.
  *
  * \param type         output value type
  * \param swizzle      offset (typically 0..3); it can be ~0, which loads a vec4
  * \param dw_addr      address in dwords
@@ -1147,28 +1147,28 @@ static LLVMValueRef fetch_input_gs(
                vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
        }
        vtx_offset = lp_build_mul_imm(uint,
                                      LLVMGetParam(ctx->main_fn,
                                                   vtx_offset_param),
                                      4);
 
        soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
 
        value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
-                                    vtx_offset, soffset, 0, 1, 0, true);
+                                    vtx_offset, soffset, 0, 1, 0, true, true);
        if (tgsi_type_is_64bit(type)) {
                LLVMValueRef value2;
                soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
 
                value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
                                              ctx->i32_0, vtx_offset, soffset,
-                                             0, 1, 0, true);
+                                             0, 1, 0, true, true);
                return si_llvm_emit_fetch_64bit(bld_base, type,
                                                value, value2);
        }
        return LLVMBuildBitCast(gallivm->builder,
                                value,
                                tgsi2llvmtype(bld_base, type), "");
 }
 
 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 {
@@ -1382,26 +1382,22 @@ static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
 }
 
 
 /**
  * Load a dword from a constant buffer.
  */
 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
                                      LLVMValueRef resource,
                                      LLVMValueRef offset)
 {
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMValueRef args[2] = {resource, offset};
-
-       return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32", ctx->f32, args, 2,
-                                 LP_FUNC_ATTR_READNONE |
-                                 LP_FUNC_ATTR_LEGACY);
+       return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
+                                   0, 0, 0, true, false);
 }
 
 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
 {
        struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMBuilderRef builder = gallivm->builder;
        LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
        LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
        LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
@@ -5192,21 +5188,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
                                }
 
                                LLVMValueRef soffset = LLVMConstInt(ctx.i32,
                                        offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
                                offset++;
 
                                outputs[i].values[chan] =
                                        ac_build_buffer_load(&ctx.ac,
                                                             ctx.gsvs_ring[0], 1,
                                                             ctx.i32_0, voffset,
-                                                            soffset, 0, 1, 1, true);
+                                                            soffset, 0, 1, 1,
+                                                            true, true);
                        }
                }
 
                /* Streamout and exports. */
                if (gs_selector->so.num_outputs) {
                        si_llvm_emit_streamout(&ctx, outputs,
                                               gsinfo->num_outputs,
                                               stream);
                }
 
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to