From: Nicolai Hähnle <nicolai.haeh...@amd.com>

The overall goal is to support unaligned loads from vertex buffers
natively on SI.

In the unaligned case, we fall back to the general case implementation in
ac_build_opencoded_load_format. Since this function is fully general,
we will also use it going forward for cases requiring fully manual format
conversions of dwords anyway.

This requires a different encoding of the fix_fetch array, which will now
contain the entire format information if a fixup is required.

Having to check the alignment of vertex buffers is awkward. To keep the
impact on the fast path minimal, the si_context will keep track of which
vertex buffers are not dword-aligned, while the si_vertex_elements will
note which vertex buffers have an alignment requirement (of at most a
dword). Vertex buffers should be dword-aligned most of the time, which
allows a fast early-out in almost all cases.

Add the radeonsi_vs_fetch_always_opencode configuration variable for
testing purposes. Note that it can only be used reliably on LLVM >= 9,
because support for byte and short loads is required.
---
 .../drivers/radeonsi/si_debug_options.h       |   1 +
 src/gallium/drivers/radeonsi/si_get.c         |   2 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |   1 +
 src/gallium/drivers/radeonsi/si_shader.c      | 249 ++++++------------
 src/gallium/drivers/radeonsi/si_shader.h      |  46 ++--
 src/gallium/drivers/radeonsi/si_state.c       | 233 +++++++++-------
 src/gallium/drivers/radeonsi/si_state.h       |  19 ++
 .../drivers/radeonsi/si_state_shaders.c       |  26 +-
 8 files changed, 297 insertions(+), 280 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h 
b/src/gallium/drivers/radeonsi/si_debug_options.h
index 019256ca1d1..0bde7910fc6 100644
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -1,6 +1,7 @@
 OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth 
clear")
 OPT_BOOL(enable_nir, false, "Enable NIR")
 OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
 OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause 
stalls)")
+OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches 
(less efficient, purely for testing)")
 
 #undef OPT_BOOL
diff --git a/src/gallium/drivers/radeonsi/si_get.c 
b/src/gallium/drivers/radeonsi/si_get.c
index 4e23d283ab7..ff825c5e30a 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -190,21 +190,21 @@ static int si_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
                /* Optimal number for good TexSubImage performance on 
Polaris10. */
                return 64 * 1024 * 1024;
 
        case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
        case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
                return MIN2(sscreen->info.max_alloc_size, INT_MAX);
 
        case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
        case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
        case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-               return !sscreen->info.has_unaligned_shader_loads;
+               return HAVE_LLVM < 0x0900 && 
!sscreen->info.has_unaligned_shader_loads;
 
        case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
                return sscreen->info.has_sparse_vm_mappings ?
                                RADEON_SPARSE_PAGE_SIZE : 0;
 
        case PIPE_CAP_PACKED_UNIFORMS:
                if (sscreen->options.enable_nir)
                        return 1;
                return 0;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 7fc0319973b..1d241436a6d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -938,20 +938,21 @@ struct si_context {
        union pipe_color_union          *border_color_map; /* in VRAM (slow 
access), little endian */
        unsigned                        border_color_count;
        unsigned                        num_vs_blit_sgprs;
        uint32_t                        
vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
        uint32_t                        cs_user_data[4];
 
        /* Vertex and index buffers. */
        bool                            vertex_buffers_dirty;
        bool                            vertex_buffer_pointer_dirty;
        struct pipe_vertex_buffer       vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+       uint16_t                        vertex_buffer_unaligned; /* bitmask of 
not dword-aligned buffers */
 
        /* MSAA config state. */
        int                             ps_iter_samples;
        bool                            ps_uses_fbfetch;
        bool                            smoothing_enabled;
 
        /* DB render state. */
        unsigned                ps_db_shader_control;
        unsigned                dbcb_copy_sample;
        bool                    dbcb_depth_copy_enabled:1;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index d783555ca33..f6d882cf583 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -423,35 +423,20 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct 
si_shader_context *ctx)
                        return LLVMConstInt(ctx->i32, stride, 0);
                }
                return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
        default:
                assert(0);
                return NULL;
        }
 }
 
-/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
- * to float. */
-static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
-                                           LLVMValueRef vec4,
-                                           unsigned double_index)
-{
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
-       LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
-                                             LLVMVectorType(f64, 2), "");
-       LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
-       LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
-       return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
-}
-
 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
                                 LLVMValueRef i32, unsigned index)
 {
        assert(index <= 1);
 
        if (index == 1)
                return LLVMBuildAShr(ctx->ac.builder, i32,
                                     LLVMConstInt(ctx->i32, 16, 0), "");
 
        return LLVMBuildSExt(ctx->ac.builder,
@@ -529,226 +514,147 @@ void si_llvm_load_input_vs(
                        out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                                 y1, y2, "");
                        out[2] = LLVMGetParam(ctx->main_fn,
                                              ctx->param_vs_blit_inputs + 7);
                        out[3] = LLVMGetParam(ctx->main_fn,
                                              ctx->param_vs_blit_inputs + 8);
                }
                return;
        }
 
-       unsigned chan;
-       unsigned fix_fetch;
-       unsigned num_fetches;
-       unsigned fetch_stride;
-       unsigned num_channels;
-
+       union si_vs_fix_fetch fix_fetch;
        LLVMValueRef t_list_ptr;
        LLVMValueRef t_offset;
        LLVMValueRef t_list;
        LLVMValueRef vertex_index;
-       LLVMValueRef input[3];
+       LLVMValueRef tmp;
 
        /* Load the T list */
        t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 
        t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 
        t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
        vertex_index = LLVMGetParam(ctx->main_fn,
                                    ctx->param_vertex_index0 +
                                    input_index);
 
-       fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
+       /* Use the open-coded implementation for all loads of doubles and
+        * of dword-sized data that needs fixups. We need to insert conversion
+        * code anyway, and the amd/common code does it for us.
+        *
+        * Note: On LLVM <= 8, we can only open-code formats with
+        * channel size >= 4 bytes.
+        */
+       bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << 
input_index);
+       fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
+       if (opencode ||
+           (fix_fetch.u.log_size == 3 && fix_fetch.u.format == 
AC_FETCH_FORMAT_FLOAT) ||
+           (fix_fetch.u.log_size == 2)) {
+               tmp = ac_build_opencoded_load_format(
+                               &ctx->ac, fix_fetch.u.log_size, 
fix_fetch.u.num_channels_m1 + 1,
+                               fix_fetch.u.format, fix_fetch.u.reverse, 
!opencode,
+                               t_list, vertex_index, ctx->ac.i32_0, 
ctx->ac.i32_0,
+                               false, false, true);
+               for (unsigned i = 0; i < 4; ++i)
+                       out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, 
LLVMConstInt(ctx->i32, i, false), "");
+               return;
+       }
 
        /* Do multiple loads for special formats. */
-       switch (fix_fetch) {
-       case SI_FIX_FETCH_RG_64_FLOAT:
-               num_fetches = 1; /* 1 2-dword or 4-dword load */
-               fetch_stride = 0;
-               if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
-                       num_channels = 4; /* 2 doubles in 4 dwords */
-               else
-                       num_channels = 2; /* 1 double in 2 dwords */
-               break;
-       case SI_FIX_FETCH_RGB_64_FLOAT:
-               num_fetches = 3; /* 3 2-dword loads */
-               fetch_stride = 8;
-               num_channels = 2;
-               break;
-       case SI_FIX_FETCH_RGBA_64_FLOAT:
-               num_fetches = 2; /* 2 4-dword loads */
-               fetch_stride = 16;
-               num_channels = 4;
-               break;
-       case SI_FIX_FETCH_RGB_8:
-       case SI_FIX_FETCH_RGB_8_INT:
-               num_fetches = 3;
-               fetch_stride = 1;
-               num_channels = 1;
-               break;
-       case SI_FIX_FETCH_RGB_16:
-       case SI_FIX_FETCH_RGB_16_INT:
-               num_fetches = 3;
-               fetch_stride = 2;
-               num_channels = 1;
-               break;
-       default:
+       unsigned required_channels = 
util_last_bit(info->input_usage_mask[input_index]);
+       LLVMValueRef fetches[4];
+       unsigned num_fetches;
+       unsigned fetch_stride;
+       unsigned channels_per_fetch;
+
+       if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+               num_fetches = MIN2(required_channels, 3);
+               fetch_stride = 1 << fix_fetch.u.log_size;
+               channels_per_fetch = 1;
+       } else {
                num_fetches = 1;
                fetch_stride = 0;
-               num_channels = 
util_last_bit(info->input_usage_mask[input_index]);
+               channels_per_fetch = required_channels;
        }
 
-       for (unsigned i = 0; i < num_fetches; i++) {
+       for (unsigned i = 0; i < num_fetches; ++i) {
                LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 
0);
-
-               input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
-                                                      vertex_index, voffset,
-                                                      num_channels, false, 
true);
-               input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], 
num_channels);
+               fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, 
vertex_index, voffset,
+                                                        channels_per_fetch, 
false, true);
        }
 
-       /* Break up the vec4 into individual components */
-       for (chan = 0; chan < 4; chan++) {
-               LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
-               out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
-                                                   input[0], llvm_chan, "");
+       if (num_fetches == 1 && channels_per_fetch > 1) {
+               LLVMValueRef fetch = fetches[0];
+               for (unsigned i = 0; i < channels_per_fetch; ++i) {
+                       tmp = LLVMConstInt(ctx->i32, i, false);
+                       fetches[i] = LLVMBuildExtractElement(
+                               ctx->ac.builder, fetch, tmp, "");
+               }
+               num_fetches = channels_per_fetch;
+               channels_per_fetch = 1;
        }
 
-       switch (fix_fetch) {
-       case SI_FIX_FETCH_A2_SNORM:
-       case SI_FIX_FETCH_A2_SSCALED:
-       case SI_FIX_FETCH_A2_SINT: {
-               /* The hardware returns an unsigned value; convert it to a
-                * signed one.
+       for (unsigned i = num_fetches; i < 4; ++i)
+               fetches[i] = LLVMGetUndef(ctx->f32);
+
+       if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
+           required_channels == 4) {
+               if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || 
fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+                       fetches[3] = ctx->ac.i32_1;
+               else
+                       fetches[3] = ctx->ac.f32_1;
+       } else if (fix_fetch.u.log_size == 3 &&
+                  (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+                   fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+                   fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+                  required_channels == 4) {
+               /* For 2_10_10_10, the hardware returns an unsigned value;
+                * convert it to a signed one.
                 */
-               LLVMValueRef tmp = out[3];
+               LLVMValueRef tmp = fetches[3];
                LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 
                /* First, recover the sign-extended signed integer value. */
-               if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
+               if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
                        tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, 
"");
                else
                        tmp = ac_to_integer(&ctx->ac, tmp);
 
                /* For the integer-like cases, do a natural sign extension.
                 *
                 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
                 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
                 * exponent.
                 */
                tmp = LLVMBuildShl(ctx->ac.builder, tmp,
-                                  fix_fetch == SI_FIX_FETCH_A2_SNORM ?
+                                  fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
                                   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
                tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 
                /* Convert back to the right type. */
-               if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
+               if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
                        LLVMValueRef clamp;
                        LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
                        tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, 
"");
                        clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, 
tmp, neg_one, "");
                        tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, 
tmp, "");
-               } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
+               } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
                        tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, 
"");
                }
 
-               out[3] = tmp;
-               break;
+               fetches[3] = tmp;
        }
-       case SI_FIX_FETCH_RGBA_32_UNORM:
-       case SI_FIX_FETCH_RGBX_32_UNORM:
-               for (chan = 0; chan < 4; chan++) {
-                       out[chan] = ac_to_integer(&ctx->ac, out[chan]);
-                       out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
-                                                   out[chan], ctx->f32, "");
-                       out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
-                                                 LLVMConstReal(ctx->f32, 1.0 / 
UINT_MAX), "");
-               }
-               /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by 
normalizing. */
-               if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
-                       out[3] = LLVMConstReal(ctx->f32, 1);
-               break;
-       case SI_FIX_FETCH_RGBA_32_SNORM:
-       case SI_FIX_FETCH_RGBX_32_SNORM:
-       case SI_FIX_FETCH_RGBA_32_FIXED:
-       case SI_FIX_FETCH_RGBX_32_FIXED: {
-               double scale;
-               if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
-                       scale = 1.0 / 0x10000;
-               else
-                       scale = 1.0 / INT_MAX;
 
-               for (chan = 0; chan < 4; chan++) {
-                       out[chan] = ac_to_integer(&ctx->ac, out[chan]);
-                       out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
-                                                   out[chan], ctx->f32, "");
-                       out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
-                                                 LLVMConstReal(ctx->f32, 
scale), "");
-               }
-               /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by 
normalizing. */
-               if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
-                   fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
-                       out[3] = LLVMConstReal(ctx->f32, 1);
-               break;
-       }
-       case SI_FIX_FETCH_RGBA_32_USCALED:
-               for (chan = 0; chan < 4; chan++) {
-                       out[chan] = ac_to_integer(&ctx->ac, out[chan]);
-                       out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
-                                                   out[chan], ctx->f32, "");
-               }
-               break;
-       case SI_FIX_FETCH_RGBA_32_SSCALED:
-               for (chan = 0; chan < 4; chan++) {
-                       out[chan] = ac_to_integer(&ctx->ac, out[chan]);
-                       out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
-                                                   out[chan], ctx->f32, "");
-               }
-               break;
-       case SI_FIX_FETCH_RG_64_FLOAT:
-               for (chan = 0; chan < 2; chan++)
-                       out[chan] = extract_double_to_float(ctx, input[0], 
chan);
-
-               out[2] = LLVMConstReal(ctx->f32, 0);
-               out[3] = LLVMConstReal(ctx->f32, 1);
-               break;
-       case SI_FIX_FETCH_RGB_64_FLOAT:
-               for (chan = 0; chan < 3; chan++)
-                       out[chan] = extract_double_to_float(ctx, input[chan], 
0);
-
-               out[3] = LLVMConstReal(ctx->f32, 1);
-               break;
-       case SI_FIX_FETCH_RGBA_64_FLOAT:
-               for (chan = 0; chan < 4; chan++) {
-                       out[chan] = extract_double_to_float(ctx, input[chan / 
2],
-                                                           chan % 2);
-               }
-               break;
-       case SI_FIX_FETCH_RGB_8:
-       case SI_FIX_FETCH_RGB_8_INT:
-       case SI_FIX_FETCH_RGB_16:
-       case SI_FIX_FETCH_RGB_16_INT:
-               for (chan = 0; chan < 3; chan++) {
-                       out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
-                                                           input[chan],
-                                                           ctx->i32_0, "");
-               }
-               if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
-                   fix_fetch == SI_FIX_FETCH_RGB_16) {
-                       out[3] = LLVMConstReal(ctx->f32, 1);
-               } else {
-                       out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
-               }
-               break;
-       }
+       for (unsigned i = 0; i < 4; ++i)
+               out[i] = ac_to_float(&ctx->ac, fetches[i]);
 }
 
 static void declare_input_vs(
        struct si_shader_context *ctx,
        unsigned input_index,
        const struct tgsi_full_declaration *decl,
        LLVMValueRef out[4])
 {
        si_llvm_load_input_vs(ctx, input_index, out);
 }
@@ -5770,23 +5676,32 @@ static void si_dump_shader_key_vs(const struct 
si_shader_key *key,
                                  const struct si_vs_prolog_bits *prolog,
                                  const char *prefix, FILE *f)
 {
        fprintf(f, "  %s.instance_divisor_is_one = %u\n",
                prefix, prolog->instance_divisor_is_one);
        fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
                prefix, prolog->instance_divisor_is_fetched);
        fprintf(f, "  %s.ls_vgpr_fix = %u\n",
                prefix, prolog->ls_vgpr_fix);
 
+       fprintf(f, "  mono.vs.fetch_opencode = %x\n", 
key->mono.vs_fetch_opencode);
        fprintf(f, "  mono.vs.fix_fetch = {");
-       for (int i = 0; i < SI_MAX_ATTRIBS; i++)
-               fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
+       for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
+               union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
+               if (i)
+                       fprintf(f, ", ");
+               if (!fix.bits)
+                       fprintf(f, "0");
+               else
+                       fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
+                               fix.u.num_channels_m1, fix.u.format);
+       }
        fprintf(f, "}\n");
 }
 
 static void si_dump_shader_key(unsigned processor, const struct si_shader 
*shader,
                               FILE *f)
 {
        const struct si_shader_key *key = &shader->key;
 
        fprintf(f, "SHADER KEY\n");
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index f9f81a7bc1e..ecf7f8bbd7a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -266,41 +266,38 @@ enum {
 
        /* Use a property enum that VS wouldn't use. */
        TGSI_PROPERTY_VS_BLIT_SGPRS = TGSI_PROPERTY_FS_COORD_ORIGIN,
 
        /* These represent the number of SGPRs the shader uses. */
        SI_VS_BLIT_SGPRS_POS = 3,
        SI_VS_BLIT_SGPRS_POS_COLOR = 7,
        SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
 };
 
-/* For VS shader key fix_fetch. */
-enum {
-       SI_FIX_FETCH_NONE = 0,
-       SI_FIX_FETCH_A2_SNORM,
-       SI_FIX_FETCH_A2_SSCALED,
-       SI_FIX_FETCH_A2_SINT,
-       SI_FIX_FETCH_RGBA_32_UNORM,
-       SI_FIX_FETCH_RGBX_32_UNORM,
-       SI_FIX_FETCH_RGBA_32_SNORM,
-       SI_FIX_FETCH_RGBX_32_SNORM,
-       SI_FIX_FETCH_RGBA_32_USCALED,
-       SI_FIX_FETCH_RGBA_32_SSCALED,
-       SI_FIX_FETCH_RGBA_32_FIXED,
-       SI_FIX_FETCH_RGBX_32_FIXED,
-       SI_FIX_FETCH_RG_64_FLOAT,
-       SI_FIX_FETCH_RGB_64_FLOAT,
-       SI_FIX_FETCH_RGBA_64_FLOAT,
-       SI_FIX_FETCH_RGB_8,     /* A = 1.0 */
-       SI_FIX_FETCH_RGB_8_INT, /* A = 1 */
-       SI_FIX_FETCH_RGB_16,
-       SI_FIX_FETCH_RGB_16_INT,
+/**
+ * For VS shader keys, describe any fixups required for vertex fetch.
+ *
+ * \ref log_size, \ref format, and the number of channels are interpreted as
+ * by \ref ac_build_opencoded_load_format.
+ *
+ * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
+ * impossible format and indicates that no fixup is needed (just use
+ * buffer_load_format_xyzw).
+ */
+union si_vs_fix_fetch {
+       struct {
+               uint8_t log_size : 2; /* 1, 2, 4, or 8 bytes per channel */
+               uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
+               uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
+               uint8_t reverse : 1; /* reverse XYZ channels */
+       } u;
+       uint8_t bits;
 };
 
 struct si_shader;
 
 /* State of the context creating the shader object. */
 struct si_compiler_ctx_state {
        /* Should only be used by si_init_shader_selector_async and
         * si_build_shader_variant if thread_index == -1 (non-threaded). */
        struct ac_llvm_compiler         *compiler;
 
@@ -517,22 +514,25 @@ struct si_shader_key {
        } part;
 
        /* These two are initially set according to the NEXT_SHADER property,
         * or guessed if the property doesn't seem correct.
         */
        unsigned as_es:1; /* export shader, which precedes GS */
        unsigned as_ls:1; /* local shader, which precedes TCS */
 
        /* Flags for monolithic compilation only. */
        struct {
-               /* One byte for every input: SI_FIX_FETCH_* enums. */
-               uint8_t         vs_fix_fetch[SI_MAX_ATTRIBS];
+               /* Whether fetch should be opencoded according to vs_fix_fetch.
+                * Otherwise, if vs_fix_fetch is non-zero, 
buffer_load_format_xyzw
+                * with minimal fixups is used. */
+               uint16_t vs_fetch_opencode;
+               union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
 
                union {
                        uint64_t        ff_tcs_inputs_to_copy; /* for 
fixed-func TCS */
                        /* When PS needs PrimID and GS is disabled. */
                        unsigned        vs_export_prim_id:1;
                        struct {
                                unsigned interpolate_at_sample_force_center:1;
                                unsigned fbfetch_msaa;
                                unsigned fbfetch_is_1D;
                                unsigned fbfetch_layered;
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 757c17f7df8..8f4ced1f1b5 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4456,24 +4456,22 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
        assert(count <= SI_MAX_ATTRIBS);
        if (!v)
                return NULL;
 
        v->count = count;
        v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT);
 
        for (i = 0; i < count; ++i) {
                const struct util_format_description *desc;
                const struct util_format_channel_description *channel;
-               unsigned data_format, num_format;
                int first_non_void;
                unsigned vbo_index = elements[i].vertex_buffer_index;
-               unsigned char swizzle[4];
 
                if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
                        FREE(v);
                        return NULL;
                }
 
                unsigned instance_divisor = elements[i].instance_divisor;
                if (instance_divisor) {
                        v->uses_instance_divisors = true;
 
@@ -4486,119 +4484,151 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
                        }
                }
 
                if (!used[vbo_index]) {
                        v->first_vb_use_mask |= 1 << i;
                        used[vbo_index] = true;
                }
 
                desc = util_format_description(elements[i].src_format);
                first_non_void = 
util_format_get_first_non_void_channel(elements[i].src_format);
-               data_format = si_translate_buffer_dataformat(ctx->screen, desc, 
first_non_void);
-               num_format = si_translate_buffer_numformat(ctx->screen, desc, 
first_non_void);
                channel = first_non_void >= 0 ? &desc->channel[first_non_void] 
: NULL;
-               memcpy(swizzle, desc->swizzle, sizeof(swizzle));
 
                v->format_size[i] = desc->block.bits / 8;
                v->src_offset[i] = elements[i].src_offset;
                v->vertex_buffer_index[i] = vbo_index;
 
-               /* The hardware always treats the 2-bit alpha channel as
-                * unsigned, so a shader workaround is needed. The affected
-                * chips are VI and older except Stoney (GFX8.1).
-                */
-               if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
-                   sscreen->info.chip_class <= VI &&
-                   sscreen->info.family != CHIP_STONEY) {
-                       if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
-                       } else if (num_format == 
V_008F0C_BUF_NUM_FORMAT_SSCALED) {
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED;
-                       } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) {
-                               /* This isn't actually used in OpenGL. */
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT;
-                       }
-               } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
-                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED;
-                       else
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED;
-               } else if (channel && channel->size == 32 && 
!channel->pure_integer) {
-                       if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
-                               if (channel->normalized) {
-                                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                                               v->fix_fetch[i] = 
SI_FIX_FETCH_RGBX_32_SNORM;
-                                       else
-                                               v->fix_fetch[i] = 
SI_FIX_FETCH_RGBA_32_SNORM;
-                               } else {
-                                       v->fix_fetch[i] = 
SI_FIX_FETCH_RGBA_32_SSCALED;
-                               }
-                       } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
-                               if (channel->normalized) {
-                                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                                               v->fix_fetch[i] = 
SI_FIX_FETCH_RGBX_32_UNORM;
-                                       else
-                                               v->fix_fetch[i] = 
SI_FIX_FETCH_RGBA_32_UNORM;
-                               } else {
-                                       v->fix_fetch[i] = 
SI_FIX_FETCH_RGBA_32_USCALED;
-                               }
-                       }
-               } else if (channel && channel->size == 64 &&
-                          channel->type == UTIL_FORMAT_TYPE_FLOAT) {
-                       switch (desc->nr_channels) {
-                       case 1:
-                       case 2:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X;
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = desc->nr_channels == 2 ? 
PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
-                               swizzle[3] = desc->nr_channels == 2 ? 
PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
-                               break;
-                       case 3:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = PIPE_SWIZZLE_0;
-                               swizzle[3] = PIPE_SWIZZLE_0;
-                               break;
-                       case 4:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = PIPE_SWIZZLE_Z;
-                               swizzle[3] = PIPE_SWIZZLE_W;
-                               break;
-                       default:
-                               assert(0);
-                       }
-               } else if (channel && desc->nr_channels == 3) {
-                       assert(desc->swizzle[0] == PIPE_SWIZZLE_X);
+               bool always_fix = false;
+               union si_vs_fix_fetch fix_fetch;
+               unsigned log_hw_load_size; /* the load element size as seen by 
the hardware */
 
-                       if (channel->size == 8) {
+               fix_fetch.bits = 0;
+               log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
+
+               if (channel) {
+                       switch (channel->type) {
+                       case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = 
AC_FETCH_FORMAT_FLOAT; break;
+                       case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = 
AC_FETCH_FORMAT_FIXED; break;
+                       case UTIL_FORMAT_TYPE_SIGNED: {
                                if (channel->pure_integer)
-                                       v->fix_fetch[i] = 
SI_FIX_FETCH_RGB_8_INT;
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_SINT;
+                               else if (channel->normalized)
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_SNORM;
                                else
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_8;
-                       } else if (channel->size == 16) {
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_SSCALED;
+                               break;
+                       }
+                       case UTIL_FORMAT_TYPE_UNSIGNED: {
                                if (channel->pure_integer)
-                                       v->fix_fetch[i] = 
SI_FIX_FETCH_RGB_16_INT;
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_UINT;
+                               else if (channel->normalized)
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_UNORM;
                                else
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_16;
+                                       fix_fetch.u.format = 
AC_FETCH_FORMAT_USCALED;
+                               break;
+                       }
+                       default: unreachable("bad format type");
+                       }
+               } else {
+                       switch (elements[i].src_format) {
+                       case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = 
AC_FETCH_FORMAT_FLOAT; break;
+                       default: unreachable("bad other format");
                        }
                }
 
-               v->rsrc_word3[i] = 
S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
-                                  
S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
-                                  
S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
-                                  
S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-                                  S_008F0C_NUM_FORMAT(num_format) |
-                                  S_008F0C_DATA_FORMAT(data_format);
+               if (desc->channel[0].size == 10) {
+                       fix_fetch.u.log_size = 3; /* special encoding for 
2_10_10_10 */
+                       log_hw_load_size = 2;
+
+                       /* The hardware always treats the 2-bit alpha channel as
+                        * unsigned, so a shader workaround is needed. The affected
+                        * chips are VI and older except Stoney (GFX8.1).
+                        */
+                       always_fix = sscreen->info.chip_class <= VI &&
+                                    sscreen->info.family != CHIP_STONEY &&
+                                    channel->type == UTIL_FORMAT_TYPE_SIGNED;
+               } else if (elements[i].src_format == 
PIPE_FORMAT_R11G11B10_FLOAT) {
+                       fix_fetch.u.log_size = 3; /* special encoding */
+                       fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+                       log_hw_load_size = 2;
+               } else {
+                       fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+                       fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+                       /* Always fix up:
+                        * - doubles (multiple loads + truncate to float)
+                        * - 32-bit requiring a conversion
+                        */
+                       always_fix =
+                               (fix_fetch.u.log_size == 3) ||
+                               (fix_fetch.u.log_size == 2 &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+                       /* Also fixup 8_8_8 and 16_16_16. */
+                       if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 
1) {
+                               always_fix = true;
+                               log_hw_load_size = fix_fetch.u.log_size;
+                       }
+               }
+
+               if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+                       assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+                              (desc->swizzle[2] == PIPE_SWIZZLE_X || 
desc->swizzle[2] == PIPE_SWIZZLE_0));
+                       fix_fetch.u.reverse = 1;
+               }
+
+               /* Force the workaround for unaligned access here already if the
+                * offset relative to the vertex buffer base is unaligned.
+                *
+                * There is a theoretical case in which this is too conservative:
+                * if the vertex buffer's offset is also unaligned in just the
+                * right way, we end up with an aligned address after all.
+                * However, this case should be extremely rare in practice (it
+                * won't happen in well-behaved applications), and taking it
+                * into account would complicate the fast path (where everything
+                * is nicely aligned).
+                */
+               bool check_alignment = log_hw_load_size >= 1 && 
sscreen->info.chip_class == SI;
+               bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+               if (check_alignment &&
+                   (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 
0)
+                       opencode = true;
+
+               if (always_fix || check_alignment || opencode)
+                       v->fix_fetch[i] = fix_fetch.bits;
+
+               if (opencode)
+                       v->fix_fetch_opencode |= 1 << i;
+               if (opencode || always_fix)
+                       v->fix_fetch_always |= 1 << i;
+
+               if (check_alignment && !opencode) {
+                       assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+                       v->fix_fetch_unaligned |= 1 << i;
+                       v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+                       v->vb_alignment_check_mask |= 1 << vbo_index;
+               }
+
+               v->rsrc_word3[i] = 
S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+                                  
S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+                                  
S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+                                  
S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+               unsigned data_format, num_format;
+               data_format = si_translate_buffer_dataformat(ctx->screen, desc, 
first_non_void);
+               num_format = si_translate_buffer_numformat(ctx->screen, desc, 
first_non_void);
+               v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
+                                   S_008F0C_DATA_FORMAT(data_format);
        }
 
        if (v->instance_divisor_is_fetched) {
                unsigned num_divisors = 
util_last_bit(v->instance_divisor_is_fetched);
 
                v->instance_divisor_factor_buffer =
                        (struct si_resource*)
                        pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
                                           num_divisors * 
sizeof(divisor_factors[0]));
                if (!v->instance_divisor_factor_buffer) {
@@ -4618,21 +4648,27 @@ static void si_bind_vertex_elements(struct pipe_context 
*ctx, void *state)
        struct si_vertex_elements *old = sctx->vertex_elements;
        struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
        sctx->vertex_elements = v;
        sctx->vertex_buffers_dirty = true;
 
        if (v &&
            (!old ||
             old->count != v->count ||
             old->uses_instance_divisors != v->uses_instance_divisors ||
-            v->uses_instance_divisors || /* we don't check which divisors 
changed */
+            /* we don't check which divisors changed */
+            v->uses_instance_divisors ||
+            /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+             * functions of fix_fetch and the src_offset alignment.
+             * If they change and fix_fetch doesn't, it must be due to different
+             * src_offset alignment, which is reflected in fix_fetch_opencode. */
+            old->fix_fetch_opencode != v->fix_fetch_opencode ||
             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * 
v->count)))
                sctx->do_update_shaders = true;
 
        if (v && v->instance_divisor_is_fetched) {
                struct pipe_constant_buffer cb;
 
                cb.buffer = &v->instance_divisor_factor_buffer->b.b;
                cb.user_buffer = NULL;
                cb.buffer_offset = 0;
                cb.buffer_size = 0xffffffff;
@@ -4650,43 +4686,64 @@ static void si_delete_vertex_element(struct 
pipe_context *ctx, void *state)
        si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
 
 static void si_set_vertex_buffers(struct pipe_context *ctx,
                                  unsigned start_slot, unsigned count,
                                  const struct pipe_vertex_buffer *buffers)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+       uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+       uint32_t unaligned = orig_unaligned;
        int i;
 
        assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
 
        if (buffers) {
                for (i = 0; i < count; i++) {
                        const struct pipe_vertex_buffer *src = buffers + i;
                        struct pipe_vertex_buffer *dsti = dst + i;
                        struct pipe_resource *buf = src->buffer.resource;
 
                        pipe_resource_reference(&dsti->buffer.resource, buf);
                        dsti->buffer_offset = src->buffer_offset;
                        dsti->stride = src->stride;
+                       if (dsti->buffer_offset & 3 || dsti->stride & 3)
+                               unaligned |= 1 << (start_slot + i);
+                       else
+                               unaligned &= ~(1 << (start_slot + i));
+
                        si_context_add_resource_size(sctx, buf);
                        if (buf)
                                si_resource(buf)->bind_history |= 
PIPE_BIND_VERTEX_BUFFER;
                }
        } else {
                for (i = 0; i < count; i++) {
                        pipe_resource_reference(&dst[i].buffer.resource, NULL);
                }
+               unaligned &= ~u_bit_consecutive(start_slot, count);
        }
        sctx->vertex_buffers_dirty = true;
+       sctx->vertex_buffer_unaligned = unaligned;
+
+       /* Check whether alignment may have changed in a way that requires
+        * shader changes. This check is conservative: a vertex buffer can only
+        * trigger a shader change if the misalignment amount changes (e.g.
+        * from byte-aligned to short-aligned), but we only keep track of
+        * whether buffers are at least dword-aligned, since that should always
+        * be the case in well-behaved applications anyway.
+        */
+       if (sctx->vertex_elements &&
+           (sctx->vertex_elements->vb_alignment_check_mask &
+            (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, 
count)))
+               sctx->do_update_shaders = true;
 }
 
 /*
  * Misc
  */
 
 static void si_set_tess_state(struct pipe_context *ctx,
                              const float default_outer_level[4],
                              const float default_inner_level[2])
 {
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index b0802416c73..c5b4dc95b4b 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -132,20 +132,39 @@ struct si_stencil_ref {
 
 struct si_vertex_elements
 {
        struct si_resource              *instance_divisor_factor_buffer;
        uint32_t                        rsrc_word3[SI_MAX_ATTRIBS];
        uint16_t                        src_offset[SI_MAX_ATTRIBS];
        uint8_t                         fix_fetch[SI_MAX_ATTRIBS];
        uint8_t                         format_size[SI_MAX_ATTRIBS];
        uint8_t                         vertex_buffer_index[SI_MAX_ATTRIBS];
 
+       /* Bitmask of elements that always need a fixup to be applied. */
+       uint16_t                        fix_fetch_always;
+
+       /* Bitmask of elements whose fetch should always be opencoded. */
+       uint16_t                        fix_fetch_opencode;
+
+       /* Bitmask of elements which need to be opencoded if the vertex buffer
+        * is unaligned. */
+       uint16_t                        fix_fetch_unaligned;
+
+       /* For elements in fix_fetch_unaligned: whether the effective
+        * element load size as seen by the hardware is a dword (as opposed
+        * to a short).
+        */
+       uint16_t                        hw_load_is_dword;
+
+       /* Bitmask of vertex buffers requiring alignment check */
+       uint16_t                        vb_alignment_check_mask;
+
        uint8_t                         count;
        bool                            uses_instance_divisors;
 
        uint16_t                        first_vb_use_mask;
        /* Vertex buffer descriptor list size aligned for optimal prefetch. */
        uint16_t                        desc_list_byte_size;
        uint16_t                        instance_divisor_is_one; /* bitmask of 
inputs */
        uint16_t                        instance_divisor_is_fetched;  /* 
bitmask of inputs */
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 583d7c9d3ca..c287db28d5f 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1383,21 +1383,45 @@ static void si_shader_selector_key_vs(struct si_context 
*sctx,
 
        prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
        prolog_key->instance_divisor_is_fetched = 
elts->instance_divisor_is_fetched;
 
        /* Prefer a monolithic shader to allow scheduling divisions around
         * VBO loads. */
        if (prolog_key->instance_divisor_is_fetched)
                key->opt.prefer_mono = 1;
 
        unsigned count = MIN2(vs->info.num_inputs, elts->count);
-       memcpy(key->mono.vs_fix_fetch, elts->fix_fetch, count);
+       unsigned count_mask = (1 << count) - 1;
+       unsigned fix = elts->fix_fetch_always & count_mask;
+       unsigned opencode = elts->fix_fetch_opencode & count_mask;
+
+       if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
+               uint32_t mask = elts->fix_fetch_unaligned & count_mask;
+               while (mask) {
+                       unsigned i = u_bit_scan(&mask);
+                       unsigned log_hw_load_size = 1 + 
((elts->hw_load_is_dword >> i) & 1);
+                       unsigned vbidx = elts->vertex_buffer_index[i];
+                       struct pipe_vertex_buffer *vb = 
&sctx->vertex_buffer[vbidx];
+                       unsigned align_mask = (1 << log_hw_load_size) - 1;
+                       if (vb->buffer_offset & align_mask ||
+                           vb->stride & align_mask) {
+                               fix |= 1 << i;
+                               opencode |= 1 << i;
+                       }
+               }
+       }
+
+       while (fix) {
+               unsigned i = u_bit_scan(&fix);
+               key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+       }
+       key->mono.vs_fetch_opencode = opencode;
 }
 
 static void si_shader_selector_key_hw_vs(struct si_context *sctx,
                                         struct si_shader_selector *vs,
                                         struct si_shader_key *key)
 {
        struct si_shader_selector *ps = sctx->ps_shader.cso;
 
        key->opt.clip_disable =
                sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
-- 
2.20.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to