[Mesa-dev] [PATCH 11/14] radeonsi: Enable dynamic HS.

2016-05-10 Thread Bas Nieuwenhuizen
This allows running the TES on different CUs than the
TCS, which results in performance improvements.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_shader.c| 11 ---
 src/gallium/drivers/radeonsi/si_state_shaders.c |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 4516ea2..5728be0 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2583,12 +2583,17 @@ static void si_write_tess_factors(struct 
lp_build_tgsi_context *bld_base,
byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
  lp_build_const_int32(gallivm, 4 * stride), 
"");
 
-   /* Store the outputs. */
+   /* Store the dynamic HS control word. */
+   build_tbuffer_store_dwords(ctx, buffer,
+  lp_build_const_int32(gallivm, 0x8000),
+  1, lp_build_const_int32(gallivm, 0), 
tf_base, 0);
+
+   /* Store the tessellation factors. */
build_tbuffer_store_dwords(ctx, buffer, vec0,
-  MIN2(stride, 4), byteoffset, tf_base, 0);
+  MIN2(stride, 4), byteoffset, tf_base, 4);
if (vec1)
build_tbuffer_store_dwords(ctx, buffer, vec1,
-  stride - 4, byteoffset, tf_base, 16);
+  stride - 4, byteoffset, tf_base, 20);
lp_build_endif(_ctx);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index f48582a..43f4a84 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1873,7 +1873,7 @@ static void si_update_vgt_shader_config(struct si_context 
*sctx)
 
if (sctx->tes_shader.cso) {
stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
- S_028B54_HS_EN(1);
+ S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
 
if (sctx->gs_shader.cso)
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
-- 
2.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/14] radeonsi: Add buffer for offchip storage between TCS and TES.

2016-05-10 Thread Bas Nieuwenhuizen
The buffer is quite large, but should only be allocated if the
application uses tessellation. Most non-games don't.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_pipe.c  |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h  |  1 +
 src/gallium/drivers/radeonsi/si_state.h |  1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c | 17 +
 4 files changed, 20 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 61d5578..10123d4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -48,6 +48,7 @@ static void si_destroy_context(struct pipe_context *context)
pipe_resource_reference(>esgs_ring, NULL);
pipe_resource_reference(>gsvs_ring, NULL);
pipe_resource_reference(>tf_ring, NULL);
+   pipe_resource_reference(>tess_offchip_ring, NULL);
pipe_resource_reference(>null_const_buf.buffer, NULL);
r600_resource_reference(>border_color_buffer, NULL);
free(sctx->border_color_table);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index d31e9a9..b219dd4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -256,6 +256,7 @@ struct si_context {
struct pipe_resource*esgs_ring;
struct pipe_resource*gsvs_ring;
struct pipe_resource*tf_ring;
+   struct pipe_resource*tess_offchip_ring;
union pipe_color_union  *border_color_table; /* in CPU memory, 
any endian */
struct r600_resource*border_color_buffer;
union pipe_color_union  *border_color_map; /* in VRAM (slow 
access), little endian */
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index f2a3b03..6d4346b 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -155,6 +155,7 @@ struct si_shader_data {
 /* Private read-write buffer slots. */
 enum {
SI_HS_RING_TESS_FACTOR,
+   SI_HS_RING_TESS_OFFCHIP,
 
SI_ES_RING_ESGS,
SI_GS_RING_ESGS,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0bfd7e8..0fa32c8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1772,6 +1772,15 @@ static void si_init_tess_factor_ring(struct si_context 
*sctx)
 
assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0);
 
+   /* The size is derived from 256 blocks of 8192 dwords each, as set in
+* R_03093C_VGT_HS_OFFCHIP_PARAM. */
+   sctx->tess_offchip_ring = pipe_buffer_create(sctx->b.b.screen,
+PIPE_BIND_CUSTOM,
+PIPE_USAGE_DEFAULT,
+8 * 1024 * 1024);
+   if (!sctx->tess_offchip_ring)
+   return;
+
si_init_config_add_vgt_flush(sctx);
 
/* Append these registers to the init config state. */
@@ -1787,6 +1796,10 @@ static void si_init_tess_factor_ring(struct si_context 
*sctx)
   r600_resource(sctx->tf_ring)->gpu_address >> 8);
}
 
+   si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
+  S_03093C_OFFCHIP_BUFFERING(0xFF) |
+  S_03093C_OFFCHIP_GRANULARITY(V_03093C_X_8K_DWORDS));
+
/* Flush the context to re-emit the init_config state.
 * This is done only once in a lifetime of a context.
 */
@@ -1796,6 +1809,10 @@ static void si_init_tess_factor_ring(struct si_context 
*sctx)
 
si_set_ring_buffer(>b.b, SI_HS_RING_TESS_FACTOR, sctx->tf_ring,
   0, sctx->tf_ring->width0, false, false, 0, 0, 0);
+
+   si_set_ring_buffer(>b.b, SI_HS_RING_TESS_OFFCHIP,
+  sctx->tess_offchip_ring, 0,
+  sctx->tess_offchip_ring->width0, false, false, 0, 0, 
0);
 }
 
 /**
-- 
2.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/14] radeonsi: Store inputs to memory when not using a TCS.

2016-05-10 Thread Bas Nieuwenhuizen
We need to copy the VS outputs to memory. I decided to do this
using a shader key, as the value depends on other shaders.

I also switch the fixed function TCS over to monolithic, as
otherwise many of the user SGPRs need to be passed to the
epilog, which increases register pressure, or complexity to
avoid that. The main body of the fixed function TCS is not
that interesting to precompile anyway, since we do it on
demand and it is very small.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_shader.c| 45 +
 src/gallium/drivers/radeonsi/si_shader.h|  1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c |  3 ++
 3 files changed, 49 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 90830ee..50c48bf 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2423,6 +2423,48 @@ handle_semantic:
}
 }
 
+static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
+{
+   struct si_shader_context *ctx = si_shader_context(bld_base);
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
+   LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
+   unsigned num_outputs, i;
+
+   invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+
+   rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
+   buffer = build_indexed_load_const(ctx, rw_buffers,
+   lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
+
+   buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, 
ctx->param_oc_lds);
+
+   lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+   lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
+lds_vertex_stride, "");
+   lds_base = get_tcs_in_current_patch_offset(ctx);
+   lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, 
"");
+
+   num_outputs = 
util_last_bit64(ctx->shader->key.tcs.epilog.inputs_to_copy);
+   for (i = 0; i < num_outputs; i++) {
+   if (!((1llu << i) & ctx->shader->key.tcs.epilog.inputs_to_copy))
+   continue;
+
+   LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
+   lp_build_const_int32(gallivm, 4 * 
i),
+"");
+
+   LLVMValueRef buffer_addr = get_buffer_address(ctx, 
invocation_id,
+ lp_build_const_int32(gallivm, i));
+
+   LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
+ lds_ptr);
+
+   build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
+  buffer_offset, 0);
+   }
+}
+
 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
  LLVMValueRef rel_patch_id,
  LLVMValueRef invocation_id,
@@ -2564,6 +2606,7 @@ static void si_llvm_emit_tcs_epilogue(struct 
lp_build_tgsi_context *bld_base)
return;
}
 
+   si_copy_tcs_inputs(bld_base);
si_write_tess_factors(bld_base, rel_patch_id, invocation_id, 
tf_lds_offset);
 }
 
@@ -7374,6 +7417,8 @@ int si_shader_create(struct si_screen *sscreen, 
LLVMTargetMachineRef tm,
  shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
(shader->selector->type == PIPE_SHADER_TESS_EVAL &&
 shader->key.tes.as_es != mainp->key.tes.as_es) ||
+   (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
+shader->key.tcs.epilog.inputs_to_copy) ||
shader->selector->type == PIPE_SHADER_COMPUTE) {
/* Monolithic shader (compiled as a whole, has many variants,
 * may take a long time to compile).
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 26be25e..67b457b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -304,6 +304,7 @@ struct si_vs_epilog_bits {
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
unsignedprim_mode:3;
+   uint64_tinputs_to_copy;
 };
 
 /* Common PS bits between the shader key and the prolog key. */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 32ac95d..f48582a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/driver

[Mesa-dev] [PATCH 03/14] radeonsi: Define build_tbuffer_store_dwords earlier to support new users.

2016-05-10 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_shader.c | 138 +++
 1 file changed, 69 insertions(+), 69 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 29ff68f..5897149 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -664,6 +664,75 @@ static LLVMValueRef get_dw_address(struct 
si_shader_context *ctx,
lp_build_const_int32(gallivm, param * 4), "");
 }
 
+/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by 
num_channels=1..4.
+ * The type of vdata must be one of i32 (num_channels=1), v2i32 
(num_channels=2),
+ * or v4i32 (num_channels=3,4). */
+static void build_tbuffer_store(struct si_shader_context *ctx,
+   LLVMValueRef rsrc,
+   LLVMValueRef vdata,
+   unsigned num_channels,
+   LLVMValueRef vaddr,
+   LLVMValueRef soffset,
+   unsigned inst_offset,
+   unsigned dfmt,
+   unsigned nfmt,
+   unsigned offen,
+   unsigned idxen,
+   unsigned glc,
+   unsigned slc,
+   unsigned tfe)
+{
+   struct gallivm_state *gallivm = >radeon_bld.gallivm;
+   LLVMValueRef args[] = {
+   rsrc,
+   vdata,
+   LLVMConstInt(ctx->i32, num_channels, 0),
+   vaddr,
+   soffset,
+   LLVMConstInt(ctx->i32, inst_offset, 0),
+   LLVMConstInt(ctx->i32, dfmt, 0),
+   LLVMConstInt(ctx->i32, nfmt, 0),
+   LLVMConstInt(ctx->i32, offen, 0),
+   LLVMConstInt(ctx->i32, idxen, 0),
+   LLVMConstInt(ctx->i32, glc, 0),
+   LLVMConstInt(ctx->i32, slc, 0),
+   LLVMConstInt(ctx->i32, tfe, 0)
+   };
+
+   /* The instruction offset field has 12 bits */
+   assert(offen || inst_offset < (1 << 12));
+
+   /* The intrinsic is overloaded, we need to add a type suffix for 
overloading to work. */
+   unsigned func = CLAMP(num_channels, 1, 3) - 1;
+   const char *types[] = {"i32", "v2i32", "v4i32"};
+   char name[256];
+   snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+
+   lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
+  args, Elements(args), 0);
+}
+
+static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
+LLVMValueRef rsrc,
+LLVMValueRef vdata,
+unsigned num_channels,
+LLVMValueRef vaddr,
+LLVMValueRef soffset,
+unsigned inst_offset)
+{
+   static unsigned dfmt[] = {
+   V_008F0C_BUF_DATA_FORMAT_32,
+   V_008F0C_BUF_DATA_FORMAT_32_32,
+   V_008F0C_BUF_DATA_FORMAT_32_32_32,
+   V_008F0C_BUF_DATA_FORMAT_32_32_32_32
+   };
+   assert(num_channels >= 1 && num_channels <= 4);
+
+   build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
+   inst_offset, dfmt[num_channels-1],
+   V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
+}
+
 /**
  * Load from LDS.
  *
@@ -1837,75 +1906,6 @@ static void si_dump_streamout(struct 
pipe_stream_output_info *so)
}
 }
 
-/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by 
num_channels=1..4.
- * The type of vdata must be one of i32 (num_channels=1), v2i32 
(num_channels=2),
- * or v4i32 (num_channels=3,4). */
-static void build_tbuffer_store(struct si_shader_context *ctx,
-   LLVMValueRef rsrc,
-   LLVMValueRef vdata,
-   unsigned num_channels,
-   LLVMValueRef vaddr,
-   LLVMValueRef soffset,
-   unsigned inst_offset,
-   unsigned dfmt,
-   unsigned nfmt,
-   unsigned offen,
-   unsigned idxen,
-   unsigned glc,
-   unsigned slc,
-   unsigned tfe)
-{
-   struct gallivm_state *gallivm = >radeon_bld.gallivm;
-   LLVMValueRef args[] = {
-   rsrc,
-   vdata,
-   LLVMConstInt(ctx-&

Re: [Mesa-dev] [PATCH] gallium/util: fix undefined shift to the last bit in u_bit_scan

2016-04-15 Thread Bas Nieuwenhuizen
Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>

On Sat, Apr 16, 2016 at 2:13 AM, Marek Olšák <mar...@gmail.com> wrote:
> From: Marek Olšák <marek.ol...@amd.com>
>
> ---
>  src/gallium/auxiliary/util/u_math.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/gallium/auxiliary/util/u_math.h 
> b/src/gallium/auxiliary/util/u_math.h
> index 0a82915..d983af3 100644
> --- a/src/gallium/auxiliary/util/u_math.h
> +++ b/src/gallium/auxiliary/util/u_math.h
> @@ -489,7 +489,7 @@ static inline int
>  u_bit_scan(unsigned *mask)
>  {
> int i = ffs(*mask) - 1;
> -   *mask &= ~(1 << i);
> +   *mask &= ~(1u << i);
> return i;
>  }
>
> --
> 2.5.0
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] gallium/util: fix u_bit_scan_consecutive_range for mask == 0xffffffff

2016-04-15 Thread Bas Nieuwenhuizen
On Sat, Apr 16, 2016 at 12:34 AM, Marek Olšák <mar...@gmail.com> wrote:
> From: Marek Olšák <marek.ol...@amd.com>
>
> The second ffs returns 0, yielding count == -1.
> ---
>  src/gallium/auxiliary/util/u_math.h | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/src/gallium/auxiliary/util/u_math.h 
> b/src/gallium/auxiliary/util/u_math.h
> index b4ac0db..2880eea 100644
> --- a/src/gallium/auxiliary/util/u_math.h
> +++ b/src/gallium/auxiliary/util/u_math.h
> @@ -518,6 +518,12 @@ u_bit_scan64(uint64_t *mask)
>  static inline void
>  u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
>  {
> +   if (*mask == 0xffffffff) {
> +  *start = 0;
> +  *count = 32;
> +  *mask = 0;
> +  return;
> +   }
> *start = ffs(*mask) - 1;
> *count = ffs(~(*mask >> *start)) - 1;
> *mask &= ~(((1 << *count) - 1) << *start);

This signed shift needs to be fixed for *count == 31 too. Either way,

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>

> --
> 2.5.0
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v4] radeonsi: enable TGSI support cap for compute shaders

2016-04-19 Thread Bas Nieuwenhuizen
v2: Use chip_class instead of family.

v3: Check kernel version for SI.

v4: Preemptively allow amdgpu winsys for SI.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 docs/GL3.txt  |  4 ++--
 docs/relnotes/11.3.0.html |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.c | 21 -
 src/gallium/drivers/radeonsi/si_pipe.c| 16 ++--
 4 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 3febd6e..6214f8d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -167,7 +167,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
   GL_ARB_ES3_compatibility  DONE (all drivers that 
support GLSL 3.30)
   GL_ARB_clear_buffer_objectDONE (all drivers)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_copy_image DONE (i965, nv50, 
nvc0, r600, radeonsi)
   GL_KHR_debug  DONE (all drivers)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
@@ -225,7 +225,7 @@ GL 4.5, GLSL 4.50:
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_draw_indirect  DONE (i965, nvc0, 
r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
   GL_ARB_framebuffer_no_attachments DONE (i965, nvc0, 
r600, radeonsi, softpipe)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 0f9aed8..5a7083c 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with 
certain drivers.
 
 
 OpenGL 4.2 on radeonsi
+GL_ARB_compute_shader on radeonsi
 GL_ARB_framebuffer_no_attachments on nvc0, r600, radeonsi, softpipe
 GL_ARB_internalformat_query2 on all drivers
 GL_ARB_robust_buffer_access_behavior on radeonsi
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a8660f2..9ed6da6 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -646,23 +646,34 @@ static int r600_get_compute_param(struct pipe_screen 
*screen,
uint64_t *grid_size = ret;
grid_size[0] = 65535;
grid_size[1] = 65535;
-   grid_size[2] = 1;
+   grid_size[2] = 65535;
}
return 3 * sizeof(uint64_t) ;
 
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
if (ret) {
uint64_t *block_size = ret;
-   block_size[0] = 256;
-   block_size[1] = 256;
-   block_size[2] = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI) {
+   block_size[0] = 2048;
+   block_size[1] = 2048;
+   block_size[2] = 2048;
+   } else {
+   block_size[0] = 256;
+   block_size[1] = 256;
+   block_size[2] = 256;
+   }
}
return 3 * sizeof(uint64_t);
 
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
if (ret) {
uint64_t *max_threads_per_block = ret;
-   *max_threads_per_block = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI)
+   *max_threads_per_block = 2048;
+   else
+   *max_threads_per_block = 256;
}
return sizeof(uint64_t);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index dabd28a..17d59b6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -473,6 +473,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
 
 static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, 
enum pipe_shader_cap param)
 {
+  

Re: [Mesa-dev] [PATCH] radeonsi: enable GLSL 4.30 and therefore OpenGL 4.3

2016-04-18 Thread Bas Nieuwenhuizen
On Mon, Apr 18, 2016 at 7:58 PM, Ian Romanick  wrote:
> On 04/15/2016 03:33 AM, Marek Olšák wrote:
>> The same thing Nicolai said: This can be committed before the UE4
>> compile failure is fixed.
>
> Is there a bug filed for that problem?  Has anyone diagnosed the issue?
>

I just filed a bug for this issue at
https://bugs.freedesktop.org/show_bug.cgi?id=95005

- Bas
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v3 2/2] radeonsi: enable TGSI support cap for compute shaders

2016-04-19 Thread Bas Nieuwenhuizen
On Tue, Apr 19, 2016 at 4:03 PM, Alex Deucher <alexdeuc...@gmail.com> wrote:
> On Tue, Apr 19, 2016 at 6:56 AM, Marek Olšák <mar...@gmail.com> wrote:
>> Reviewed-by: Marek Olšák <marek.ol...@amd.com>
>>
>> Marek
>>
>> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
>> <b...@basnieuwenhuizen.nl> wrote:
>>> v2: Use chip_class instead of family.
>>>
>>> v3: Check kernel version for SI.
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>>> ---
>>>  docs/GL3.txt  |  4 ++--
>>>  docs/relnotes/11.3.0.html |  1 +
>>>  src/gallium/drivers/radeon/r600_pipe_common.c | 21 -
>>>  src/gallium/drivers/radeonsi/si_pipe.c| 15 +--
>>>  4 files changed, 32 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/docs/GL3.txt b/docs/GL3.txt
>>> index 3febd6e..6214f8d 100644
>>> --- a/docs/GL3.txt
>>> +++ b/docs/GL3.txt
>>> @@ -167,7 +167,7 @@ GL 4.3, GLSL 4.30:
>>>GL_ARB_arrays_of_arrays   DONE (all drivers 
>>> that support GLSL 1.30)
>>>GL_ARB_ES3_compatibility  DONE (all drivers 
>>> that support GLSL 3.30)
>>>GL_ARB_clear_buffer_objectDONE (all drivers)
>>> -  GL_ARB_compute_shader DONE (i965)
>>> +  GL_ARB_compute_shader DONE (i965, 
>>> radeonsi)
>>>GL_ARB_copy_image DONE (i965, nv50, 
>>> nvc0, r600, radeonsi)
>>>GL_KHR_debug  DONE (all drivers)
>>>GL_ARB_explicit_uniform_location  DONE (all drivers 
>>> that support GLSL)
>>> @@ -225,7 +225,7 @@ GL 4.5, GLSL 4.50:
>>>  These are the extensions cherry-picked to make GLES 3.1
>>>  GLES3.1, GLSL ES 3.1
>>>GL_ARB_arrays_of_arrays   DONE (all drivers 
>>> that support GLSL 1.30)
>>> -  GL_ARB_compute_shader DONE (i965)
>>> +  GL_ARB_compute_shader DONE (i965, 
>>> radeonsi)
>>>GL_ARB_draw_indirect  DONE (i965, nvc0, 
>>> r600, radeonsi, llvmpipe, softpipe)
>>>GL_ARB_explicit_uniform_location  DONE (all drivers 
>>> that support GLSL)
>>>GL_ARB_framebuffer_no_attachments DONE (i965, nvc0, 
>>> r600, radeonsi, softpipe)
>>> diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
>>> index 0f9aed8..5a7083c 100644
>>> --- a/docs/relnotes/11.3.0.html
>>> +++ b/docs/relnotes/11.3.0.html
>>> @@ -45,6 +45,7 @@ Note: some of the new features are only available with 
>>> certain drivers.
>>>
>>>  
>>>  OpenGL 4.2 on radeonsi
>>> +GL_ARB_compute_shader on radeonsi
>>>  GL_ARB_framebuffer_no_attachments on nvc0, r600, radeonsi, 
>>> softpipe
>>>  GL_ARB_internalformat_query2 on all drivers
>>>  GL_ARB_robust_buffer_access_behavior on radeonsi
>>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
>>> b/src/gallium/drivers/radeon/r600_pipe_common.c
>>> index a7477ab..64da62f 100644
>>> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
>>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
>>> @@ -645,23 +645,34 @@ static int r600_get_compute_param(struct pipe_screen 
>>> *screen,
>>> uint64_t *grid_size = ret;
>>> grid_size[0] = 65535;
>>> grid_size[1] = 65535;
>>> -   grid_size[2] = 1;
>>> +   grid_size[2] = 65535;
>>> }
>>> return 3 * sizeof(uint64_t) ;
>>>
>>> case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
>>> if (ret) {
>>> uint64_t *block_size = ret;
>>> -   block_size[0] = 256;
>>> -   block_size[1] = 256;
>>> -   block_size[2] = 256;
>>> +   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 
>>> &&
>>> +   ir_type == PIPE_SHADER_IR_TGSI) {
>>> +

[Mesa-dev] [PATCH v2 04/12] winsys/amdgpu: Enlarge const IB size.

2016-04-16 Thread Bas Nieuwenhuizen
Necessary to prevent performance regressions due to extra flushing.

Probably should enlarge it even further when also updating
uniforms through the CE, but this seems large enough for now.

v2: Add preamble IB.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0182660..69902c4 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -199,14 +199,26 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 /* COMMAND SUBMISSION */
 
 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
-  struct amdgpu_cs_ib_info *info)
+  struct amdgpu_cs_ib_info *info, unsigned ib_type)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
 *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
 */
-   const unsigned buffer_size = 128 * 1024 * 4;
-   const unsigned ib_size = 20 * 1024 * 4;
+   unsigned buffer_size, ib_size;
+
+   switch (ib_type) {
+   case IB_CONST_PREAMBLE:
+  buffer_size = 4 * 1024 * 4;
+  ib_size = 1024 * 4;
+   case IB_CONST:
+  buffer_size = 512 * 1024 * 4;
+  ib_size = 128 * 1024 * 4;
+  break;
+   case IB_MAIN:
+  buffer_size = 128 * 1024 * 4;
+  ib_size = 20 * 1024 * 4;
+   }
 
ib->base.cdw = 0;
ib->base.buf = NULL;
@@ -350,7 +362,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN], 
IB_MAIN)) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
@@ -373,7 +385,7 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
   return NULL;
 
-   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]))
+   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], 
IB_CONST))
   return NULL;
 
cs->request.number_of_ibs = 2;
@@ -760,12 +772,12 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
 cleanup:
amdgpu_cs_context_cleanup(cs);
 
-   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN]);
+   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN], IB_MAIN);
if (cs->const_ib.ib_mapped)
-  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]);
+  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], IB_CONST);
if (cs->const_preamble_ib.ib_mapped)
   amdgpu_get_new_ib(>base, >const_preamble_ib,
- >ib[IB_CONST_PREAMBLE]);
+ >ib[IB_CONST_PREAMBLE], 
IB_CONST_PREAMBLE);
 
ws->num_cs_flushes++;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 08/12] radeonsi: Allocate chunks of CE ram.

2016-04-16 Thread Bas Nieuwenhuizen
v2: Use 32 byte alignment.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 30 +++
 src/gallium/drivers/radeonsi/si_state.h   |  3 +++
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 7fc1461..a937973 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -98,7 +98,8 @@ static void si_init_descriptors(struct si_descriptors *desc,
unsigned shader_userdata_index,
unsigned element_dw_size,
unsigned num_elements,
-   const uint32_t *null_descriptor)
+   const uint32_t *null_descriptor,
+   unsigned *ce_offset)
 {
int i;
 
@@ -109,6 +110,10 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
desc->num_elements = num_elements;
desc->list_dirty = true; /* upload the list before the next draw */
desc->shader_userdata_offset = shader_userdata_index * 4;
+   desc->ce_offset = *ce_offset;
+
+   /* make sure that ce_offset stays 32 byte aligned */
+   *ce_offset += align(element_dw_size * num_elements * 4, 32);
 
/* Initialize the array to NULL descriptors if the element size is 8. */
if (null_descriptor) {
@@ -511,14 +516,15 @@ static void si_init_buffer_resources(struct 
si_buffer_resources *buffers,
 unsigned num_buffers,
 unsigned shader_userdata_index,
 enum radeon_bo_usage shader_usage,
-enum radeon_bo_priority priority)
+enum radeon_bo_priority priority,
+unsigned *ce_offset)
 {
buffers->shader_usage = shader_usage;
buffers->priority = priority;
buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
si_init_descriptors(>desc, shader_userdata_index, 4,
-   num_buffers, NULL);
+   num_buffers, NULL, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -1326,29 +1332,35 @@ void si_emit_shader_userdata(struct si_context *sctx, 
struct r600_atom *atom)
 void si_init_all_descriptors(struct si_context *sctx)
 {
int i;
+   unsigned ce_offset = 0;
 
for (i = 0; i < SI_NUM_SHADERS; i++) {
si_init_buffer_resources(>const_buffers[i],
 SI_NUM_CONST_BUFFERS, 
SI_SGPR_CONST_BUFFERS,
-RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER);
+RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER,
+_offset);
si_init_buffer_resources(>rw_buffers[i],
 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT,
+_offset);
si_init_buffer_resources(>shader_buffers[i],
 SI_NUM_SHADER_BUFFERS, 
SI_SGPR_SHADER_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER,
+_offset);
 
si_init_descriptors(>samplers[i].views.desc,
SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
-   null_texture_descriptor);
+   null_texture_descriptor, _offset);
 
si_init_descriptors(>images[i].desc,
SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
-   null_image_descriptor);
+   null_image_descriptor, _offset);
}
 
si_init_descriptors(>vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-   4, SI_NUM_VERTEX_BUFFERS, NULL);
+   4, SI_NUM_VERTEX_BUFFERS, NULL, _offset);
+
+   assert(ce_offset <= 32768);
 
/* Set pipe_context functions. */
sctx->b.b.bind_sampler_states = si_bind_sampler_states;
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 6748f80..fbdc8ee 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/rade

[Mesa-dev] [PATCH v2 03/12] winsys/amdgpu: Add support for const IB.

2016-04-16 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

v2: Use the correct IB to update request (Bas Nieuwenhuizen)
v3: Add preamble IB. (Bas Nieuwenhuizen)
---
 src/gallium/drivers/radeon/radeon_winsys.h | 30 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c  | 88 --
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h  | 11 +++-
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index aa94df6..451d8a4 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -603,6 +603,36 @@ struct radeon_winsys {
   void *flush_ctx);
 
 /**
+ * Add a constant engine IB to a graphics CS. This makes the graphics CS
+ * from "cs_create" a group of two IBs that share a buffer list and are
+ * flushed together.
+ *
+ * The returned constant CS is only a stream for writing packets to the new
+ * IB. Calling other winsys functions with it is not allowed, not even
+ * "cs_destroy".
+ *
+ * In order to add buffers and check memory usage, use the graphics CS.
+ * In order to flush it, use the graphics CS, which will flush both IBs.
+ * Destroying the graphics CS will destroy both of them.
+ *
+ * \param cs  The graphics CS from "cs_create" that will hold the buffer
+ *list and will be used for flushing.
+ */
+struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs);
+
+ /**
+ * Add a constant engine preamble IB to a graphics CS. This adds an extra IB
+ * in a similar manner to cs_add_const_ib. This should always be called after
+ * cs_add_const_ib.
+ *
+ * The returned IB is a constant engine IB that only gets flushed if the
+ * context changed.
+ *
+ * \param cs  The graphics CS from "cs_create" that will hold the buffer
+ *list and will be used for flushing.
+ */
+struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct 
radeon_winsys_cs *cs);
+/**
  * Destroy a command stream.
  *
  * \param csA command stream to destroy.
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b0fe8b9..0182660 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -350,19 +350,62 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib)) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
}
 
cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
+   cs->request.ibs = >ib[IB_MAIN];
 
p_atomic_inc(>ws->num_cs);
return >main.base;
 }
 
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   /* only one const IB can be added */
+   if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
+  return NULL;
+
+   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]))
+  return NULL;
+
+   cs->request.number_of_ibs = 2;
+   cs->request.ibs = >ib[IB_CONST];
+   cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
+
+   return >const_ib.base;
+}
+
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   /* only one const preamble IB can be added and only when the const IB has
+* also been mapped */
+   if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped ||
+   cs->const_preamble_ib.ib_mapped)
+  return NULL;
+
+   if (!amdgpu_get_new_ib(>base, >const_preamble_ib,
+ >ib[IB_CONST_PREAMBLE], 
IB_CONST_PREAMBLE))
+  return NULL;
+
+   cs->request.number_of_ibs = 3;
+   cs->request.ibs = >ib[IB_CONST_PREAMBLE];
+   cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | 
AMDGPU_IB_FLAG_PREAMBLE;
+
+   return >const_preamble_ib.base;
+}
+
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
 int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
@@ -621,6 +664,15 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
   /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
   while (rcs->cdw & 7)
  OUT_CS(rcs, 0x1000); /* type3 nop packet */
+
+  /* Also pad the const IB. */
+  if (cs->const_ib.ib_mapped)
+ while (!cs->const_ib.base.cdw || (cs->const_ib.base.cdw & 7))
+OUT_CS(>const_ib.

[Mesa-dev] [PATCH v2 07/12] radeonsi: Add CE synchronization.

2016-04-16 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.h   |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 24 
 2 files changed, 25 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index b8db3b2..b3f5ed5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -193,6 +193,7 @@ struct si_context {
struct si_screen*screen;
struct radeon_winsys_cs *ce_ib;
struct radeon_winsys_cs *ce_preamble_ib;
+   boolce_need_synchronization;
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 40cad50..dd13d51 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -753,6 +753,25 @@ static void si_get_draw_start_count(struct si_context 
*sctx,
}
 }
 
+static void si_ce_pre_draw_synchronization(struct si_context *sctx) {
+   if (sctx->ce_need_synchronization) {
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
+   radeon_emit(sctx->ce_ib, 1);
+
+   radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 
0));
+   radeon_emit(sctx->b.gfx.cs, 1);
+   }
+}
+
+static void si_ce_post_draw_synchronization(struct si_context *sctx) {
+   if (sctx->ce_need_synchronization) {
+   radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 
0));
+   radeon_emit(sctx->b.gfx.cs, 0);
+
+   sctx->ce_need_synchronization = false;
+   }
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
struct si_context *sctx = (struct si_context *)ctx;
@@ -882,8 +901,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
si_emit_scratch_reloc(sctx);
si_emit_rasterizer_prim_state(sctx);
si_emit_draw_registers(sctx, info);
+
+   si_ce_pre_draw_synchronization(sctx);
+
si_emit_draw_packets(sctx, info, );
 
+   si_ce_post_draw_synchronization(sctx);
+
if (sctx->trace_buf)
si_trace_emit(sctx);
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 02/12] winsys/amdgpu: split IB data into a new structure in preparation for CE

2016-04-16 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  5 ---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  6 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 68 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 16 
 4 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 1b2793a..036301e 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -38,11 +38,6 @@
 #include 
 #include 
 
-static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
-{
-   return (struct amdgpu_winsys_bo *)bo;
-}
-
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 54f5dbd..69ada10 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -69,6 +69,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
 static inline
+struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
+{
+   return (struct amdgpu_winsys_bo *)bo;
+}
+
+static inline
 void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
 struct amdgpu_winsys_bo *src)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 63c72fc..b0fe8b9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -198,7 +198,8 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 
 /* COMMAND SUBMISSION */
 
-static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
+  struct amdgpu_cs_ib_info *info)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
@@ -207,39 +208,36 @@ static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
const unsigned buffer_size = 128 * 1024 * 4;
const unsigned ib_size = 20 * 1024 * 4;
 
-   cs->base.cdw = 0;
-   cs->base.buf = NULL;
+   ib->base.cdw = 0;
+   ib->base.buf = NULL;
 
/* Allocate a new buffer for IBs if the current buffer is all used. */
-   if (!cs->big_ib_buffer ||
-   cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
-  struct radeon_winsys *ws = >ctx->ws->base;
+   if (!ib->big_ib_buffer ||
+   ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
 
-  pb_reference(>big_ib_buffer, NULL);
-  cs->big_ib_winsys_buffer = NULL;
-  cs->ib_mapped = NULL;
-  cs->used_ib_space = 0;
+  pb_reference(>big_ib_buffer, NULL);
+  ib->ib_mapped = NULL;
+  ib->used_ib_space = 0;
 
-  cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
+  ib->big_ib_buffer = ws->buffer_create(ws, buffer_size,
 4096, true,
 RADEON_DOMAIN_GTT,
 RADEON_FLAG_CPU_ACCESS);
-  if (!cs->big_ib_buffer)
+  if (!ib->big_ib_buffer)
  return false;
 
-  cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL,
+  ib->ib_mapped = ws->buffer_map(ib->big_ib_buffer, NULL,
  PIPE_TRANSFER_WRITE);
-  if (!cs->ib_mapped) {
- pb_reference(>big_ib_buffer, NULL);
+  if (!ib->ib_mapped) {
+ pb_reference(>big_ib_buffer, NULL);
  return false;
   }
-
-  cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer;
}
 
-   cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
-   cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
-   cs->base.max_dw = ib_size / 4;
+   info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
+ ib->used_ib_space;
+   ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+   ib->base.max_dw = ib_size / 4;
return true;
 }
 
@@ -271,9 +269,6 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
   break;
}
 
-   cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
-
cs->max_num_buffers = 512;
cs->buffers = (struct amdgpu_cs_buffer*)
   CALLOC(1, cs->max_num_buffers * sizeof(struct 
amdgpu_cs_buffer));
@@ -355,14 +350,17 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib

[Mesa-dev] [PATCH v2 10/12] radeonsi: Replace list_dirty with a mask.

2016-04-16 Thread Bas Nieuwenhuizen
We can then upload only the dirty ones with the constant engine.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 37 ---
 src/gallium/drivers/radeonsi/si_state.h   |  9 +--
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 0b44ecf..8ca0253 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -109,7 +109,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->list = CALLOC(num_elements, element_dw_size * 4);
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
-   desc->list_dirty = true; /* upload the list before the next draw */
+   desc->dirty_mask = num_elements == 64 ? ~0llu : (1llu << num_elements) 
- 1;
desc->shader_userdata_offset = shader_userdata_index * 4;
desc->ce_offset = *ce_offset;
 
@@ -159,7 +159,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
void *ptr;
 
-   if (!desc->list_dirty)
+   if (!desc->dirty_mask)
return true;
 
u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
@@ -173,7 +173,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
-   desc->list_dirty = false;
+   desc->dirty_mask = 0;
desc->pointer_dirty = true;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
@@ -216,6 +216,8 @@ static void si_sampler_views_begin_new_cs(struct si_context 
*sctx,
si_sampler_view_add_buffer(sctx, views->views[i]->texture);
}
 
+   views->desc.ce_ram_dirty = true;
+
if (!views->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx, views->desc.buffer,
@@ -267,7 +269,7 @@ static void si_set_sampler_view(struct si_context *sctx,
views->desc.enabled_mask &= ~(1llu << slot);
}
 
-   views->desc.list_dirty = true;
+   views->desc.dirty_mask |= 1llu << slot;
 }
 
 static bool is_compressed_colortex(struct r600_texture *rtex)
@@ -373,6 +375,8 @@ si_image_views_begin_new_cs(struct si_context *sctx, struct 
si_images_info *imag
si_sampler_view_add_buffer(sctx, view->resource);
}
 
+   images->desc.ce_ram_dirty = true;
+
if (images->desc.buffer) {
radeon_add_to_buffer_list(>b, >b.gfx,
  images->desc.buffer,
@@ -390,7 +394,7 @@ si_disable_shader_image(struct si_images_info *images, 
unsigned slot)
 
memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
images->desc.enabled_mask &= ~(1llu << slot);
-   images->desc.list_dirty = true;
+   images->desc.dirty_mask |= 1llu << slot;
}
 }
 
@@ -471,7 +475,7 @@ si_set_shader_images(struct pipe_context *pipe, unsigned 
shader,
}
 
images->desc.enabled_mask |= 1llu << slot;
-   images->desc.list_dirty = true;
+   images->desc.dirty_mask |= 1llu << slot;
}
 }
 
@@ -529,7 +533,7 @@ static void si_bind_sampler_states(struct pipe_context 
*ctx, unsigned shader,
continue;
 
memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
-   desc->list_dirty = true;
+   desc->dirty_mask |= 1llu << slot;
}
 }
 
@@ -576,6 +580,8 @@ static void si_buffer_resources_begin_new_cs(struct 
si_context *sctx,
  buffers->shader_usage, buffers->priority);
}
 
+   buffers->desc.ce_ram_dirty = true;
+
if (!buffers->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx,
@@ -772,7 +778,7 @@ static void si_set_constant_buffer(struct pipe_context 
*ctx, uint shader, uint s
buffers->desc.enabled_mask &= ~(1llu << slot);
}
 
-   buffers->desc.list_dirty = true;
+   buffers->desc.dirty_mask |= 1llu << slot;
 }
 
 /* SHADER BUFFERS */
@@ -819,9 +825,9 @@ static void si_set_shader_buffers(struct pipe_context *ctx, 
unsigned shader,
radeon_add_to_buffer_list(>b, >b.gfx, buf,
  buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu << slot;
+   buffers->

[Mesa-dev] [PATCH v2 01/12] gallium/radeon: move ring_type into winsyses

2016-04-16 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

Not used by drivers.

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/radeon_winsys.h|  1 -
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c |  8 
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  1 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |  1 +
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 0c03652..aa94df6 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -229,7 +229,6 @@ struct radeon_winsys_cs {
 unsignedcdw;  /* Number of used dwords. */
 unsignedmax_dw; /* Maximum number of dwords. */
 uint32_t*buf; /* The command buffer. */
-enum ring_type  ring_type;
 };
 
 struct radeon_info {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a9fc55f..63c72fc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -348,7 +348,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
cs->ctx = ctx;
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
-   cs->base.ring_type = ring_type;
+   cs->ring_type = ring_type;
 
if (!amdgpu_init_cs_context(cs, ring_type)) {
   FREE(cs);
@@ -570,7 +570,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
cs->request.fence_info.handle = NULL;
if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != 
AMDGPU_HW_IP_VCE) {
cs->request.fence_info.handle = cs->ctx->user_fence_bo;
-   cs->request.fence_info.offset = cs->base.ring_type;
+   cs->request.fence_info.offset = cs->ring_type;
}
 
r = amdgpu_cs_submit(cs->ctx->ctx, 0, >request, 1);
@@ -591,7 +591,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
   amdgpu_fence_submitted(fence, >request, user_fence);
 
   for (i = 0; i < cs->num_buffers; i++)
- amdgpu_fence_reference(>buffers[i].bo->fence[cs->base.ring_type],
+ amdgpu_fence_reference(>buffers[i].bo->fence[cs->ring_type],
 fence);
}
pipe_mutex_unlock(ws->bo_fence_lock);
@@ -613,7 +613,7 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ctx->ws;
 
-   switch (cs->base.ring_type) {
+   switch (cs->ring_type) {
case RING_DMA:
   /* pad DMA ring to 8 DWs */
   while (rcs->cdw & 7)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index a2fb44a..f4709e9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -66,6 +66,7 @@ struct amdgpu_cs {
unsigned used_ib_space;
 
/* amdgpu_cs_submit parameters */
+   enum ring_type  ring_type;
struct amdgpu_cs_requestrequest;
struct amdgpu_cs_ib_infoib;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index b50e19c..6b2694c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -197,8 +197,8 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
 cs->csc = >csc1;
 cs->cst = >csc2;
 cs->base.buf = cs->csc->buf;
-cs->base.ring_type = ring_type;
 cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
+cs->ring_type = ring_type;
 
 p_atomic_inc(>num_cs);
 return >base;
@@ -281,7 +281,7 @@ static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
  * This doesn't have to be done if virtual memory is enabled,
  * because there is no offset patching with virtual memory.
  */
-if (cs->base.ring_type != RING_DMA || cs->ws->info.has_virtual_memory) 
{
+if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
 return i;
 }
 }
@@ -466,7 +466,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 struct radeon_cs_context *tmp;
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RING_DMA:
 /* pad DMA ring to 8 DWs */
 if (cs->ws->info.chip_class <= SI) {
@@ -526,7 +526,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 p_atomic_inc(>cst->relocs_bo[i].bo->num_active_ioctls);
 }
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RI

[Mesa-dev] [PATCH v2 05/12] radeonsi: Create CE IB.

2016-04-16 Thread Bas Nieuwenhuizen
Based on work by Marek Olšák.

v2: Add preamble IB.

Leaves the load packet in the space calculation as the
radeon winsys might not be able to support a preamble.

The added space calculation may look expensive, but
is converted to a constant with (at least) -O2 and -O3.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeonsi/si_hw_context.c  | 32 ++-
 src/gallium/drivers/radeonsi/si_pipe.c| 12 ++
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 5 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a7477ab..a8660f2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -402,6 +402,7 @@ static const struct debug_named_value 
common_debug_options[] = {
{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction 
Scheduler." },
{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders 
compiled on demand" },
+   { "noce", DBG_NO_CE, "Disable the constant engine"},
 
DEBUG_NAMED_VALUE_END /* must be last */
 };
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
b/src/gallium/drivers/radeon/r600_pipe_common.h
index b23a780..91f8d5e 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -95,6 +95,7 @@
 #define DBG_NO_RB_PLUS (1llu << 45)
 #define DBG_SI_SCHED   (1llu << 46)
 #define DBG_MONOLITHIC_SHADERS (1llu << 47)
+#define DBG_NO_CE  (1llu << 48)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 #define R600_MAX_VIEWPORTS16
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index b621b55..60f2b58 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -26,10 +26,38 @@
 
 #include "si_pipe.h"
 
+static unsigned si_descriptor_list_cs_space(unsigned count, unsigned 
element_size)
+{
+   /* 5 dwords for possible load to reinitialize + 5 dwords for write to
+* L2 + 3 dwords for every range written to CE RAM.
+*/
+   return 5 + 5 + 3 + count * MAX2(3, element_size);
+}
+
+static unsigned si_ce_needed_cs_space() {
+   unsigned space = 0;
+
+   space += si_descriptor_list_cs_space(SI_NUM_CONST_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_SAMPLERS, 16);
+   space += si_descriptor_list_cs_space(SI_NUM_IMAGES, 8);
+
+   space *= SI_NUM_SHADERS;
+
+   space += si_descriptor_list_cs_space(SI_NUM_VERTEX_BUFFERS, 4);
+
+   /* Increment CE counter packet */
+   space += 2;
+
+   return space;
+}
+
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+   struct radeon_winsys_cs *ce_ib = ctx->ce_ib;
struct radeon_winsys_cs *dma = ctx->b.dma.cs;
 
/* Flush the DMA IB if it's not empty. */
@@ -53,7 +81,9 @@ void si_need_cs_space(struct si_context *ctx)
/* If the CS is sufficiently large, don't count the space needed
 * and just flush if there is not enough space left.
 */
-   if (unlikely(cs->cdw > cs->max_dw - 2048))
+   if (unlikely(cs->cdw > cs->max_dw - 2048 ||
+ (ce_ib && ce_ib->max_dw - ce_ib->cdw <
+  si_ce_needed_cs_space(
ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 6a990ed..ceacf37 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -142,6 +142,18 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
 
sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
   si_context_gfx_flush, sctx);
+
+   if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib) {
+   sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
+   if (!sctx->ce_ib)
+   goto fail;
+
+   if (ws->cs_add_const_preamble_ib) {
+   sctx->ce_preamble_ib =
+  ws->cs_add_const_preamble_ib(sctx->b.gfx.cs);
+   }
+   }
+
sctx->b.gf

[Mesa-dev] [PATCH v2 12/12] radeonsi: Use CE for all descriptors.

2016-04-16 Thread Bas Nieuwenhuizen
v2: Load previous list for new CS instead of re-emitting
all descriptors.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 70 +++
 1 file changed, 60 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 8ca0253..e4f06e7 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -60,6 +60,7 @@
 #include "si_shader.h"
 #include "sid.h"
 
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
@@ -152,29 +153,78 @@ static bool si_ce_upload(struct si_context *sctx, 
unsigned ce_offset, unsigned s
return true;
 }
 
+static void si_reinitialize_ce_ram(struct si_context *sctx,
+struct si_descriptors *desc)
+{
+   if (desc->buffer) {
+   struct r600_resource *buffer = (struct 
r600_resource*)desc->buffer;
+   unsigned list_size = desc->num_elements * desc->element_dw_size 
* 4;
+   uint64_t va = buffer->gpu_address + desc->buffer_offset;
+   struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+
+   if (!ib)
+   ib = sctx->ce_ib;
+
+   list_size = align(list_size, 32);
+
+   radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+   radeon_emit(ib, va);
+   radeon_emit(ib, va >> 32);
+   radeon_emit(ib, list_size / 4);
+   radeon_emit(ib, desc->ce_offset);
+
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   }
+   desc->ce_ram_dirty = false;
+}
 
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-   void *ptr;
 
if (!desc->dirty_mask)
return true;
 
-   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
-  >buffer_offset,
-  (struct pipe_resource**)>buffer, );
-   if (!desc->buffer)
-   return false; /* skip the draw call */
+   if (sctx->ce_ib) {
+   uint32_t const* list = (uint32_t const*)desc->list;
 
-   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   if (desc->ce_ram_dirty)
+   si_reinitialize_ce_ram(sctx, desc);
 
-   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   while(desc->dirty_mask) {
+   int begin, count;
+   u_bit_scan_consecutive_range64(>dirty_mask, 
,
+  );
 
-   desc->dirty_mask = 0;
+   begin *= desc->element_dw_size;
+   count *= desc->element_dw_size;
+
+   radeon_emit(sctx->ce_ib,
+   PKT3(PKT3_WRITE_CONST_RAM, count, 0));
+   radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
+   radeon_emit_array(sctx->ce_ib, list + begin, count);
+   }
+
+   if (!si_ce_upload(sctx, desc->ce_offset, list_size,
+  >buffer_offset, >buffer))
+   return false;
+   } else {
+   void *ptr;
+
+   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
+   >buffer_offset,
+   (struct pipe_resource**)>buffer, );
+   if (!desc->buffer)
+   return false; /* skip the draw call */
+
+   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   }
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+ RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
desc->pointer_dirty = true;
+   desc->dirty_mask = 0;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 11/12] gallium/util: Add u_bit_scan_consecutive_range64.

2016-04-16 Thread Bas Nieuwenhuizen
For use by radeonsi.

v2: Make sure that it works for all 64 bits set.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/auxiliary/util/u_math.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_math.h 
b/src/gallium/auxiliary/util/u_math.h
index b4ac0db..96c1618 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -523,6 +523,20 @@ u_bit_scan_consecutive_range(unsigned *mask, int *start, 
int *count)
*mask &= ~(((1 << *count) - 1) << *start);
 }
 
+static inline void
+u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
+{
+   if (*mask == ~0llu) {
+  *start = 0;
+  *count = 64;
+  *mask = 0;
+  return;
+   }
+   *start = ffsll(*mask) - 1;
+   *count = ffsll(~(*mask >> *start)) - 1;
+   *mask &= ~(((1llu << *count) - 1) << *start);
+}
+
 /**
  * Return float bits.
  */
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 00/12] Use the constant engine in radeonsi

2016-04-16 Thread Bas Nieuwenhuizen
For this v2 I did the following changes:
 - Use Nicolai's proposed dirty mask structure.

 - Use load packets to reinitialize CE ram.

 - Use a preamble IB for reinitializing CE ram. I've made the
   preamble IB optional in radeonsi, as the radeon kernel
   driver does not support it as far as I can see. 

 - Changed the calculation of needed space in the CE IB.

 - Dropped the patch for using the CE for the vertex buffer descriptors.
   Given that this series did not really change GPU performance anyway,
   and that the CE path costs slightly more CPU as we never partially
   update them, I think it is not useful to add it yet.

 - Fix u_bit_scan_consecutive_range64 for *mask = ~0llu.

 - Minor whitespace fixes.

Bas Nieuwenhuizen (9):
  winsys/amdgpu: Enlarge const IB size.
  radeonsi: Create CE IB.
  radeonsi: Add CE packet definitions.
  radeonsi: Add CE synchronization.
  radeonsi: Allocate chunks of CE ram.
  radeonsi: Add CE uploader.
  radeonsi: Replace list_dirty with a mask.
  gallium/util: Add u_bit_scan_consecutive_range64.
  radeonsi: Use CE for all descriptors.

Marek Olšák (3):
  gallium/radeon: move ring_type into winsyses
  winsys/amdgpu: split IB data into a new structure in preparation for
CE
  winsys/amdgpu: Add support for const IB.

 src/gallium/auxiliary/util/u_math.h   |  14 +++
 src/gallium/drivers/radeon/r600_pipe_common.c |   1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |   1 +
 src/gallium/drivers/radeon/radeon_winsys.h|  31 -
 src/gallium/drivers/radeonsi/si_descriptors.c | 161 +++-
 src/gallium/drivers/radeonsi/si_hw_context.c  |  32 -
 src/gallium/drivers/radeonsi/si_pipe.c|  23 
 src/gallium/drivers/radeonsi/si_pipe.h|   7 ++
 src/gallium/drivers/radeonsi/si_state.h   |  12 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  24 
 src/gallium/drivers/radeonsi/sid.h|   6 +
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |   5 -
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |   6 +
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 172 --
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  28 +++--
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   1 +
 17 files changed, 437 insertions(+), 97 deletions(-)

-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 09/12] radeonsi: Add CE uploader.

2016-04-16 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++
 src/gallium/drivers/radeonsi/si_pipe.c| 11 +++
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 3 files changed, 37 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index a937973..0b44ecf 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -61,6 +61,7 @@
 #include "sid.h"
 
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
 
 
@@ -130,6 +131,28 @@ static void si_release_descriptors(struct si_descriptors 
*desc)
FREE(desc->list);
 }
 
+/**
+ * Sub-allocate \p size bytes from the context's CE suballocator and emit a
+ * DUMP_CONST_RAM packet on the CE IB that writes the CE RAM contents starting
+ * at \p ce_offset to that allocation.
+ *
+ * \param sctx        context owning the CE IB and the CE suballocator
+ * \param ce_offset   offset into CE RAM of the region to dump
+ * \param size        size of the region; emitted to the packet as size / 4
+ * \param out_offset  receives the offset of the allocation inside *out_buf
+ * \param out_buf     receives the buffer the allocation lives in
+ * \return false if the allocation failed (nothing is emitted), true otherwise.
+ *
+ * On success ce_need_synchronization is set on the context.
+ */
+static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
+unsigned size, unsigned *out_offset,
+struct r600_resource **out_buf) {
+   uint64_t va;
+
+   u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
+(struct pipe_resource**)out_buf);
+   /* Failure is reported through the returned buffer, not through the
+* out_buf pointer itself (which is never NULL here), so test *out_buf
+* before it is dereferenced below. */
+   if (!*out_buf)
+   return false;
+
+   va = (*out_buf)->gpu_address + *out_offset;
+
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
+   radeon_emit(sctx->ce_ib, ce_offset);
+   radeon_emit(sctx->ce_ib, size / 4);
+   radeon_emit(sctx->ce_ib, va);
+   radeon_emit(sctx->ce_ib, va >> 32);
+
+   sctx->ce_need_synchronization = true;
+   return true;
+}
+
+
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index ceacf37..b956cda 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -29,6 +29,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "radeon/radeon_uvd.h"
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "vl/vl_decoder.h"
 
 /*
@@ -41,6 +42,9 @@ static void si_destroy_context(struct pipe_context *context)
 
si_release_all_descriptors(sctx);
 
+   if (sctx->ce_suballocator)
+   u_suballocator_destroy(sctx->ce_suballocator);
+
pipe_resource_reference(>esgs_ring, NULL);
pipe_resource_reference(>gsvs_ring, NULL);
pipe_resource_reference(>tf_ring, NULL);
@@ -152,6 +156,13 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
sctx->ce_preamble_ib =
   ws->cs_add_const_preamble_ib(sctx->b.gfx.cs);
}
+
+   sctx->ce_suballocator =
+   u_suballocator_create(>b.b, 1024 * 1024,
+ 64, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT, 
FALSE);
+   if (!sctx->ce_suballocator)
+   goto fail;
}
 
sctx->b.gfx.flush = si_context_gfx_flush;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index b3f5ed5..1540c7f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,6 +80,7 @@
 
 struct si_compute;
 struct hash_table;
+struct u_suballocator;
 
 struct si_screen {
struct r600_common_screen   b;
@@ -191,9 +192,11 @@ struct si_context {
void*custom_blend_dcc_decompress;
void*pstipple_sampler_state;
struct si_screen*screen;
+
struct radeon_winsys_cs *ce_ib;
struct radeon_winsys_cs *ce_preamble_ib;
boolce_need_synchronization;
+   struct u_suballocator   *ce_suballocator;
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 06/12] radeonsi: Add CE packet definitions.

2016-04-16 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/sid.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/sid.h 
b/src/gallium/drivers/radeonsi/sid.h
index f0aa605..1072e0a 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -145,6 +145,12 @@
 #define PKT3_SET_SH_REG0x76
 #define PKT3_SET_SH_REG_OFFSET 0x77
 #define PKT3_SET_UCONFIG_REG   0x79 /* new for CIK */
+#define PKT3_LOAD_CONST_RAM0x80
+#define PKT3_WRITE_CONST_RAM   0x81
+#define PKT3_DUMP_CONST_RAM0x83
+#define PKT3_INCREMENT_CE_COUNTER  0x84
+#define PKT3_INCREMENT_DE_COUNTER  0x85
+#define PKT3_WAIT_ON_CE_COUNTER0x86
 
 #define PKT_TYPE_S(x)   (((x) & 0x3) << 30)
 #define PKT_TYPE_G(x)   (((x) >> 30) & 0x3)
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 05/12] radeonsi: Create CE IB.

2016-04-17 Thread Bas Nieuwenhuizen
On Mon, Apr 18, 2016 at 12:04 AM, Marek Olšák <mar...@gmail.com> wrote:
> On Sun, Apr 17, 2016 at 1:43 AM, Bas Nieuwenhuizen
> <b...@basnieuwenhuizen.nl> wrote:
>> Based on work by Marek Olšák.
>>
>> v2: Add preamble IB.
>>
>> Leaves the load packet in the space calculation as the
>> radeon winsys might not be able to support a preamble.
>>
>> The added space calculation may look expensive, but
>> is converted to a constant with (at least) -O2 and -O3.
>>
>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
>>  src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
>>  src/gallium/drivers/radeonsi/si_hw_context.c  | 32 
>> ++-
>>  src/gallium/drivers/radeonsi/si_pipe.c| 12 ++
>>  src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
>>  5 files changed, 48 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
>> b/src/gallium/drivers/radeon/r600_pipe_common.c
>> index a7477ab..a8660f2 100644
>> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
>> @@ -402,6 +402,7 @@ static const struct debug_named_value 
>> common_debug_options[] = {
>> { "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
>> { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction 
>> Scheduler." },
>> { "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders 
>> compiled on demand" },
>> +   { "noce", DBG_NO_CE, "Disable the constant engine"},
>>
>> DEBUG_NAMED_VALUE_END /* must be last */
>>  };
>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
>> b/src/gallium/drivers/radeon/r600_pipe_common.h
>> index b23a780..91f8d5e 100644
>> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
>> @@ -95,6 +95,7 @@
>>  #define DBG_NO_RB_PLUS (1llu << 45)
>>  #define DBG_SI_SCHED   (1llu << 46)
>>  #define DBG_MONOLITHIC_SHADERS (1llu << 47)
>> +#define DBG_NO_CE  (1llu << 48)
>>
>>  #define R600_MAP_BUFFER_ALIGNMENT 64
>>  #define R600_MAX_VIEWPORTS16
>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
>> b/src/gallium/drivers/radeonsi/si_hw_context.c
>> index b621b55..60f2b58 100644
>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>> @@ -26,10 +26,38 @@
>>
>>  #include "si_pipe.h"
>>
>> +static unsigned si_descriptor_list_cs_space(unsigned count, unsigned 
>> element_size)
>> +{
>> +   /* 5 dwords for possible load to reinitialize + 5 dwords for write to
>> +* L2 + 3 bytes for every range written to CE RAM.
>> +*/
>> +   return 5 + 5 + 3 + count * MAX2(3, element_size);
>
> Please make it clear in the comment that the load packet is needed in
> the main CE IB only when the preamble is unsupported.
>
> Also, that MAX2 statement seems useless, because element_size is
> always >= 4. Did you mean (3 + element_size)?
>

No, just defensive programming against element_size < 3. Can remove it
if preferred.

>> +}
>> +
>> +static unsigned si_ce_needed_cs_space() {
>> +   unsigned space = 0;
>> +
>> +   space += si_descriptor_list_cs_space(SI_NUM_CONST_BUFFERS, 4);
>> +   space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4);
>> +   space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS, 4);
>> +   space += si_descriptor_list_cs_space(SI_NUM_SAMPLERS, 16);
>> +   space += si_descriptor_list_cs_space(SI_NUM_IMAGES, 8);
>> +
>> +   space *= SI_NUM_SHADERS;
>> +
>> +   space += si_descriptor_list_cs_space(SI_NUM_VERTEX_BUFFERS, 4);
>
> You dropped vertex buffer support, didn't you? This looks like a leftover.
>

Indeed, will fix.

>> +
>> +   /* Increment CE counter packet */
>> +   space += 2;
>> +
>> +   return space;
>> +}
>> +
>>  /* initialize */
>>  void si_need_cs_space(struct si_context *ctx)
>>  {
>> struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
>> +   struct radeon_winsys_cs *ce_ib = ctx->ce_ib;
>> struct radeon_winsys_cs *dma = ctx->b.dma.cs;
>>
>> /* Flush the DMA IB if it's n

Re: [Mesa-dev] [PATCH v2 08/12] radeonsi: Allocate chunks of CE ram.

2016-04-17 Thread Bas Nieuwenhuizen
On Mon, Apr 18, 2016 at 12:13 AM, Marek Olšák <mar...@gmail.com> wrote:
> On Sun, Apr 17, 2016 at 1:43 AM, Bas Nieuwenhuizen
> <b...@basnieuwenhuizen.nl> wrote:
>> v2: Use 32 byte alignment.
>>
>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeonsi/si_descriptors.c | 30 
>> +++
>>  src/gallium/drivers/radeonsi/si_state.h   |  3 +++
>>  2 files changed, 24 insertions(+), 9 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 7fc1461..a937973 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -98,7 +98,8 @@ static void si_init_descriptors(struct si_descriptors 
>> *desc,
>> unsigned shader_userdata_index,
>> unsigned element_dw_size,
>> unsigned num_elements,
>> -   const uint32_t *null_descriptor)
>> +   const uint32_t *null_descriptor,
>> +   unsigned *ce_offset)
>>  {
>> int i;
>>
>> @@ -109,6 +110,10 @@ static void si_init_descriptors(struct si_descriptors 
>> *desc,
>> desc->num_elements = num_elements;
>> desc->list_dirty = true; /* upload the list before the next draw */
>> desc->shader_userdata_offset = shader_userdata_index * 4;
>> +   desc->ce_offset = *ce_offset;
>> +
>> +   /* make sure that ce_offset stays 32 byte aligned */
>> +   *ce_offset += align(element_dw_size * num_elements * 4, 32);
>
> Please define SI_GLOBAL_L2_CACHE_LINE_SIZE 64 in si_pipe.h and use that.
>
> Aligning the offset is indeed a good idea, but aligning to the cache
> line size should be even better.

The 32-byte alignment is a restriction of the LOAD_CONST_RAM packet,
which wants size and offset to be 32-byte aligned.

Note that this is alignment within the CE ram, not the alignment of the
memory loaded and stored. Does the L2 cache line size really matter
here? I would expect the alignment that the uploader allocation gives
to be more important.

- Bas
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] radeonsi: use CE suballocator for CP DMA realignment.

2016-04-19 Thread Bas Nieuwenhuizen
I retract patch 1 and 2. Large scratch buffers are nice, but the
hardware only supports a 32-bit offset into it.

- Bas

On Wed, Apr 20, 2016 at 12:50 AM, Bas Nieuwenhuizen
<b...@basnieuwenhuizen.nl> wrote:
> Use the CE suballocator instead of the normal one as the usage
> is most similar to the CE, i.e. only read and written on GPU
> and not mapped to CPU.
>
> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
> ---
>  src/gallium/drivers/radeonsi/si_cp_dma.c | 27 ++-
>  1 file changed, 10 insertions(+), 17 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 38e0ee6..264789d 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -222,31 +222,24 @@ static void si_clear_buffer(struct pipe_context *ctx, 
> struct pipe_resource *dst,
>   */
>  static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
>  {
> +
> uint64_t va;
> unsigned dma_flags = 0;
> unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
> +   unsigned offset;
> +   struct r600_resource *tmp_buf;
>
> assert(size < CP_DMA_ALIGNMENT);
>
> -   /* Use the scratch buffer as the dummy buffer. The 3D engine should be
> -* idle at this point.
> -*/
> -   if (!sctx->scratch_buffer ||
> -   sctx->scratch_buffer->b.b.width0 < scratch_size) {
> -   r600_resource_reference(>scratch_buffer, NULL);
> -   sctx->scratch_buffer =
> -   si_resource_create_custom(>screen->b.b,
> - PIPE_USAGE_DEFAULT,
> - scratch_size);
> -   if (!sctx->scratch_buffer)
> -   return;
> -   sctx->emit_scratch_reloc = true;
> -   }
> +   u_suballocator_alloc(sctx->ce_suballocator, scratch_size, ,
> +(struct pipe_resource**)_buf);
> +   if (!tmp_buf)
> +   return;
>
> -   si_cp_dma_prepare(sctx, >scratch_buffer->b.b,
> - >scratch_buffer->b.b, size, size, _flags);
> +   si_cp_dma_prepare(sctx, _buf->b.b,
> + _buf->b.b, size, size, _flags);
>
> -   va = sctx->scratch_buffer->gpu_address;
> +   va = tmp_buf->gpu_address + offset;
> si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
>dma_flags);
>  }
> --
> 2.8.0
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] radeonsi: Prevent overflow when calculating the scratch size.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c   | 5 -
 src/gallium/drivers/radeonsi/si_state_shaders.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index b4981b4..b46a2fe 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,7 +201,10 @@ static bool si_setup_compute_scratch_buffer(struct 
si_context *sctx,
 {
uint64_t scratch_bo_size, scratch_needed;
scratch_bo_size = 0;
-   scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+
+   scratch_needed = config->scratch_bytes_per_wave *
+ (uint64_t)sctx->scratch_waves;
+
if (sctx->compute_scratch_buffer)
scratch_bo_size = sctx->compute_scratch_buffer->size;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 412a4c9..fef676b 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1679,7 +1679,7 @@ static bool si_update_spi_tmpring_size(struct si_context 
*sctx)
unsigned scratch_bytes_per_wave =
si_get_max_scratch_bytes_per_wave(sctx);
unsigned scratch_needed_size = scratch_bytes_per_wave *
-   sctx->scratch_waves;
+   (uint64_t)sctx->scratch_waves;
unsigned spi_tmpring_size;
int r;
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] radeonsi: Use winsys pb_buffer for scratch buffers.

2016-04-19 Thread Bas Nieuwenhuizen
Allows allocation of >= 4 GiB.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c   | 23 +++
 src/gallium/drivers/radeonsi/si_pipe.c  |  4 ++--
 src/gallium/drivers/radeonsi/si_pipe.h  |  4 ++--
 src/gallium/drivers/radeonsi/si_shader.c|  2 +-
 src/gallium/drivers/radeonsi/si_shader.h|  2 +-
 src/gallium/drivers/radeonsi/si_state_draw.c|  6 +++---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 18 +-
 7 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 905c169..b4981b4 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -203,30 +203,29 @@ static bool si_setup_compute_scratch_buffer(struct 
si_context *sctx,
scratch_bo_size = 0;
scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
if (sctx->compute_scratch_buffer)
-   scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
+   scratch_bo_size = sctx->compute_scratch_buffer->size;
 
if (scratch_bo_size < scratch_needed) {
-   pipe_resource_reference(
-   (struct pipe_resource**)>compute_scratch_buffer,
-   NULL);
+   pb_reference(>compute_scratch_buffer, NULL);
 
-   sctx->compute_scratch_buffer =
-   si_resource_create_custom(>screen->b.b,
-PIPE_USAGE_DEFAULT, scratch_needed);
+   sctx->compute_scratch_buffer = 
sctx->b.ws->buffer_create(sctx->b.ws,
+   scratch_needed, 256, false, RADEON_DOMAIN_VRAM,
+   RADEON_FLAG_NO_CPU_ACCESS);
 
if (!sctx->compute_scratch_buffer)
return false;
}
 
if (sctx->compute_scratch_buffer != shader->scratch_bo && 
scratch_needed) {
-   uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+   uint64_t scratch_va = sctx->b.ws->buffer_get_virtual_address(
+ sctx->compute_scratch_buffer);
 
si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
 
if (si_shader_binary_upload(sctx->screen, shader))
return false;
 
-   r600_resource_reference(>scratch_bo,
+   pb_reference(>scratch_bo,
sctx->compute_scratch_buffer);
}
 
@@ -282,9 +281,9 @@ static bool si_switch_compute_shader(struct si_context 
*sctx,
config->scratch_bytes_per_wave *
sctx->scratch_waves);
 
-   radeon_add_to_buffer_list(>b, >b.gfx,
- shader->scratch_bo, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SCRATCH_BUFFER);
+   sctx->b.ws->cs_add_buffer(sctx->b.gfx.cs, shader->scratch_bo,
+ RADEON_USAGE_READWRITE, 
RADEON_DOMAIN_VRAM,
+ RADEON_PRIO_SCRATCH_BUFFER);
}
 
shader_va = shader->bo->gpu_address + offset;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 17d59b6..89df7db 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -51,8 +51,8 @@ static void si_destroy_context(struct pipe_context *context)
pipe_resource_reference(>null_const_buf.buffer, NULL);
r600_resource_reference(>border_color_buffer, NULL);
free(sctx->border_color_table);
-   r600_resource_reference(>scratch_buffer, NULL);
-   r600_resource_reference(>compute_scratch_buffer, NULL);
+   pb_reference(>scratch_buffer, NULL);
+   pb_reference(>compute_scratch_buffer, NULL);
sctx->b.ws->fence_reference(>last_gfx_fence, NULL);
 
si_pm4_free_state(sctx, sctx->init_config, ~0);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index a28c7d70..6ff6c3e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -300,12 +300,12 @@ struct si_context {
unsignedlast_gsvs_itemsize;
 
/* Scratch buffer */
-   struct r600_resource*scratch_buffer;
+   struct pb_buffer*scratch_buffer;
boolean emit_scratch_reloc;
unsignedscratch_waves;
unsignedspi_tmpring_size;
 
-   struct r600_resource*compute_scratch_buffer;
+   struct pb_buffer*comp

[Mesa-dev] [PATCH 1/4] radeonsi: use CE suballocator for CP DMA realignment.

2016-04-19 Thread Bas Nieuwenhuizen
Use the CE suballocator instead of the normal one as the usage
is most similar to the CE, i.e. only read and written on GPU
and not mapped to CPU.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_cp_dma.c | 27 ++-
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 38e0ee6..264789d 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -222,31 +222,24 @@ static void si_clear_buffer(struct pipe_context *ctx, 
struct pipe_resource *dst,
  */
 static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
 {
+
uint64_t va;
unsigned dma_flags = 0;
unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+   unsigned offset;
+   struct r600_resource *tmp_buf;
 
assert(size < CP_DMA_ALIGNMENT);
 
-   /* Use the scratch buffer as the dummy buffer. The 3D engine should be
-* idle at this point.
-*/
-   if (!sctx->scratch_buffer ||
-   sctx->scratch_buffer->b.b.width0 < scratch_size) {
-   r600_resource_reference(>scratch_buffer, NULL);
-   sctx->scratch_buffer =
-   si_resource_create_custom(>screen->b.b,
- PIPE_USAGE_DEFAULT,
- scratch_size);
-   if (!sctx->scratch_buffer)
-   return;
-   sctx->emit_scratch_reloc = true;
-   }
+   u_suballocator_alloc(sctx->ce_suballocator, scratch_size, ,
+(struct pipe_resource**)_buf);
+   if (!tmp_buf)
+   return;
 
-   si_cp_dma_prepare(sctx, >scratch_buffer->b.b,
- >scratch_buffer->b.b, size, size, _flags);
+   si_cp_dma_prepare(sctx, _buf->b.b,
+ _buf->b.b, size, size, _flags);
 
-   va = sctx->scratch_buffer->gpu_address;
+   va = tmp_buf->gpu_address + offset;
si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
   dma_flags);
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] radeonsi: Print a message when scratch allocation fails.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c   | 5 -
 src/gallium/drivers/radeonsi/si_state_shaders.c | 5 -
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index b46a2fe..7d91ac6 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -215,8 +215,11 @@ static bool si_setup_compute_scratch_buffer(struct 
si_context *sctx,
scratch_needed, 256, false, RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
 
-   if (!sctx->compute_scratch_buffer)
+   if (!sctx->compute_scratch_buffer) {
+   fprintf(stderr, "Warning: Failed to allocate the "
+   "scratch buffer\n");
return false;
+   }
}
 
if (sctx->compute_scratch_buffer != shader->scratch_bo && 
scratch_needed) {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index fef676b..2396b8e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1692,8 +1692,11 @@ static bool si_update_spi_tmpring_size(struct si_context 
*sctx)
scratch_needed_size, 256, 
false,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
-   if (!sctx->scratch_buffer)
+   if (!sctx->scratch_buffer) {
+   fprintf(stderr, "Warning: Failed to allocate 
the "
+   "scratch buffer\n");
return false;
+   }
sctx->emit_scratch_reloc = true;
}
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] winsys/amdgpu: clean up and fix switch statement

2016-04-19 Thread Bas Nieuwenhuizen
On Wed, Apr 20, 2016 at 1:13 AM, Grigori Goronzy  wrote:
> Add missing break, add default case. Additionally initialize variables
> to avoid compiler warnings.
> ---
>  src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
> b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index b9a7c5b..d978a0d 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -202,12 +202,13 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
> struct amdgpu_ib *ib,
>struct amdgpu_cs_ib_info *info, unsigned 
> ib_type)
>  {
> struct amdgpu_winsys *aws = (struct amdgpu_winsys *)ws;
> -   unsigned buffer_size, ib_size;
> +   unsigned buffer_size = 0, ib_size = 0;
>
> switch (ib_type) {
> case IB_CONST_PREAMBLE:
>buffer_size = 4 * 1024 * 4;
>ib_size = 1024 * 4;
> +  break;
> case IB_CONST:
>buffer_size = 512 * 1024 * 4;
>ib_size = 128 * 1024 * 4;
> @@ -225,6 +226,9 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
> struct amdgpu_ib *ib,
>ib_size = 32 * 1024 * 4;
>if (aws->buffer_wait_time_avg > IB_SIZE_WAIT_THRESHOLD_NS)
>   ib_size = 10 * 1024 * 4;
> +  break;
> +   default:
> +  assert(!"unreachable");

You can use the unreachable() macro; it also puts a
__builtin_unreachable() in there, so the compiler won't complain about
that path anymore.

- Bas

> }
>
> ib->base.cdw = 0;
> --
> 1.9.1
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] radeonsi: use CE suballocator for CP DMA realignment.

2016-04-19 Thread Bas Nieuwenhuizen
On Wed, Apr 20, 2016 at 2:13 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote:
> On 19.04.2016 18:29, Bas Nieuwenhuizen wrote:
>>
>> I retract patch 1 and 2. Large scratch buffers are nice, but the
>> hardware only supports a 32-bit offset into it.
>
>
> Do you mean patch 2 and 3? Do you plan alternative patches to error out when
> there is an integer overflow? That's still kind of important...
>
> Cheers,
> Nicolai

Really, patch 1 and 2. I did patch 1 only so I did not need to make
the whole cp_dma work with pb_buffer.

Although I guess patch 3 can best be merged with the to-be-written
patch that checks that the resulting size fits in 32 bits.

- Bas

>
>>
>> - Bas
>>
>> On Wed, Apr 20, 2016 at 12:50 AM, Bas Nieuwenhuizen
>> <b...@basnieuwenhuizen.nl> wrote:
>>>
>>> Use the CE suballocator instead of the normal one as the usage
>>> is most similar to the CE, i.e. only read and written on GPU
>>> and not mapped to CPU.
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>>> ---
>>>   src/gallium/drivers/radeonsi/si_cp_dma.c | 27
>>> ++-
>>>   1 file changed, 10 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> index 38e0ee6..264789d 100644
>>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> @@ -222,31 +222,24 @@ static void si_clear_buffer(struct pipe_context
>>> *ctx, struct pipe_resource *dst,
>>>*/
>>>   static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned
>>> size)
>>>   {
>>> +
>>>  uint64_t va;
>>>  unsigned dma_flags = 0;
>>>  unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
>>> +   unsigned offset;
>>> +   struct r600_resource *tmp_buf;
>>>
>>>  assert(size < CP_DMA_ALIGNMENT);
>>>
>>> -   /* Use the scratch buffer as the dummy buffer. The 3D engine
>>> should be
>>> -* idle at this point.
>>> -*/
>>> -   if (!sctx->scratch_buffer ||
>>> -   sctx->scratch_buffer->b.b.width0 < scratch_size) {
>>> -   r600_resource_reference(>scratch_buffer, NULL);
>>> -   sctx->scratch_buffer =
>>> -   si_resource_create_custom(>screen->b.b,
>>> - PIPE_USAGE_DEFAULT,
>>> - scratch_size);
>>> -   if (!sctx->scratch_buffer)
>>> -   return;
>>> -   sctx->emit_scratch_reloc = true;
>>> -   }
>>> +   u_suballocator_alloc(sctx->ce_suballocator, scratch_size,
>>> ,
>>> +(struct pipe_resource**)_buf);
>>> +   if (!tmp_buf)
>>> +   return;
>>>
>>> -   si_cp_dma_prepare(sctx, >scratch_buffer->b.b,
>>> - >scratch_buffer->b.b, size, size,
>>> _flags);
>>> +   si_cp_dma_prepare(sctx, _buf->b.b,
>>> + _buf->b.b, size, size, _flags);
>>>
>>> -   va = sctx->scratch_buffer->gpu_address;
>>> +   va = tmp_buf->gpu_address + offset;
>>>  si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT,
>>> size,
>>> dma_flags);
>>>   }
>>> --
>>> 2.8.0
>>>
>> ___
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] glsl: Use correct mode for split components.

2016-04-20 Thread Bas Nieuwenhuizen
The mode should stay the same as the original struct. In
particular, shared should not be changed to temporary.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/compiler/glsl/opt_structure_splitting.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/glsl/opt_structure_splitting.cpp 
b/src/compiler/glsl/opt_structure_splitting.cpp
index 0d18a2f..f4c129e 100644
--- a/src/compiler/glsl/opt_structure_splitting.cpp
+++ b/src/compiler/glsl/opt_structure_splitting.cpp
@@ -351,7 +351,7 @@ do_structure_splitting(exec_list *instructions)
 entry->components[i] =
new(entry->mem_ctx) ir_variable(type->fields.structure[i].type,
name,
-   ir_var_temporary);
+   (ir_variable_mode) 
entry->var->data.mode);
 entry->var->insert_before(entry->components[i]);
   }
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] st/mesa: Use correct size for compute CAPs.

2016-04-20 Thread Bas Nieuwenhuizen
Some CAPs are stored as 64-bit values while Mesa stores
the related constants as 32-bit values.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/mesa/state_tracker/st_extensions.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c
index 939f15d..3f769b6 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -1152,6 +1152,7 @@ void st_init_extensions(struct pipe_screen *screen,
   PIPE_SHADER_CAP_SUPPORTED_IRS);
   if (compute_supported_irs & (1 << PIPE_SHADER_IR_TGSI)) {
  uint64_t grid_size[3], block_size[3];
+ uint64_t max_local_size, max_threads_per_block;
 
  screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
PIPE_COMPUTE_CAP_MAX_GRID_SIZE, grid_size);
@@ -1159,10 +1160,13 @@ void st_init_extensions(struct pipe_screen *screen,
PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE, 
block_size);
  screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
-   >MaxComputeWorkGroupInvocations);
+   _threads_per_block);
  screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE,
-   >MaxComputeSharedMemorySize);
+   _local_size);
+
+ consts->MaxComputeWorkGroupInvocations = max_threads_per_block;
+ consts->MaxComputeSharedMemorySize = max_local_size;
 
  for (i = 0; i < 3; i++) {
 consts->MaxComputeWorkGroupCount[i] = grid_size[i];
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 09/20] radeonsi: don't pass scratch buffer to user SGPRs

2016-04-13 Thread Bas Nieuwenhuizen
As far as I can see we use relocations for clover too.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 27f779a..8bb101a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -285,7 +285,6 @@ static void si_launch_grid(
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_compute *program = sctx->cs_shader_state.program;
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-   uint64_t scratch_buffer_va = 0;
uint64_t shader_va;
unsigned i;
struct si_shader *shader = >shader;
@@ -327,15 +326,8 @@ static void si_launch_grid(
  shader->scratch_bo,
  RADEON_USAGE_READWRITE,
  RADEON_PRIO_SCRATCH_BUFFER);
-
-   scratch_buffer_va = shader->scratch_bo->gpu_address;
}
 
-   si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, 
scratch_buffer_va);
-   si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
-   S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
-   |  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
-
si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0);
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 01/20] radeonsi: lower compute shader arguments

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 41 
 src/gallium/drivers/radeonsi/si_shader.h |  7 ++
 2 files changed, 48 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index c58467d..1ccdcac 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1282,6 +1282,36 @@ static void declare_system_value(
value = get_primitive_id(_bld->soa.bld_base, 0);
break;
 
+   case TGSI_SEMANTIC_GRID_SIZE:
+   value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
+   break;
+
+   case TGSI_SEMANTIC_BLOCK_SIZE:
+   {
+   LLVMValueRef values[3];
+   unsigned i;
+   unsigned *properties = ctx->shader->selector->info.properties;
+   unsigned sizes[3] = {
+   properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+   properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+   properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
+   };
+
+   for (i = 0; i < 3; ++i)
+   values[i] = lp_build_const_int32(gallivm, sizes[i]);
+
+   value = lp_build_gather_values(gallivm, values, 3);
+   break;
+   }
+
+   case TGSI_SEMANTIC_BLOCK_ID:
+   value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
+   break;
+
+   case TGSI_SEMANTIC_THREAD_ID:
+   value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
+   break;
+
default:
assert(!"unknown system value");
return;
@@ -4823,6 +4853,14 @@ static void create_function(struct si_shader_context 
*ctx)
}
break;
 
+   case TGSI_PROCESSOR_COMPUTE:
+   params[SI_PARAM_GRID_SIZE] = v3i32;
+   params[SI_PARAM_BLOCK_ID] = v3i32;
+   last_sgpr = SI_PARAM_BLOCK_ID;
+
+   params[SI_PARAM_THREAD_ID] = v3i32;
+   num_params = SI_PARAM_THREAD_ID + 1;
+   break;
default:
assert(0 && "unimplemented shader");
return;
@@ -5600,6 +5638,7 @@ void si_dump_shader_key(unsigned shader, union 
si_shader_key *key, FILE *f)
break;
 
case PIPE_SHADER_GEOMETRY:
+   case PIPE_SHADER_COMPUTE:
break;
 
case PIPE_SHADER_FRAGMENT:
@@ -5784,6 +5823,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
else
bld_base->emit_epilogue = si_llvm_return_fs_outputs;
break;
+   case TGSI_PROCESSOR_COMPUTE:
+   break;
default:
assert(!"Unsupported shader type");
return -1;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 013c8a2..5043d43 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -91,6 +91,7 @@ struct radeon_shader_reloc;
 #define SI_SGPR_TCS_OUT_LAYOUT 11 /* TCS & TES only */
 #define SI_SGPR_TCS_IN_LAYOUT  12 /* TCS only */
 #define SI_SGPR_ALPHA_REF  10 /* PS only */
+#define SI_SGPR_GRID_SIZE  10 /* CS only */
 
 #define SI_VS_NUM_USER_SGPR 15 /* API VS */
 #define SI_ES_NUM_USER_SGPR 14 /* API VS */
@@ -100,6 +101,7 @@ struct radeon_shader_reloc;
 #define SI_GS_NUM_USER_SGPR 10
 #define SI_GSCOPY_NUM_USER_SGPR 4
 #define SI_PS_NUM_USER_SGPR 11
+#define SI_CS_NUM_USER_SGPR 13
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS 0
@@ -173,6 +175,11 @@ struct radeon_shader_reloc;
 #define SI_PARAM_SAMPLE_COVERAGE   21
 #define SI_PARAM_POS_FIXED_PT  22
 
+/* CS only parameters */
+#define SI_PARAM_GRID_SIZE 5
+#define SI_PARAM_BLOCK_ID  6
+#define SI_PARAM_THREAD_ID 7
+
 #define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */
 
 struct si_shader;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 10/20] radeonsi: do per cs setup for compute shaders once per cs

2016-04-13 Thread Bas Nieuwenhuizen
Also removes PKT3_CONTEXT_CONTROL as that is already being done
by si_begin_new_cs, when emitting init_config.

v2: - Use radeon_set_sh_reg_seq.
- Also set COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 for CIK+

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c| 77 
 src/gallium/drivers/radeonsi/si_hw_context.c |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h   |  1 +
 3 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 8bb101a..4db436e 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -228,6 +228,47 @@ static unsigned compute_num_waves_for_scratch(
return scratch_waves;
 }
 
+static void si_initialize_compute(struct si_context *sctx)
+{
+   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+
+   radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
+   radeon_emit(cs, 0);
+   radeon_emit(cs, 0);
+   radeon_emit(cs, 0);
+
+   radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
+   radeon_emit(cs, 0);
+   /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
+   radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | 
S_00B858_SH1_CU_EN(0xffff));
+   radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | 
S_00B85C_SH1_CU_EN(0xffff));
+
+   if (sctx->b.chip_class >= CIK) {
+   /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
+   radeon_set_sh_reg_seq(cs,
+R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 
2);
+   radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
+   S_00B864_SH1_CU_EN(0xffff));
+   radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
+   S_00B868_SH1_CU_EN(0xffff));
+   }
+
+   /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
+* and is now per pipe, so it should be handled in the
+* kernel if we want to use something other than the default value,
+* which is now 0x22f.
+*/
+   if (sctx->b.chip_class <= SI) {
+   /* XXX: This should be:
+* (number of compute units) * 4 * (waves per simd) - 1 */
+
+   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
+ 0x190 /* Default value */);
+   }
+
+   sctx->cs_shader_state.initialized = true;
+}
+
 static void si_upload_compute_input(struct si_context *sctx,
   const struct pipe_grid_info *info)
 {
@@ -282,7 +323,6 @@ static void si_launch_grid(
struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
struct si_context *sctx = (struct si_context*)ctx;
-   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_compute *program = sctx->cs_shader_state.program;
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
uint64_t shader_va;
@@ -291,9 +331,10 @@ static void si_launch_grid(
unsigned lds_blocks;
unsigned num_waves_for_scratch;
 
-   radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0) | 
PKT3_SHADER_TYPE_S(1));
-   radeon_emit(cs, 0x80000000);
-   radeon_emit(cs, 0x80000000);
+   si_need_cs_space(sctx);
+
+   if (!sctx->cs_shader_state.initialized)
+   si_initialize_compute(sctx);
 
sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
 SI_CONTEXT_INV_GLOBAL_L2 |
@@ -328,10 +369,6 @@ static void si_launch_grid(
  RADEON_PRIO_SCRATCH_BUFFER);
}
 
-   si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
-   si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
-   si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0);
-
si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X,
S_00B81C_NUM_THREAD_FULL(info->block[0]));
si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y,
@@ -351,19 +388,6 @@ static void si_launch_grid(
  RADEON_PRIO_COMPUTE_GLOBAL);
}
 
-   /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
-* and is now per pipe, so it should be handled in the
-* kernel if we want to use something other than the default value,
-* which is now 0x22f.
-*/
-   if (sctx->b.chip_class <= SI) {
-   /* XXX: This should be:
-* (number of compute units) * 4 * (waves per simd) - 1 */
-
-   si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID,
-   0x190 /* Default value */);
-   }
-
shader_va = shader->bo->gpu_address;
shader_va += info->pc;
 
@@ -392,17 +416,6

[Mesa-dev] [PATCH v2 06/20] radeonsi: update shader count for compute shaders

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 6748f80..4d4823f 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,7 +30,8 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
-#define SI_NUM_SHADERS (PIPE_SHADER_TESS_EVAL+1)
+#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1)
+#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1)
 #define SI_MAX_ATTRIBS 16
 
 struct si_screen;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 15/20] radeonsi: split texture decompression for compute shaders

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c   | 13 +++--
 src/gallium/drivers/radeonsi/si_compute.c|  2 ++
 src/gallium/drivers/radeonsi/si_pipe.h   |  3 ++-
 src/gallium/drivers/radeonsi/si_state_draw.c |  2 +-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c 
b/src/gallium/drivers/radeonsi/si_blit.c
index 54da7a2..7ca0e23 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -374,7 +374,8 @@ si_decompress_image_color_textures(struct si_context *sctx,
}
 }
 
-void si_decompress_textures(struct si_context *sctx)
+static void si_decompress_textures(struct si_context *sctx, int shader_start,
+   int shader_end)
 {
unsigned compressed_colortex_counter;
 
@@ -389,7 +390,7 @@ void si_decompress_textures(struct si_context *sctx)
}
 
/* Flush depth textures which need to be flushed. */
-   for (int i = 0; i < SI_NUM_SHADERS; i++) {
+   for (int i = shader_start; i < shader_end; i++) {
if (sctx->samplers[i].depth_texture_mask) {
si_flush_depth_textures(sctx, &sctx->samplers[i]);
}
@@ -402,6 +403,14 @@ void si_decompress_textures(struct si_context *sctx)
}
 }
 
+void si_decompress_graphics_textures(struct si_context *sctx) {
+   si_decompress_textures(sctx, 0, SI_NUM_GRAPHICS_SHADERS);
+}
+
+void si_decompress_compute_textures(struct si_context *sctx) {
+   si_decompress_textures(sctx, SI_NUM_GRAPHICS_SHADERS, SI_NUM_SHADERS);
+}
+
 static void si_clear(struct pipe_context *ctx, unsigned buffers,
 const union pipe_color_union *color,
 double depth, unsigned stencil)
diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 0b248cb..dee0b3a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -432,6 +432,8 @@ static void si_launch_grid(
struct si_compute *program = sctx->cs_shader_state.program;
int i;
 
+   si_decompress_compute_textures(sctx);
+
si_need_cs_space(sctx);
 
if (!sctx->cs_shader_state.initialized)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 4a06854..9b2be6f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -332,7 +332,8 @@ void cik_sdma_copy(struct pipe_context *ctx,
 
 /* si_blit.c */
 void si_init_blit_functions(struct si_context *sctx);
-void si_decompress_textures(struct si_context *sctx);
+void si_decompress_graphics_textures(struct si_context *sctx);
+void si_decompress_compute_textures(struct si_context *sctx);
 void si_resource_copy_region(struct pipe_context *ctx,
 struct pipe_resource *dst,
 unsigned dst_level,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 40cad50..ac33e45 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -787,7 +787,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
}
 
-   si_decompress_textures(sctx);
+   si_decompress_graphics_textures(sctx);
 
/* Set the rasterization primitive type.
 *
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 02/20] radeonsi: add shared memory

2016-04-13 Thread Bas Nieuwenhuizen
Declares the shared memory as a global variable so that
LLVM is aware of it and it does not conflict with passes
like AMDGPUPromoteAlloca.

v2: - Use ctx->i8.
- Dropped null-check for declare_memory_region.
- Changed memory region array to single region.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/radeon_llvm.h   |  3 +++
 .../drivers/radeon/radeon_setup_tgsi_llvm.c|  4 
 src/gallium/drivers/radeonsi/si_shader.c   | 27 ++
 src/gallium/drivers/radeonsi/si_shader.h   |  3 +++
 4 files changed, 37 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h 
b/src/gallium/drivers/radeon/radeon_llvm.h
index 0a164bb..3e11b36 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -68,6 +68,9 @@ struct radeon_llvm_context {
unsigned index,
const struct tgsi_full_declaration *decl);
 
+   void (*declare_memory_region)(struct radeon_llvm_context *,
+   const struct tgsi_full_declaration *decl);
+
/** This array contains the input values for the shader.  Typically 
these
  * values will be in the form of a target intrinsic that will inform 
the
  * backend how to load the actual inputs to the shader. 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index fb883cb..0828197 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -366,6 +366,10 @@ static void emit_declaration(
break;
}
 
+   case TGSI_FILE_MEMORY:
+   ctx->declare_memory_region(ctx, decl);
+   break;
+
default:
break;
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 1ccdcac..5a76435 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -121,6 +121,8 @@ struct si_shader_context
LLVMTypeRef v4i32;
LLVMTypeRef v4f32;
LLVMTypeRef v8i32;
+
+   LLVMValueRef shared_memory;
 };
 
 static struct si_shader_context *si_shader_context(
@@ -1320,6 +1322,30 @@ static void declare_system_value(
radeon_bld->system_values[index] = value;
 }
 
+static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
+   const struct tgsi_full_declaration *decl)
+{
+   struct si_shader_context *ctx =
+   si_shader_context(&radeon_bld->soa.bld_base);
+   struct si_shader_selector *sel = ctx->shader->selector;
+   struct gallivm_state *gallivm = &radeon_bld->gallivm;
+
+   LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
+   LLVMValueRef var;
+
+   assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
+   assert(decl->Range.First == decl->Range.Last);
+   assert(!ctx->shared_memory);
+
+   var = LLVMAddGlobalInAddressSpace(gallivm->module,
+ LLVMArrayType(ctx->i8, 
sel->local_size),
+ "compute_lds",
+ LOCAL_ADDR_SPACE);
+   LLVMSetAlignment(var, 4);
+
+   ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
+}
+
 static LLVMValueRef fetch_constant(
struct lp_build_tgsi_context *bld_base,
const struct tgsi_full_src_register *reg,
@@ -5824,6 +5850,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
bld_base->emit_epilogue = si_llvm_return_fs_outputs;
break;
case TGSI_PROCESSOR_COMPUTE:
+   ctx.radeon_bld.declare_memory_region = declare_compute_memory;
break;
default:
assert(!"Unsupported shader type");
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 5043d43..70ae46f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -222,6 +222,9 @@ struct si_shader_selector {
 */
unsigned colors_written_4bit;
 
+   /* CS parameters */
+   unsigned local_size;
+
/* masks of "get_unique_index" bits */
uint64_t outputs_written;
uint32_t patch_outputs_written;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] winsys/amdgpu: Silence possibly uninitialized variable warning.

2016-04-21 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index bbd29fc..69fb9bb 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -219,6 +219,9 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
struct amdgpu_ib *ib,
case IB_MAIN:
   buffer_size = 128 * 1024 * 4;
   ib_size = 20 * 1024 * 4;
+  break;
+   default:
+  unreachable("unhandled IB type");
}
 
ib->base.cdw = 0;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] gallium/radeon: Silence possibly uninitialized variable warning.

2016-04-21 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c 
b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 7174132..d3f5ae3 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -100,7 +100,7 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
calling_conv = RADEON_LLVM_AMDGPU_CS;
break;
default:
-   assert(0);
+   unreachable("Unhandle shader type");
}
 
if (HAVE_LLVM >= 0x309)
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 02/10] radeonsi: make RW buffer descriptor array global, not per shader stage

2016-04-21 Thread Bas Nieuwenhuizen
On Wed, Apr 20, 2016 at 5:47 PM, Marek Olšák <mar...@gmail.com> wrote:
> From: Marek Olšák 
>
> ---
>  src/gallium/drivers/radeonsi/si_descriptors.c | 50 
> +--
>  src/gallium/drivers/radeonsi/si_pipe.h|  2 +-
>  2 files changed, 25 insertions(+), 27 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
> b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 01cf79e..c802b1e 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -900,7 +900,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
> shader, uint slot,
> unsigned element_size, unsigned index_stride, 
> uint64_t offset)
>  {
> struct si_context *sctx = (struct si_context *)ctx;
> -   struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
> +   struct si_buffer_resources *buffers = &sctx->rw_buffers;
>
> if (shader >= SI_NUM_SHADERS)
> return;

I think it would be nice to remove this check and the shader argument
to this function.

> @@ -994,7 +994,7 @@ static void si_set_streamout_targets(struct pipe_context 
> *ctx,
>  const unsigned *offsets)
>  {
> struct si_context *sctx = (struct si_context *)ctx;
> -   struct si_buffer_resources *buffers = 
> &sctx->rw_buffers[PIPE_SHADER_VERTEX];
> +   struct si_buffer_resources *buffers = &sctx->rw_buffers;
> unsigned old_num_targets = sctx->b.streamout.num_targets;
> unsigned i, bufidx;
>
> @@ -1198,7 +1198,7 @@ static void si_invalidate_buffer(struct pipe_context 
> *ctx, struct pipe_resource
>
> /* Read/Write buffers. */
> for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
> -   struct si_buffer_resources *buffers = 
> &sctx->rw_buffers[shader];
> +   struct si_buffer_resources *buffers = &sctx->rw_buffers;
> uint64_t mask = buffers->desc.enabled_mask;

Looping over the shaders here does not seem necessary. Also, the check in
the loop is wrong when we add the PS entries to the rw_buffers set, as
all of the entries are first processed at the VS.

- Bas

>
> while (mask) {
> @@ -1289,7 +1289,6 @@ static void si_mark_shader_pointers_dirty(struct 
> si_context *sctx,
>   unsigned shader)
>  {
> sctx->const_buffers[shader].desc.pointer_dirty = true;
> -   sctx->rw_buffers[shader].desc.pointer_dirty = true;
> sctx->shader_buffers[shader].desc.pointer_dirty = true;
> sctx->samplers[shader].views.desc.pointer_dirty = true;
> sctx->images[shader].desc.pointer_dirty = true;
> @@ -1307,6 +1306,7 @@ static void si_shader_userdata_begin_new_cs(struct 
> si_context *sctx)
> for (i = 0; i < SI_NUM_SHADERS; i++) {
> si_mark_shader_pointers_dirty(sctx, i);
> }
> +   sctx->rw_buffers.desc.pointer_dirty = true;
>  }
>
>  /* Set a base register address for user data constants in the given shader.
> @@ -1385,22 +1385,23 @@ void si_emit_graphics_shader_userdata(struct 
> si_context *sctx,
> uint32_t *sh_base = sctx->shader_userdata.sh_base;
>
> if (sctx->gs_shader.cso) {
> -   /* The VS copy shader needs these for clipping, streamout, 
> and rings. */
> +   /* The VS copy shader needs this for clipping. */
> unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
> unsigned i = PIPE_SHADER_VERTEX;
>
> si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, 
> vs_base, true);
> -   si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, 
> vs_base, true);
> +   }
>
> -   if (sctx->tes_shader.cso) {
> -   /* The TESSEVAL shader needs this for the ESGS ring 
> buffer. */
> -   si_emit_shader_pointer(sctx, 
> &sctx->rw_buffers[i].desc,
> -  
> R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
> -   }
> -   } else if (sctx->tes_shader.cso) {
> -   /* The TESSEVAL shader needs this for streamout. */
> -   si_emit_shader_pointer(sctx, 
> &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
> +   if (sctx->rw_buffers.desc.pointer_dirty) {
> +   si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
>R_00B130_SPI_SHADER_USER_DATA_VS_0, 
> true);
> +   si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
> +  R_00B230_SPI_SHADER_USER_DATA_GS_0, 
> true);
> +   si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
> +  R_00B330_SPI_SHADER_USER_DATA_ES_0, 
> true);
> +   si_emit_shader_pointer(sctx, &sctx->rw_buffers.desc,
> +  R_00B430_SPI_SHADER_USER_DATA_HS_0, 
> true);
> +   sctx->rw_buffers.desc.pointer_dirty = false;
> 

[Mesa-dev] [PATCH 1/2] radeonsi: Use defines for CONTEXT_CONTROL instead of magic values.

2016-04-20 Thread Bas Nieuwenhuizen
I have no source for the actual names of these fields, as they are
not in the kernel headers. I hope they are clear though.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_state.c | 4 ++--
 src/gallium/drivers/radeonsi/sid.h  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 305a70b..bd9a45c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3821,8 +3821,8 @@ static void si_init_config(struct si_context *sctx)
return;
 
si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
-   si_pm4_cmd_add(pm4, 0x80000000);
-   si_pm4_cmd_add(pm4, 0x80000000);
+   si_pm4_cmd_add(pm4, LOAD_CONTROL_UPDATE(1));
+   si_pm4_cmd_add(pm4, SHADOW_ENABLE_UPDATE(1));
si_pm4_cmd_end(pm4, false);
 
si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
diff --git a/src/gallium/drivers/radeonsi/sid.h 
b/src/gallium/drivers/radeonsi/sid.h
index 516e114..6fb12f62 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -88,6 +88,9 @@
 #define PKT3_INDEX_BASE 0x26
 #define PKT3_DRAW_INDEX_2  0x27
 #define PKT3_CONTEXT_CONTROL   0x28
+#define LOAD_CONTROL_UPDATE(x) (((x) & 0x1) << 31)
+#define LOAD_CONTROL_CE_RAM_EN(x)  (((x) & 0x1) << 28)
+#define SHADOW_ENABLE_UPDATE(x) (((x) & 0x1) << 31)
 #define PKT3_INDEX_TYPE 0x2A
 #define PKT3_DRAW_INDIRECT_MULTI   0x2C
 #define PKT3_DRAW_INDEX_AUTO 0x2D
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] radeonsi: Print a message when scratch allocation fails.

2016-04-21 Thread Bas Nieuwenhuizen
On Wed, Apr 20, 2016 at 8:33 AM,  <eocallag...@alterapraxis.com> wrote:
> On 2016-04-20 11:46, Nicolai Hähnle wrote:
>>
>> On 19.04.2016 17:50, Bas Nieuwenhuizen wrote:
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>>> ---
>>>   src/gallium/drivers/radeonsi/si_compute.c   | 5 -
>>>   src/gallium/drivers/radeonsi/si_state_shaders.c | 5 -
>>>   2 files changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>> index b46a2fe..7d91ac6 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -215,8 +215,11 @@ static bool si_setup_compute_scratch_buffer(struct
>>> si_context *sctx,
>>> scratch_needed, 256, false,
>>> RADEON_DOMAIN_VRAM,
>>> RADEON_FLAG_NO_CPU_ACCESS);
>>>
>>> -   if (!sctx->compute_scratch_buffer)
>>> +   if (!sctx->compute_scratch_buffer) {
>>> +   fprintf(stderr, "Warning: Failed to allocate the
>>> "
>>> +   "scratch buffer\n");
>>> return false;
>>> +   }
>>
>>
>> Here and below, please change the "Warning" into "radeonsi" so
>> unsuspecting users will be more likely to understand what's going on.
>> With that changed, the patch is
>>
>> Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
>
>
> Wait, why not use the std R600_ERR() macro that wraps fprintf() calls?

Because I didn't think of it. Looking through the source code, the
usage is pretty mixed. Is one preferred over the other?

- Bas
>
>>
>>> }
>>>
>>> if (sctx->compute_scratch_buffer != shader->scratch_bo &&
>>> scratch_needed) {
>>> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> index fef676b..2396b8e 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> @@ -1692,8 +1692,11 @@ static bool si_update_spi_tmpring_size(struct
>>> si_context *sctx)
>>> scratch_needed_size,
>>> 256, false,
>>> RADEON_DOMAIN_VRAM,
>>>
>>> RADEON_FLAG_NO_CPU_ACCESS);
>>> -   if (!sctx->scratch_buffer)
>>> +   if (!sctx->scratch_buffer) {
>>> +   fprintf(stderr, "Warning: Failed to
>>> allocate the "
>>> +   "scratch buffer\n");
>>> return false;
>>> +   }
>>> sctx->emit_scratch_reloc = true;
>>> }
>>>
>>>
>> ___
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/10] RadeonSI: cleanup RW shader slots

2016-04-21 Thread Bas Nieuwenhuizen
On Wed, Apr 20, 2016 at 5:47 PM, Marek Olšák <mar...@gmail.com> wrote:
> Hi,
>
> This moves all private shader resources to the RW buffer bindings, including 
> all driver constant buffers, and the poly stipple image (which is converted 
> into a constant buffer).
>
> RW buffer descriptors are made global, not per shader stage, so all shaders 
> receive the same pointer.
>
> Finally, all shader resource binding masks are shortened to 32 bits.
>
> Please review.

Except for patch 2, which I've commented on, the series is

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>

- Bas
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] gallium/ddebug: Add passthrough for get_compute_param.

2016-04-21 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/ddebug/dd_screen.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/drivers/ddebug/dd_screen.c 
b/src/gallium/drivers/ddebug/dd_screen.c
index fbc0bec..ebe090b 100644
--- a/src/gallium/drivers/ddebug/dd_screen.c
+++ b/src/gallium/drivers/ddebug/dd_screen.c
@@ -74,6 +74,17 @@ dd_screen_get_paramf(struct pipe_screen *_screen,
 }
 
 static int
+dd_screen_get_compute_param(struct pipe_screen *_screen,
+enum pipe_shader_ir ir_type,
+enum pipe_compute_cap param,
+void *ret)
+{
+   struct pipe_screen *screen = dd_screen(_screen)->screen;
+
+   return screen->get_compute_param(screen, ir_type, param, ret);
+}
+
+static int
 dd_screen_get_shader_param(struct pipe_screen *_screen, unsigned shader,
enum pipe_shader_cap param)
 {
@@ -319,6 +330,7 @@ ddebug_screen_create(struct pipe_screen *screen)
dscreen->base.get_device_vendor = dd_screen_get_device_vendor;
dscreen->base.get_param = dd_screen_get_param;
dscreen->base.get_paramf = dd_screen_get_paramf;
+   dscreen->base.get_compute_param = dd_screen_get_compute_param;
dscreen->base.get_shader_param = dd_screen_get_shader_param;
/* get_video_param */
/* get_compute_param */
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] gallium/ddebug: Implement launch_grid.

2016-04-21 Thread Bas Nieuwenhuizen
Does not implement dumping info.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/ddebug/dd_draw.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/src/gallium/drivers/ddebug/dd_draw.c 
b/src/gallium/drivers/ddebug/dd_draw.c
index 45e4e10..f0c8887 100644
--- a/src/gallium/drivers/ddebug/dd_draw.c
+++ b/src/gallium/drivers/ddebug/dd_draw.c
@@ -35,6 +35,7 @@
 enum call_type
 {
CALL_DRAW_VBO,
+   CALL_LAUNCH_GRID,
CALL_RESOURCE_COPY_REGION,
CALL_BLIT,
CALL_FLUSH_RESOURCE,
@@ -77,6 +78,7 @@ struct dd_call
 
union {
   struct pipe_draw_info draw_vbo;
+  struct pipe_grid_info launch_grid;
   struct call_resource_copy_region resource_copy_region;
   struct pipe_blit_info blit;
   struct pipe_resource *flush_resource;
@@ -372,6 +374,13 @@ dd_dump_draw_vbo(struct dd_context *dctx, struct 
pipe_draw_info *info, FILE *f)
 }
 
 static void
+dd_dump_launch_grid(struct dd_context *dctx, struct pipe_grid_info *info, FILE 
*f)
+{
+   fprintf(f, "%s:\n", __func__+8);
+   /* TODO */
+}
+
+static void
 dd_dump_resource_copy_region(struct dd_context *dctx,
  struct call_resource_copy_region *info,
  FILE *f)
@@ -485,6 +494,9 @@ dd_dump_call(struct dd_context *dctx, struct dd_call *call, 
unsigned flags)
case CALL_DRAW_VBO:
   dd_dump_draw_vbo(dctx, &call->info.draw_vbo, f);
   break;
+   case CALL_LAUNCH_GRID:
+  dd_dump_launch_grid(dctx, &call->info.launch_grid, f);
+  break;
case CALL_RESOURCE_COPY_REGION:
   dd_dump_resource_copy_region(dctx, &call->info.resource_copy_region, f);
   break;
@@ -649,6 +661,22 @@ dd_context_draw_vbo(struct pipe_context *_pipe,
 }
 
 static void
+dd_context_launch_grid(struct pipe_context *_pipe,
+   const struct pipe_grid_info *info)
+{
+   struct dd_context *dctx = dd_context(_pipe);
+   struct pipe_context *pipe = dctx->pipe;
+   struct dd_call call;
+
+   call.type = CALL_LAUNCH_GRID;
+   call.info.launch_grid = *info;
+
+   dd_before_draw(dctx);
+   pipe->launch_grid(pipe, info);
+   dd_after_draw(dctx, &call);
+}
+
+static void
 dd_context_resource_copy_region(struct pipe_context *_pipe,
 struct pipe_resource *dst, unsigned dst_level,
 unsigned dstx, unsigned dsty, unsigned dstz,
@@ -789,6 +817,7 @@ dd_init_draw_functions(struct dd_context *dctx)
 {
CTX_INIT(flush);
CTX_INIT(draw_vbo);
+   CTX_INIT(launch_grid);
CTX_INIT(resource_copy_region);
CTX_INIT(blit);
CTX_INIT(clear);
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] gallium/ddebug: Support compute states.

2016-04-21 Thread Bas Nieuwenhuizen
Note that compute states have a different struct than
the other shader states, so we cannot reuse the macro.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/ddebug/dd_context.c | 37 +
 1 file changed, 37 insertions(+)

diff --git a/src/gallium/drivers/ddebug/dd_context.c 
b/src/gallium/drivers/ddebug/dd_context.c
index 72a950a..731f6a5 100644
--- a/src/gallium/drivers/ddebug/dd_context.c
+++ b/src/gallium/drivers/ddebug/dd_context.c
@@ -295,6 +295,40 @@ DD_SHADER(GEOMETRY, gs)
 DD_SHADER(TESS_CTRL, tcs)
 DD_SHADER(TESS_EVAL, tes)
 
+static void * \
+dd_context_create_compute_state(struct pipe_context *_pipe,
+ const struct pipe_compute_state *state)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+   struct dd_state *hstate = CALLOC_STRUCT(dd_state);
+
+   if (!hstate)
+  return NULL;
+   hstate->cso = pipe->create_compute_state(pipe, state);
+   return hstate;
+}
+
+static void
+dd_context_bind_compute_state(struct pipe_context *_pipe, void *state)
+{
+   struct dd_context *dctx = dd_context(_pipe);
+   struct pipe_context *pipe = dctx->pipe;
+   struct dd_state *hstate = state;
+
+   dctx->shaders[PIPE_SHADER_COMPUTE] = hstate;
+   pipe->bind_compute_state(pipe, hstate ? hstate->cso : NULL);
+}
+
+static void
+dd_context_delete_compute_state(struct pipe_context *_pipe, void *state)
+{
+   struct dd_context *dctx = dd_context(_pipe);
+   struct pipe_context *pipe = dctx->pipe;
+   struct dd_state *hstate = state;
+
+   pipe->delete_compute_state(pipe, hstate->cso);
+   FREE(hstate);
+}
 
 /********************************************************************
  * immediate states
@@ -703,6 +737,9 @@ dd_context_create(struct dd_screen *dscreen, struct 
pipe_context *pipe)
CTX_INIT(create_tes_state);
CTX_INIT(bind_tes_state);
CTX_INIT(delete_tes_state);
+   CTX_INIT(create_compute_state);
+   CTX_INIT(bind_compute_state);
+   CTX_INIT(delete_compute_state);
CTX_INIT(create_vertex_elements_state);
CTX_INIT(bind_vertex_elements_state);
CTX_INIT(delete_vertex_elements_state);
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 09/12] radeonsi: Add CE uploader.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++
 src/gallium/drivers/radeonsi/si_pipe.c| 11 +++
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 3 files changed, 37 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index c41923d..e9458ec 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -61,6 +61,7 @@
 #include "sid.h"
 
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
 
 
@@ -133,6 +134,28 @@ static void si_release_descriptors(struct si_descriptors 
*desc)
FREE(desc->list);
 }
 
+/**
+ * Suballocate a chunk from sctx->ce_suballocator and emit a
+ * PKT3_DUMP_CONST_RAM packet on the CE IB that references it.
+ *
+ * \param ce_offset   value emitted as the packet's CE RAM offset
+ * \param size        size requested from the suballocator; emitted as size / 4
+ * \param out_offset  receives the offset of the chunk inside *out_buf
+ * \param out_buf     receives the buffer that backs the chunk
+ * \return false if no buffer was obtained, true once the packet is emitted
+ */
+static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
+unsigned *out_offset, struct r600_resource **out_buf) {
+   uint64_t va;
+
+   u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
+(struct pipe_resource**)out_buf);
+   /* The allocation result is *out_buf. out_buf itself is the caller's
+* pointer (dereferenced just below), so testing it would not detect a
+* failed allocation.
+*/
+   if (!*out_buf)
+   return false;
+
+   va = (*out_buf)->gpu_address + *out_offset;
+
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
+   radeon_emit(sctx->ce_ib, ce_offset);
+   radeon_emit(sctx->ce_ib, size / 4);
+   radeon_emit(sctx->ce_ib, va);
+   radeon_emit(sctx->ce_ib, va >> 32);
+
+   /* Request CE synchronization for the next draw. */
+   sctx->ce_need_synchronization = true;
+   return true;
+}
+
+
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index ddfa59f..ca07331 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -29,6 +29,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "radeon/radeon_uvd.h"
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "vl/vl_decoder.h"
 
 /*
@@ -41,6 +42,9 @@ static void si_destroy_context(struct pipe_context *context)
 
si_release_all_descriptors(sctx);
 
+   if (sctx->ce_suballocator)
+   u_suballocator_destroy(sctx->ce_suballocator);
+
pipe_resource_reference(>esgs_ring, NULL);
pipe_resource_reference(>gsvs_ring, NULL);
pipe_resource_reference(>tf_ring, NULL);
@@ -155,6 +159,13 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
if (!sctx->ce_preamble_ib)
goto fail;
}
+
+   sctx->ce_suballocator =
+   u_suballocator_create(>b.b, 1024 * 1024,
+ 64, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT, 
FALSE);
+   if (!sctx->ce_suballocator)
+   goto fail;
}
 
sctx->b.gfx.flush = si_context_gfx_flush;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index b3f5ed5..1540c7f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,6 +80,7 @@
 
 struct si_compute;
 struct hash_table;
+struct u_suballocator;
 
 struct si_screen {
struct r600_common_screen   b;
@@ -191,9 +192,11 @@ struct si_context {
void*custom_blend_dcc_decompress;
void*pstipple_sampler_state;
struct si_screen*screen;
+
struct radeon_winsys_cs *ce_ib;
struct radeon_winsys_cs *ce_preamble_ib;
boolce_need_synchronization;
+   struct u_suballocator   *ce_suballocator;
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 05/12] radeonsi: Create CE IB.

2016-04-19 Thread Bas Nieuwenhuizen
Based on work by Marek Olšák.

v2: Add preamble IB.

Leaves the load packet in the space calculation as the
radeon winsys might not be able to support a preamble.

The added space calculation may look expensive, but
is converted to a constant with (at least) -O2 and -O3.

v3: - Fix code style.
- Remove needed space for vertex buffer descriptors.
- Fail when the preamble cannot be created.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeonsi/si_hw_context.c  | 35 ++-
 src/gallium/drivers/radeonsi/si_pipe.c| 15 
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a7477ab..a8660f2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -402,6 +402,7 @@ static const struct debug_named_value 
common_debug_options[] = {
{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction 
Scheduler." },
{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders 
compiled on demand" },
+   { "noce", DBG_NO_CE, "Disable the constant engine"},
 
DEBUG_NAMED_VALUE_END /* must be last */
 };
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
b/src/gallium/drivers/radeon/r600_pipe_common.h
index ba390a9..44ab675 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -95,6 +95,7 @@
 #define DBG_NO_RB_PLUS (1llu << 45)
 #define DBG_SI_SCHED   (1llu << 46)
 #define DBG_MONOLITHIC_SHADERS (1llu << 47)
+#define DBG_NO_CE  (1llu << 48)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 #define R600_MAX_VIEWPORTS16
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index b621b55..5294898 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -26,10 +26,41 @@
 
 #include "si_pipe.h"
 
+static unsigned si_descriptor_list_cs_space(unsigned count, unsigned 
element_size)
+{
+   /* Ensure we have enough space to start a new range in a hole */
+   assert(element_size >= 3);
+
+   /* 5 dwords for possible load to reinitialize when we have no preamble
+* IB + 5 dwords for write to L2 + 3 dwords for every range written to
+* CE RAM.
+*/
+   return 5 + 5 + 3 + count * element_size;
+}
+
+static unsigned si_ce_needed_cs_space(void)
+{
+   unsigned space = 0;
+
+   space += si_descriptor_list_cs_space(SI_NUM_CONST_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS, 4);
+   space += si_descriptor_list_cs_space(SI_NUM_SAMPLERS, 16);
+   space += si_descriptor_list_cs_space(SI_NUM_IMAGES, 8);
+
+   space *= SI_NUM_SHADERS;
+
+   /* Increment CE counter packet */
+   space += 2;
+
+   return space;
+}
+
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+   struct radeon_winsys_cs *ce_ib = ctx->ce_ib;
struct radeon_winsys_cs *dma = ctx->b.dma.cs;
 
/* Flush the DMA IB if it's not empty. */
@@ -53,7 +84,9 @@ void si_need_cs_space(struct si_context *ctx)
/* If the CS is sufficiently large, don't count the space needed
 * and just flush if there is not enough space left.
 */
-   if (unlikely(cs->cdw > cs->max_dw - 2048))
+   if (unlikely(cs->cdw > cs->max_dw - 2048 ||
+ (ce_ib && ce_ib->max_dw - ce_ib->cdw <
+  si_ce_needed_cs_space(
ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 6a990ed..ddfa59f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -142,6 +142,21 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
 
sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
   si_context_gfx_flush, sctx);
+
+   if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib) {
+   sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
+   if (!sctx->ce_ib)
+   goto fail;
+
+   if (ws

[Mesa-dev] [PATCH v3 01/12] gallium/radeon: move ring_type into winsyses

2016-04-19 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

Not used by drivers.

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/radeon_winsys.h|  1 -
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c |  8 
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  1 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |  1 +
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 0c03652..aa94df6 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -229,7 +229,6 @@ struct radeon_winsys_cs {
 unsignedcdw;  /* Number of used dwords. */
 unsignedmax_dw; /* Maximum number of dwords. */
 uint32_t*buf; /* The command buffer. */
-enum ring_type  ring_type;
 };
 
 struct radeon_info {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a9fc55f..63c72fc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -348,7 +348,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
cs->ctx = ctx;
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
-   cs->base.ring_type = ring_type;
+   cs->ring_type = ring_type;
 
if (!amdgpu_init_cs_context(cs, ring_type)) {
   FREE(cs);
@@ -570,7 +570,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
cs->request.fence_info.handle = NULL;
if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != 
AMDGPU_HW_IP_VCE) {
cs->request.fence_info.handle = cs->ctx->user_fence_bo;
-   cs->request.fence_info.offset = cs->base.ring_type;
+   cs->request.fence_info.offset = cs->ring_type;
}
 
r = amdgpu_cs_submit(cs->ctx->ctx, 0, >request, 1);
@@ -591,7 +591,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
   amdgpu_fence_submitted(fence, >request, user_fence);
 
   for (i = 0; i < cs->num_buffers; i++)
- amdgpu_fence_reference(>buffers[i].bo->fence[cs->base.ring_type],
+ amdgpu_fence_reference(>buffers[i].bo->fence[cs->ring_type],
 fence);
}
pipe_mutex_unlock(ws->bo_fence_lock);
@@ -613,7 +613,7 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ctx->ws;
 
-   switch (cs->base.ring_type) {
+   switch (cs->ring_type) {
case RING_DMA:
   /* pad DMA ring to 8 DWs */
   while (rcs->cdw & 7)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index a2fb44a..f4709e9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -66,6 +66,7 @@ struct amdgpu_cs {
unsigned used_ib_space;
 
/* amdgpu_cs_submit parameters */
+   enum ring_type  ring_type;
struct amdgpu_cs_requestrequest;
struct amdgpu_cs_ib_infoib;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index b50e19c..6b2694c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -197,8 +197,8 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
 cs->csc = >csc1;
 cs->cst = >csc2;
 cs->base.buf = cs->csc->buf;
-cs->base.ring_type = ring_type;
 cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
+cs->ring_type = ring_type;
 
 p_atomic_inc(>num_cs);
 return >base;
@@ -281,7 +281,7 @@ static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
  * This doesn't have to be done if virtual memory is enabled,
  * because there is no offset patching with virtual memory.
  */
-if (cs->base.ring_type != RING_DMA || cs->ws->info.has_virtual_memory) 
{
+if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
 return i;
 }
 }
@@ -466,7 +466,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 struct radeon_cs_context *tmp;
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RING_DMA:
 /* pad DMA ring to 8 DWs */
 if (cs->ws->info.chip_class <= SI) {
@@ -526,7 +526,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 p_atomic_inc(>cst->relocs_bo[i].bo->num_active_ioctls);
 }
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RI

[Mesa-dev] [PATCH v3 10/12] radeonsi: Replace list_dirty with a mask.

2016-04-19 Thread Bas Nieuwenhuizen
We can then upload only the dirty ones with the constant engine.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 37 ---
 src/gallium/drivers/radeonsi/si_state.h   |  9 +--
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index e9458ec..a5018db 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -109,7 +109,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->list = CALLOC(num_elements, element_dw_size * 4);
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
-   desc->list_dirty = true; /* upload the list before the next draw */
+   desc->dirty_mask = num_elements == 64 ? ~0llu : (1llu << num_elements) 
- 1;
desc->shader_userdata_offset = shader_userdata_index * 4;
 
if (ce_offset) {
@@ -162,7 +162,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
void *ptr;
 
-   if (!desc->list_dirty)
+   if (!desc->dirty_mask)
return true;
 
u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
@@ -176,7 +176,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
-   desc->list_dirty = false;
+   desc->dirty_mask = 0;
desc->pointer_dirty = true;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
@@ -219,6 +219,8 @@ static void si_sampler_views_begin_new_cs(struct si_context 
*sctx,
si_sampler_view_add_buffer(sctx, views->views[i]->texture);
}
 
+   views->desc.ce_ram_dirty = true;
+
if (!views->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx, views->desc.buffer,
@@ -270,7 +272,7 @@ static void si_set_sampler_view(struct si_context *sctx,
views->desc.enabled_mask &= ~(1llu << slot);
}
 
-   views->desc.list_dirty = true;
+   views->desc.dirty_mask |= 1llu << slot;
 }
 
 static bool is_compressed_colortex(struct r600_texture *rtex)
@@ -376,6 +378,8 @@ si_image_views_begin_new_cs(struct si_context *sctx, struct 
si_images_info *imag
si_sampler_view_add_buffer(sctx, view->resource);
}
 
+   images->desc.ce_ram_dirty = true;
+
if (images->desc.buffer) {
radeon_add_to_buffer_list(>b, >b.gfx,
  images->desc.buffer,
@@ -393,7 +397,7 @@ si_disable_shader_image(struct si_images_info *images, 
unsigned slot)
 
memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
images->desc.enabled_mask &= ~(1llu << slot);
-   images->desc.list_dirty = true;
+   images->desc.dirty_mask |= 1llu << slot;
}
 }
 
@@ -474,7 +478,7 @@ si_set_shader_images(struct pipe_context *pipe, unsigned 
shader,
}
 
images->desc.enabled_mask |= 1llu << slot;
-   images->desc.list_dirty = true;
+   images->desc.dirty_mask |= 1llu << slot;
}
 }
 
@@ -532,7 +536,7 @@ static void si_bind_sampler_states(struct pipe_context 
*ctx, unsigned shader,
continue;
 
memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
-   desc->list_dirty = true;
+   desc->dirty_mask |= 1llu << slot;
}
 }
 
@@ -579,6 +583,8 @@ static void si_buffer_resources_begin_new_cs(struct 
si_context *sctx,
  buffers->shader_usage, buffers->priority);
}
 
+   buffers->desc.ce_ram_dirty = true;
+
if (!buffers->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx,
@@ -775,7 +781,7 @@ static void si_set_constant_buffer(struct pipe_context 
*ctx, uint shader, uint s
buffers->desc.enabled_mask &= ~(1llu << slot);
}
 
-   buffers->desc.list_dirty = true;
+   buffers->desc.dirty_mask |= 1llu << slot;
 }
 
 /* SHADER BUFFERS */
@@ -822,9 +828,9 @@ static void si_set_shader_buffers(struct pipe_context *ctx, 
unsigned shader,
radeon_add_to_buffer_list(>b, >b.gfx, buf,
  buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu &l

[Mesa-dev] [PATCH v3 07/12] radeonsi: Add CE synchronization.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.h   |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 24 
 2 files changed, 25 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index b8db3b2..b3f5ed5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -193,6 +193,7 @@ struct si_context {
struct si_screen*screen;
struct radeon_winsys_cs *ce_ib;
struct radeon_winsys_cs *ce_preamble_ib;
+   boolce_need_synchronization;
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index d31f77f..f5ea359 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -757,6 +757,25 @@ static void si_get_draw_start_count(struct si_context 
*sctx,
}
 }
 
+static void si_ce_pre_draw_synchronization(struct si_context *sctx) {
+   if (sctx->ce_need_synchronization) {
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
+   radeon_emit(sctx->ce_ib, 1);
+
+   radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 
0));
+   radeon_emit(sctx->b.gfx.cs, 1);
+   }
+}
+
+static void si_ce_post_draw_synchronization(struct si_context *sctx) {
+   if (sctx->ce_need_synchronization) {
+   radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 
0));
+   radeon_emit(sctx->b.gfx.cs, 0);
+
+   sctx->ce_need_synchronization = false;
+   }
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
struct si_context *sctx = (struct si_context *)ctx;
@@ -886,8 +905,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
si_emit_scratch_reloc(sctx);
si_emit_rasterizer_prim_state(sctx);
si_emit_draw_registers(sctx, info);
+
+   si_ce_pre_draw_synchronization(sctx);
+
si_emit_draw_packets(sctx, info, );
 
+   si_ce_post_draw_synchronization(sctx);
+
if (sctx->trace_buf)
si_trace_emit(sctx);
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 08/12] radeonsi: Allocate chunks of CE ram.

2016-04-19 Thread Bas Nieuwenhuizen
v2: Use 32 byte alignment.

v3: Don't allocate CE space for vertex buffer descriptors.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 33 +++
 src/gallium/drivers/radeonsi/si_state.h   |  3 +++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 7fc1461..c41923d 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -98,7 +98,8 @@ static void si_init_descriptors(struct si_descriptors *desc,
unsigned shader_userdata_index,
unsigned element_dw_size,
unsigned num_elements,
-   const uint32_t *null_descriptor)
+   const uint32_t *null_descriptor,
+   unsigned *ce_offset)
 {
int i;
 
@@ -110,6 +111,13 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
desc->list_dirty = true; /* upload the list before the next draw */
desc->shader_userdata_offset = shader_userdata_index * 4;
 
+   if (ce_offset) {
+   desc->ce_offset = *ce_offset;
+
+   /* make sure that ce_offset stays 32 byte aligned */
+   *ce_offset += align(element_dw_size * num_elements * 4, 32);
+   }
+
/* Initialize the array to NULL descriptors if the element size is 8. */
if (null_descriptor) {
assert(element_dw_size % 8 == 0);
@@ -511,14 +519,15 @@ static void si_init_buffer_resources(struct 
si_buffer_resources *buffers,
 unsigned num_buffers,
 unsigned shader_userdata_index,
 enum radeon_bo_usage shader_usage,
-enum radeon_bo_priority priority)
+enum radeon_bo_priority priority,
+unsigned *ce_offset)
 {
buffers->shader_usage = shader_usage;
buffers->priority = priority;
buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
si_init_descriptors(>desc, shader_userdata_index, 4,
-   num_buffers, NULL);
+   num_buffers, NULL, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -1326,29 +1335,35 @@ void si_emit_shader_userdata(struct si_context *sctx, 
struct r600_atom *atom)
 void si_init_all_descriptors(struct si_context *sctx)
 {
int i;
+   unsigned ce_offset = 0;
 
for (i = 0; i < SI_NUM_SHADERS; i++) {
si_init_buffer_resources(>const_buffers[i],
 SI_NUM_CONST_BUFFERS, 
SI_SGPR_CONST_BUFFERS,
-RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER);
+RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER,
+_offset);
si_init_buffer_resources(>rw_buffers[i],
 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT,
+_offset);
si_init_buffer_resources(>shader_buffers[i],
 SI_NUM_SHADER_BUFFERS, 
SI_SGPR_SHADER_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER,
+_offset);
 
si_init_descriptors(>samplers[i].views.desc,
SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
-   null_texture_descriptor);
+   null_texture_descriptor, _offset);
 
si_init_descriptors(>images[i].desc,
SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
-   null_image_descriptor);
+   null_image_descriptor, _offset);
}
 
si_init_descriptors(>vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-   4, SI_NUM_VERTEX_BUFFERS, NULL);
+   4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);
+
+   assert(ce_offset <= 32768);
 
/* Set pipe_context functions. */
sctx->b.b.bind_sampler_states = si_bind_sampler_states;
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeo

[Mesa-dev] [PATCH v3 03/12] winsys/amdgpu: Add support for const IB.

2016-04-19 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

v2: Use the correct IB to update request (Bas Nieuwenhuizen)
v3: Add preamble IB. (Bas Nieuwenhuizen)
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeon/radeon_winsys.h | 30 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c  | 88 --
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h  | 11 +++-
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index aa94df6..451d8a4 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -603,6 +603,36 @@ struct radeon_winsys {
   void *flush_ctx);
 
 /**
+ * Add a constant engine IB to a graphics CS. This makes the graphics CS
+ * from "cs_create" a group of two IBs that share a buffer list and are
+ * flushed together.
+ *
+ * The returned constant CS is only a stream for writing packets to the new
+ * IB. Calling other winsys functions with it is not allowed, not even
+ * "cs_destroy".
+ *
+ * In order to add buffers and check memory usage, use the graphics CS.
+ * In order to flush it, use the graphics CS, which will flush both IBs.
+ * Destroying the graphics CS will destroy both of them.
+ *
+ * \param cs  The graphics CS from "cs_create" that will hold the buffer
+ *list and will be used for flushing.
+ */
+struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs);
+
+ /**
+ * Add a constant engine preamble IB to a graphics CS. This adds an extra IB
+ * in a similar manner to cs_add_const_ib. This should always be called after
+ * cs_add_const_ib.
+ *
+ * The returned IB is a constant engine IB that only gets flushed if the
+ * context changed.
+ *
+ * \param cs  The graphics CS from "cs_create" that will hold the buffer
+ *list and will be used for flushing.
+ */
+struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct 
radeon_winsys_cs *cs);
+/**
  * Destroy a command stream.
  *
  * \param csA command stream to destroy.
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b0fe8b9..0182660 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -350,19 +350,62 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib)) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
}
 
cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
+   cs->request.ibs = >ib[IB_MAIN];
 
p_atomic_inc(>ws->num_cs);
return >main.base;
 }
 
+/* Attach a constant engine IB to a GFX command stream (presumably the amdgpu
+ * implementation of radeon_winsys::cs_add_const_ib).
+ *
+ * Returns the stream for writing CE packets, or NULL if the CS is not on the
+ * GFX ring, a const IB is already mapped, or no IB could be obtained.
+ */
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   /* only one const IB can be added */
+   if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
+  return NULL;
+
+   if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST]))
+  return NULL;
+
+   /* The request now covers two IBs and starts at the const IB entry. */
+   cs->request.number_of_ibs = 2;
+   cs->request.ibs = &cs->ib[IB_CONST];
+   cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
+
+   return &cs->const_ib.base;
+}
+
+/* Attach a constant engine preamble IB to a GFX command stream (presumably
+ * the amdgpu implementation of radeon_winsys::cs_add_const_preamble_ib).
+ *
+ * Returns the stream for writing preamble packets, or NULL if the CS is not
+ * on the GFX ring, the const IB has not been mapped yet, a preamble IB is
+ * already mapped, or no IB could be obtained.
+ */
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   /* only one const preamble IB can be added and only when the const IB has
+* also been mapped */
+   if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped ||
+   cs->const_preamble_ib.ib_mapped)
+  return NULL;
+
+   /* NOTE(review): this call passes a fourth argument while the sibling
+* amdgpu_cs_add_const_ib passes three - confirm against the
+* amdgpu_get_new_ib signature. */
+   if (!amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
+ &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
+  return NULL;
+
+   /* The request now covers three IBs and starts at the preamble entry. */
+   cs->request.number_of_ibs = 3;
+   cs->request.ibs = &cs->ib[IB_CONST_PREAMBLE];
+   cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | AMDGPU_IB_FLAG_PREAMBLE;
+
+   return &cs->const_preamble_ib.base;
+}
+
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
 int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
@@ -621,6 +664,15 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
   /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
   while (rcs->cdw & 7)
  OUT_CS(rcs, 0x1000); /* type3 nop packet */
+
+  /* Also pad the const IB. */
+  if (cs->const_ib.ib_mapped)
+ while (!cs->const_ib.base.cdw || (cs->c

[Mesa-dev] [PATCH v3 02/12] winsys/amdgpu: split IB data into a new structure in preparation for CE

2016-04-19 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  5 ---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  6 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 68 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 16 
 4 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 1b2793a..036301e 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -38,11 +38,6 @@
 #include 
 #include 
 
-static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
-{
-   return (struct amdgpu_winsys_bo *)bo;
-}
-
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 54f5dbd..69ada10 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -69,6 +69,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
 static inline
+struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
+{
+   return (struct amdgpu_winsys_bo *)bo;
+}
+
+static inline
 void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
 struct amdgpu_winsys_bo *src)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 63c72fc..b0fe8b9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -198,7 +198,8 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 
 /* COMMAND SUBMISSION */
 
-static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
+  struct amdgpu_cs_ib_info *info)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
@@ -207,39 +208,36 @@ static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
const unsigned buffer_size = 128 * 1024 * 4;
const unsigned ib_size = 20 * 1024 * 4;
 
-   cs->base.cdw = 0;
-   cs->base.buf = NULL;
+   ib->base.cdw = 0;
+   ib->base.buf = NULL;
 
/* Allocate a new buffer for IBs if the current buffer is all used. */
-   if (!cs->big_ib_buffer ||
-   cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
-  struct radeon_winsys *ws = >ctx->ws->base;
+   if (!ib->big_ib_buffer ||
+   ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
 
-  pb_reference(>big_ib_buffer, NULL);
-  cs->big_ib_winsys_buffer = NULL;
-  cs->ib_mapped = NULL;
-  cs->used_ib_space = 0;
+  pb_reference(>big_ib_buffer, NULL);
+  ib->ib_mapped = NULL;
+  ib->used_ib_space = 0;
 
-  cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
+  ib->big_ib_buffer = ws->buffer_create(ws, buffer_size,
 4096, true,
 RADEON_DOMAIN_GTT,
 RADEON_FLAG_CPU_ACCESS);
-  if (!cs->big_ib_buffer)
+  if (!ib->big_ib_buffer)
  return false;
 
-  cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL,
+  ib->ib_mapped = ws->buffer_map(ib->big_ib_buffer, NULL,
  PIPE_TRANSFER_WRITE);
-  if (!cs->ib_mapped) {
- pb_reference(>big_ib_buffer, NULL);
+  if (!ib->ib_mapped) {
+ pb_reference(>big_ib_buffer, NULL);
  return false;
   }
-
-  cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer;
}
 
-   cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
-   cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
-   cs->base.max_dw = ib_size / 4;
+   info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
+ ib->used_ib_space;
+   ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+   ib->base.max_dw = ib_size / 4;
return true;
 }
 
@@ -271,9 +269,6 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
   break;
}
 
-   cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
-
cs->max_num_buffers = 512;
cs->buffers = (struct amdgpu_cs_buffer*)
   CALLOC(1, cs->max_num_buffers * sizeof(struct 
amdgpu_cs_buffer));
@@ -355,14 +350,17 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib

[Mesa-dev] [PATCH v3 06/12] radeonsi: Add CE packet definitions.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/sid.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/sid.h 
b/src/gallium/drivers/radeonsi/sid.h
index 11d6090..516e114 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -146,6 +146,12 @@
 #define PKT3_SET_SH_REG0x76
 #define PKT3_SET_SH_REG_OFFSET 0x77
 #define PKT3_SET_UCONFIG_REG   0x79 /* new for CIK */
+#define PKT3_LOAD_CONST_RAM0x80
+#define PKT3_WRITE_CONST_RAM   0x81
+#define PKT3_DUMP_CONST_RAM0x83
+#define PKT3_INCREMENT_CE_COUNTER  0x84
+#define PKT3_INCREMENT_DE_COUNTER  0x85
+#define PKT3_WAIT_ON_CE_COUNTER0x86
 
 #define PKT_TYPE_S(x)   (((x) & 0x3) << 30)
 #define PKT_TYPE_G(x)   (((x) >> 30) & 0x3)
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 04/12] winsys/amdgpu: Enlarge const IB size.

2016-04-19 Thread Bas Nieuwenhuizen
Necessary to prevent performance regressions due to extra flushing.

Probably should enlarge it even further when also updating
uniforms through the CE, but this seems large enough for now.

v2: Add preamble IB.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0182660..69902c4 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -199,14 +199,26 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 /* COMMAND SUBMISSION */
 
 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
-  struct amdgpu_cs_ib_info *info)
+  struct amdgpu_cs_ib_info *info, unsigned ib_type)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
 *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
 */
-   const unsigned buffer_size = 128 * 1024 * 4;
-   const unsigned ib_size = 20 * 1024 * 4;
+   unsigned buffer_size, ib_size;
+
+   switch (ib_type) {
+   case IB_CONST_PREAMBLE:
+  buffer_size = 4 * 1024 * 4;
+  ib_size = 1024 * 4;
+   case IB_CONST:
+  buffer_size = 512 * 1024 * 4;
+  ib_size = 128 * 1024 * 4;
+  break;
+   case IB_MAIN:
+  buffer_size = 128 * 1024 * 4;
+  ib_size = 20 * 1024 * 4;
+   }
 
ib->base.cdw = 0;
ib->base.buf = NULL;
@@ -350,7 +362,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN], 
IB_MAIN)) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
@@ -373,7 +385,7 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
   return NULL;
 
-   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]))
+   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], 
IB_CONST))
   return NULL;
 
cs->request.number_of_ibs = 2;
@@ -760,12 +772,12 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
 cleanup:
amdgpu_cs_context_cleanup(cs);
 
-   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN]);
+   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN], IB_MAIN);
if (cs->const_ib.ib_mapped)
-  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]);
+  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], IB_CONST);
if (cs->const_preamble_ib.ib_mapped)
   amdgpu_get_new_ib(>base, >const_preamble_ib,
- >ib[IB_CONST_PREAMBLE]);
+ >ib[IB_CONST_PREAMBLE], 
IB_CONST_PREAMBLE);
 
ws->num_cs_flushes++;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 00/12] Constant engine for radeonsi

2016-04-19 Thread Bas Nieuwenhuizen
Changes from v2:
  - Remains of vertex buffer descriptor support have been removed. Both
wrt the space calculation and allocating CE ram.

  - Failing to create a preamble IB now results in failure.

  - Misc style fixes in patch 5 and 12.

- Bas

Bas Nieuwenhuizen (9):
  winsys/amdgpu: Enlarge const IB size.
  radeonsi: Create CE IB.
  radeonsi: Add CE packet definitions.
  radeonsi: Add CE synchronization.
  radeonsi: Allocate chunks of CE ram.
  radeonsi: Add CE uploader.
  radeonsi: Replace list_dirty with a mask.
  gallium/util: Add u_bit_scan_consecutive_range64.
  radeonsi: Use CE for all descriptors.

Marek Olšák (3):
  gallium/radeon: move ring_type into winsyses
  winsys/amdgpu: split IB data into a new structure in preparation for
CE
  winsys/amdgpu: Add support for const IB.

 src/gallium/auxiliary/util/u_math.h   |  14 +++
 src/gallium/drivers/radeon/r600_pipe_common.c |   1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |   1 +
 src/gallium/drivers/radeon/radeon_winsys.h|  31 -
 src/gallium/drivers/radeonsi/si_descriptors.c | 165 +++-
 src/gallium/drivers/radeonsi/si_hw_context.c  |  35 +-
 src/gallium/drivers/radeonsi/si_pipe.c|  26 
 src/gallium/drivers/radeonsi/si_pipe.h|   7 ++
 src/gallium/drivers/radeonsi/si_state.h   |  12 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  24 
 src/gallium/drivers/radeonsi/sid.h|   6 +
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |   5 -
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |   6 +
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 172 --
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  28 +++--
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   1 +
 17 files changed, 448 insertions(+), 96 deletions(-)

-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 12/12] radeonsi: Use CE for all descriptors.

2016-04-19 Thread Bas Nieuwenhuizen
v2: Load previous list for new CS instead of re-emitting
all descriptors.

v3: Do radeon_add_to_buffer_list in si_ce_upload.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 74 +++
 1 file changed, 64 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index a5018db..ecb72de 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -60,6 +60,7 @@
 #include "si_shader.h"
 #include "sid.h"
 
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
@@ -151,33 +152,86 @@ static bool si_ce_upload(struct si_context *sctx, 
unsigned ce_offset, unsigned s
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 32);
 
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
+
sctx->ce_need_synchronization = true;
return true;
 }
 
+static void si_reinitialize_ce_ram(struct si_context *sctx,
+struct si_descriptors *desc)
+{
+   if (desc->buffer) {
+   struct r600_resource *buffer = (struct 
r600_resource*)desc->buffer;
+   unsigned list_size = desc->num_elements * desc->element_dw_size 
* 4;
+   uint64_t va = buffer->gpu_address + desc->buffer_offset;
+   struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+
+   if (!ib)
+   ib = sctx->ce_ib;
+
+   list_size = align(list_size, 32);
+
+   radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+   radeon_emit(ib, va);
+   radeon_emit(ib, va >> 32);
+   radeon_emit(ib, list_size / 4);
+   radeon_emit(ib, desc->ce_offset);
+
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   }
+   desc->ce_ram_dirty = false;
+}
 
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-   void *ptr;
 
if (!desc->dirty_mask)
return true;
 
-   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
-  >buffer_offset,
-  (struct pipe_resource**)>buffer, );
-   if (!desc->buffer)
-   return false; /* skip the draw call */
+   if (sctx->ce_ib) {
+   uint32_t const* list = (uint32_t const*)desc->list;
 
-   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   if (desc->ce_ram_dirty)
+   si_reinitialize_ce_ram(sctx, desc);
 
-   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   while(desc->dirty_mask) {
+   int begin, count;
+   u_bit_scan_consecutive_range64(>dirty_mask, 
,
+  );
 
-   desc->dirty_mask = 0;
+   begin *= desc->element_dw_size;
+   count *= desc->element_dw_size;
+
+   radeon_emit(sctx->ce_ib,
+   PKT3(PKT3_WRITE_CONST_RAM, count, 0));
+   radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
+   radeon_emit_array(sctx->ce_ib, list + begin, count);
+   }
+
+   if (!si_ce_upload(sctx, desc->ce_offset, list_size,
+  >buffer_offset, >buffer))
+   return false;
+   } else {
+   void *ptr;
+
+   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
+   >buffer_offset,
+   (struct pipe_resource**)>buffer, );
+   if (!desc->buffer)
+   return false; /* skip the draw call */
+
+   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   }
desc->pointer_dirty = true;
+   desc->dirty_mask = 0;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 11/12] gallium/util: Add u_bit_scan_consecutive_range64.

2016-04-19 Thread Bas Nieuwenhuizen
For use by radeonsi.

v2: Make sure that it works for all 64 bits set.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/auxiliary/util/u_math.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_math.h 
b/src/gallium/auxiliary/util/u_math.h
index d983af3..10f158b 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -529,6 +529,20 @@ u_bit_scan_consecutive_range(unsigned *mask, int *start, 
int *count)
*mask &= ~(((1u << *count) - 1) << *start);
 }
 
+static inline void
+u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
+{
+   if (*mask == ~0llu) {
+  *start = 0;
+  *count = 64;
+  *mask = 0;
+  return;
+   }
+   *start = ffsll(*mask) - 1;
+   *count = ffsll(~(*mask >> *start)) - 1;
+   *mask &= ~(((1llu << *count) - 1) << *start);
+}
+
 /**
  * Return float bits.
  */
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 12/12] radeonsi: Use CE for all descriptors.

2016-04-19 Thread Bas Nieuwenhuizen
v2: Load previous list for new CS instead of re-emitting
all descriptors.

v3: Do radeon_add_to_buffer_list in si_ce_upload.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
---

Forgot to save the file before amending

 src/gallium/drivers/radeonsi/si_descriptors.c | 74 +++
 1 file changed, 64 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index a5018db..944c498 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -60,6 +60,7 @@
 #include "si_shader.h"
 #include "sid.h"
 
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
@@ -151,33 +152,86 @@ static bool si_ce_upload(struct si_context *sctx, 
unsigned ce_offset, unsigned s
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 32);
 
+   radeon_add_to_buffer_list(>b, >b.gfx, *out_buf,
+  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
+
sctx->ce_need_synchronization = true;
return true;
 }
 
+static void si_reinitialize_ce_ram(struct si_context *sctx,
+struct si_descriptors *desc)
+{
+   if (desc->buffer) {
+   struct r600_resource *buffer = (struct 
r600_resource*)desc->buffer;
+   unsigned list_size = desc->num_elements * desc->element_dw_size 
* 4;
+   uint64_t va = buffer->gpu_address + desc->buffer_offset;
+   struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+
+   if (!ib)
+   ib = sctx->ce_ib;
+
+   list_size = align(list_size, 32);
+
+   radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+   radeon_emit(ib, va);
+   radeon_emit(ib, va >> 32);
+   radeon_emit(ib, list_size / 4);
+   radeon_emit(ib, desc->ce_offset);
+
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   }
+   desc->ce_ram_dirty = false;
+}
 
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-   void *ptr;
 
if (!desc->dirty_mask)
return true;
 
-   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
-  >buffer_offset,
-  (struct pipe_resource**)>buffer, );
-   if (!desc->buffer)
-   return false; /* skip the draw call */
+   if (sctx->ce_ib) {
+   uint32_t const* list = (uint32_t const*)desc->list;
 
-   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   if (desc->ce_ram_dirty)
+   si_reinitialize_ce_ram(sctx, desc);
 
-   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   while(desc->dirty_mask) {
+   int begin, count;
+   u_bit_scan_consecutive_range64(>dirty_mask, 
,
+  );
 
-   desc->dirty_mask = 0;
+   begin *= desc->element_dw_size;
+   count *= desc->element_dw_size;
+
+   radeon_emit(sctx->ce_ib,
+   PKT3(PKT3_WRITE_CONST_RAM, count, 0));
+   radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
+   radeon_emit_array(sctx->ce_ib, list + begin, count);
+   }
+
+   if (!si_ce_upload(sctx, desc->ce_offset, list_size,
+  >buffer_offset, >buffer))
+   return false;
+   } else {
+   void *ptr;
+
+   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
+   >buffer_offset,
+   (struct pipe_resource**)>buffer, );
+   if (!desc->buffer)
+   return false; /* skip the draw call */
+
+   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   }
desc->pointer_dirty = true;
+   desc->dirty_mask = 0;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 0/2] Remainder radeonsi compute patches.

2016-04-18 Thread Bas Nieuwenhuizen
I added some CS_PARTIAL_FLUSH events after Marek's response. I haven't been
able to detect anything wrong without them. However, at least theoretically,
some event has to wait on CS shaders at the new points (e.g. an FBO change
clearly has a potential write-after-read hazard otherwise).

I also updated the patch that enables the cap, as I discovered that writing
the USER_DATA registers from a COPY_DATA packet was disallowed by the kernel
with the SI CS checker.

Now that this has been fixed in the kernel, the new patch checks for the DRM
version that has the fix.

Bas Nieuwenhuizen (2):
  radeonsi: do not do two full flushes on every compute dispatch
  radeonsi: enable TGSI support cap for compute shaders

 docs/GL3.txt  |  4 ++--
 docs/relnotes/11.3.0.html |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.c | 21 -
 src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
 src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
 src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
 src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c| 15 +--
 src/gallium/drivers/radeonsi/si_state.c   | 12 
 9 files changed, 49 insertions(+), 31 deletions(-)

-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 2/2] radeonsi: enable TGSI support cap for compute shaders

2016-04-18 Thread Bas Nieuwenhuizen
v2: Use chip_class instead of family.

v3: Check kernel version for SI.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 docs/GL3.txt  |  4 ++--
 docs/relnotes/11.3.0.html |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.c | 21 -
 src/gallium/drivers/radeonsi/si_pipe.c| 15 +--
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 3febd6e..6214f8d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -167,7 +167,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
   GL_ARB_ES3_compatibility  DONE (all drivers that 
support GLSL 3.30)
   GL_ARB_clear_buffer_objectDONE (all drivers)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_copy_image DONE (i965, nv50, 
nvc0, r600, radeonsi)
   GL_KHR_debug  DONE (all drivers)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
@@ -225,7 +225,7 @@ GL 4.5, GLSL 4.50:
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_draw_indirect  DONE (i965, nvc0, 
r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
   GL_ARB_framebuffer_no_attachments DONE (i965, nvc0, 
r600, radeonsi, softpipe)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 0f9aed8..5a7083c 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with 
certain drivers.
 
 
 OpenGL 4.2 on radeonsi
+GL_ARB_compute_shader on radeonsi
 GL_ARB_framebuffer_no_attachments on nvc0, r600, radeonsi, softpipe
 GL_ARB_internalformat_query2 on all drivers
 GL_ARB_robust_buffer_access_behavior on radeonsi
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a7477ab..64da62f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -645,23 +645,34 @@ static int r600_get_compute_param(struct pipe_screen 
*screen,
uint64_t *grid_size = ret;
grid_size[0] = 65535;
grid_size[1] = 65535;
-   grid_size[2] = 1;
+   grid_size[2] = 65535;
}
return 3 * sizeof(uint64_t) ;
 
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
if (ret) {
uint64_t *block_size = ret;
-   block_size[0] = 256;
-   block_size[1] = 256;
-   block_size[2] = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI) {
+   block_size[0] = 2048;
+   block_size[1] = 2048;
+   block_size[2] = 2048;
+   } else {
+   block_size[0] = 256;
+   block_size[1] = 256;
+   block_size[2] = 256;
+   }
}
return 3 * sizeof(uint64_t);
 
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
if (ret) {
uint64_t *max_threads_per_block = ret;
-   *max_threads_per_block = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI)
+   *max_threads_per_block = 2048;
+   else
+   *max_threads_per_block = 256;
}
return sizeof(uint64_t);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index f22cd03..7501a8f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -447,6 +447,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
 
 static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, 
enum pipe_shader_cap param)
 {
+   struct si_screen *sscreen = (struct si_screen *)

[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

2016-04-18 Thread Bas Nieuwenhuizen
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place with waits on finishing for pixel shaders
also has a write after read hazard with compute shaders.

Invalidating L2 waits implicitly on pixel and compute shaders,
so, we don't need a CS_PARTIAL_FLUSH for switching FBO.

v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.

According to Marek the INV_GLOBAL_L2 events don't wait for compute
shaders to finish, so wait for them explicitly.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
 src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
 src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
 src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
 src/gallium/drivers/radeonsi/si_state.c   | 12 
 5 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 10b88b3..6803334 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -439,13 +439,8 @@ static void si_launch_grid(
if (!sctx->cs_shader_state.initialized)
si_initialize_compute(sctx);
 
-   sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLUSH_WITH_INV_L2 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
+   if (sctx->b.flags)
+   si_emit_cache_flush(sctx, NULL);
 
if (!si_switch_compute_shader(sctx, program, >shader, 
info->pc))
return;
@@ -478,14 +473,6 @@ static void si_launch_grid(
si_setup_tgsi_grid(sctx, info);
 
si_emit_dispatch_packets(sctx, info);
-
-   sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
 }
 
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, 
struct pipe_resource *dst,
uint64_t va = r600_resource(dst)->gpu_address + offset;
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
}
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
/* This is the main part doing the copying. Src is always aligned. */
main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5b65fae..98ad3a7 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context 
*ctx,
 * start writing to the targets.
 */
if (num_targets)
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH;
 
/* Streamout buffers must be bound in 2 places:
 * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index 9862f07..b179092e 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
SI_CONTEXT_INV_VMEM_L1 |
SI_CONTEXT_INV_GLOBAL_L2 |
+   SI_CONTEXT_CS_PARTIAL_FLUSH |
/* this is probably not needed anymore */
SI_CONTEXT_PS_PARTIAL_FLUSH;
si_emit_cache_flush(ctx, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index af9ffdd..3

Re: [Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

2016-04-19 Thread Bas Nieuwenhuizen
Write by ssbo/image is completely synchronized by glMemoryBarrier, not
by the driver. Even for the write after read hazard, the user needs to
put a barrier before the write. This includes all writes by compute.

For transform feedback we apply the same rules as for graphics:
reading from the feedback buffer might go wrong if transform feedback
is still active; otherwise the driver synchronizes. I can't really
find anything in the GL spec that disallows the loop though.

For write via framebuffer, writing to a framebuffer and reading it
from a CS is a render feedback loop, both for write after read and
read after write. The texture regions therefore need to be disjoint or
a glTextureBarrier() needs to be used by the user. I am not sure if
the glTextureBarrier should be enough to prevent a write after read
hazard, as the spec only talks about it solving the read after write
hazard. However, in my current patch it solves both, and I think the
user needs some form of synchronization there, probably rebinding the
framebuffer. Furthermore, we have this issue too without compute
shaders.

- Bas

On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <mar...@gmail.com> wrote:
> There can be read-after-write hazards when transitioning from compute
> to graphics and vice versa. Is the user expected to call
> glMemoryBarrier in this case or do we need to synchronize explicitly
> in the driver?
>
> Marek
>
> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
> <b...@basnieuwenhuizen.nl> wrote:
>> v2: Add more CS_PARTIAL_FLUSH events.
>>
>> Essentially every place with waits on finishing for pixel shaders
>> also has a write after read hazard with compute shaders.
>>
>> Invalidating L2 waits implicitly on pixel and compute shaders,
>> so, we don't need a CS_PARTIAL_FLUSH for switching FBO.
>>
>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>
>> According to Marek the INV_GLOBAL_L2 events don't wait for compute
>> shaders to finish, so wait for them explicitly.
>>
>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
>>  src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
>>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>>  src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
>>  src/gallium/drivers/radeonsi/si_state.c   | 12 
>>  5 files changed, 17 insertions(+), 22 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 10b88b3..6803334 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>> if (!sctx->cs_shader_state.initialized)
>> si_initialize_compute(sctx);
>>
>> -   sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>> -SI_CONTEXT_INV_GLOBAL_L2 |
>> -SI_CONTEXT_INV_ICACHE |
>> -SI_CONTEXT_INV_SMEM_L1 |
>> -SI_CONTEXT_FLUSH_WITH_INV_L2 |
>> -SI_CONTEXT_FLAG_COMPUTE;
>> -   si_emit_cache_flush(sctx, NULL);
>> +   if (sctx->b.flags)
>> +   si_emit_cache_flush(sctx, NULL);
>>
>> if (!si_switch_compute_shader(sctx, program, >shader, 
>> info->pc))
>> return;
>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>> si_setup_tgsi_grid(sctx, info);
>>
>> si_emit_dispatch_packets(sctx, info);
>> -
>> -   sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>> -SI_CONTEXT_INV_VMEM_L1 |
>> -SI_CONTEXT_INV_GLOBAL_L2 |
>> -SI_CONTEXT_INV_ICACHE |
>> -SI_CONTEXT_INV_SMEM_L1 |
>> -SI_CONTEXT_FLAG_COMPUTE;
>> -   si_emit_cache_flush(sctx, NULL);
>>  }
>>
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
>> b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> index 001ddd4..38e0ee6 100644
>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, 
>> struct pipe_resource *dst,
>> uint64_t va = r600_resource(dst)->gpu_address + offset;
>>
>> /* Flush the caches. */
>> -   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flu

[Mesa-dev] [PATCH 2/2] radeonsi: Consider input SGPR count for compute shader SGPR count.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 16 +++-
 src/gallium/drivers/radeonsi/si_shader.c  |  3 ++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 557e892..905c169 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -81,14 +81,20 @@ static void *si_create_compute_state(
 
program->shader.selector = 
 
-   if (si_compile_tgsi_shader(sscreen, sctx->tm, >shader,
-  true, >b.debug)) {
+   if (si_shader_create(sscreen, sctx->tm, >shader,
+>b.debug)) {
FREE(sel.tokens);
return NULL;
}
 
scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
 
+   shader->config.rsrc1 =
+  S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
+  S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
+  S_00B848_DX10_CLAMP(1) |
+  S_00B848_FLOAT_MODE(shader->config.float_mode);
+
shader->config.rsrc2 = S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
   S_00B84C_SCRATCH_EN(scratch_enabled) |
   S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
@@ -105,10 +111,10 @@ static void *si_create_compute_state(
radeon_elf_read(code, header->num_bytes, 
>shader.binary);
si_shader_binary_read_config(>shader.binary,
 >shader.config, 0);
+   si_shader_dump(sctx->screen, >shader, >b.debug,
+  PIPE_SHADER_COMPUTE, stderr);
+   si_shader_binary_upload(sctx->screen, >shader);
}
-   si_shader_dump(sctx->screen, >shader, >b.debug,
-  TGSI_PROCESSOR_COMPUTE, stderr);
-   si_shader_binary_upload(sctx->screen, >shader);
 
return program;
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 605b964..3bf68eb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7022,7 +7022,8 @@ int si_shader_create(struct si_screen *sscreen, 
LLVMTargetMachineRef tm,
 (shader->key.vs.as_es != mainp->key.vs.as_es ||
  shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
(shader->selector->type == PIPE_SHADER_TESS_EVAL &&
-shader->key.tes.as_es != mainp->key.tes.as_es)) {
+shader->key.tes.as_es != mainp->key.tes.as_es) ||
+   shader->selector->type == PIPE_SHADER_COMPUTE) {
/* Monolithic shader (compiled as a whole, has many variants,
 * may take a long time to compile).
 */
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] radeonsi: Add CE synchronization for compute dispatches.

2016-04-19 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c| 4 
 src/gallium/drivers/radeonsi/si_state.h  | 2 ++
 src/gallium/drivers/radeonsi/si_state_draw.c | 4 ++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 105cf8c..557e892 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -474,7 +474,11 @@ static void si_launch_grid(
if (program->ir_type == PIPE_SHADER_IR_TGSI)
si_setup_tgsi_grid(sctx, info);
 
+   si_ce_pre_draw_synchronization(sctx);
+
si_emit_dispatch_packets(sctx, info);
+
+   si_ce_post_draw_synchronization(sctx);
 }
 
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 3679532..c4b2b45 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -321,6 +321,8 @@ void si_destroy_shader_cache(struct si_screen *sscreen);
 
 /* si_state_draw.c */
 void si_emit_cache_flush(struct si_context *sctx, struct r600_atom *atom);
+void si_ce_pre_draw_synchronization(struct si_context *sctx);
+void si_ce_post_draw_synchronization(struct si_context *sctx);
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo);
 void si_trace_emit(struct si_context *sctx);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 15d58d4..b61c05a 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -750,7 +750,7 @@ static void si_get_draw_start_count(struct si_context *sctx,
}
 }
 
-static void si_ce_pre_draw_synchronization(struct si_context *sctx)
+void si_ce_pre_draw_synchronization(struct si_context *sctx)
 {
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
@@ -761,7 +761,7 @@ static void si_ce_pre_draw_synchronization(struct 
si_context *sctx)
}
 }
 
-static void si_ce_post_draw_synchronization(struct si_context *sctx)
+void si_ce_post_draw_synchronization(struct si_context *sctx)
 {
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 
0));
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 08/20] radeonsi: split input upload off from si_launch_grid

2016-04-13 Thread Bas Nieuwenhuizen
Also uses a dynamically allocated buffer using u_upload_alloc.
The old buffer per program approach required serializing all
dispatches of the same program.

v2: - Clarified commit message.
- Use radeon_set_sh_reg_seq.
- Also upload input buffer for clover kernels, even when
  input_size is 0, as it contains grid parameters.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 93 +--
 1 file changed, 52 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index b01b926..27f779a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -24,6 +24,7 @@
 
 #include "tgsi/tgsi_parse.h"
 #include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
 #include "radeon/r600_pipe_common.h"
 #include "radeon/radeon_elf_util.h"
 #include "radeon/radeon_llvm_util.h"
@@ -42,7 +43,6 @@ struct si_compute {
unsigned input_size;
struct si_shader shader;
 
-   struct r600_resource *input_buffer;
struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
 };
 
@@ -154,11 +154,6 @@ static void *si_create_compute_state(
   TGSI_PROCESSOR_COMPUTE, stderr);
si_shader_binary_upload(sctx->screen, &program->shader);
 
-   if (program->input_size) {
-   program->input_buffer = 
si_resource_create_custom(sctx->b.b.screen,
-   PIPE_USAGE_IMMUTABLE, program->input_size);
-   }
-
return program;
 }
 
@@ -233,19 +228,63 @@ static unsigned compute_num_waves_for_scratch(
return scratch_waves;
 }
 
-static void si_launch_grid(
-   struct pipe_context *ctx, const struct pipe_grid_info *info)
+static void si_upload_compute_input(struct si_context *sctx,
+  const struct pipe_grid_info *info)
 {
-   struct si_context *sctx = (struct si_context*)ctx;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_compute *program = sctx->cs_shader_state.program;
-   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-   struct r600_resource *input_buffer = program->input_buffer;
+   struct r600_resource *input_buffer = NULL;
unsigned kernel_args_size;
unsigned num_work_size_bytes = 36;
uint32_t kernel_args_offset = 0;
uint32_t *kernel_args;
+   void *kernel_args_ptr;
uint64_t kernel_args_va;
+   unsigned i;
+
+   /* The extra num_work_size_bytes are for work group / work item size information */
+   kernel_args_size = program->input_size + num_work_size_bytes;
+
+   u_upload_alloc(sctx->b.uploader, 0, kernel_args_size, 256,
+  &kernel_args_offset,
+  (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
+
+   kernel_args = (uint32_t*)kernel_args_ptr;
+   for (i = 0; i < 3; i++) {
+   kernel_args[i] = info->grid[i];
+   kernel_args[i + 3] = info->grid[i] * info->block[i];
+   kernel_args[i + 6] = info->block[i];
+   }
+
+   memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
+  program->input_size);
+
+
+   for (i = 0; i < (kernel_args_size / 4); i++) {
+   COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
+   kernel_args[i]);
+   }
+
+   kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
+
+   radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
+ RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+
+   radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+   radeon_emit(cs, kernel_args_va);
+   radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
+   S_008F04_STRIDE(0));
+
+   pipe_resource_reference((struct pipe_resource**)&input_buffer, NULL);
+}
+
+static void si_launch_grid(
+   struct pipe_context *ctx, const struct pipe_grid_info *info)
+{
+   struct si_context *sctx = (struct si_context*)ctx;
+   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+   struct si_compute *program = sctx->cs_shader_state.program;
+   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
uint64_t scratch_buffer_va = 0;
uint64_t shader_va;
unsigned i;
@@ -270,25 +309,12 @@ static void si_launch_grid(
/* Read the config information */
si_shader_binary_read_config(>binary, >config, 
info->pc);
 
-   /* Upload the kernel arguments */
-
-   /* The extra num_work_size_bytes are for work group / work item size 
information */
-   kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For 
scratch va */;
-
-   kernel_args = sctx-&

[Mesa-dev] [PATCH v2 16/20] radeonsi: split setting graphics and compute descriptors

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  3 ++
 src/gallium/drivers/radeonsi/si_descriptors.c | 61 ++-
 src/gallium/drivers/radeonsi/si_state.h   |  7 ++-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  2 +-
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index dee0b3a..10b88b3 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -450,6 +450,9 @@ static void si_launch_grid(
if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
return;
 
+   si_upload_compute_shader_descriptors(sctx);
+   si_emit_compute_shader_userdata(sctx);
+
if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
sctx->atoms.s.render_cond->emit(&sctx->b,
sctx->atoms.s.render_cond);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index b5557d8..a2c096f 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -126,7 +126,8 @@ static void si_release_descriptors(struct si_descriptors 
*desc)
 }
 
 static bool si_upload_descriptors(struct si_context *sctx,
- struct si_descriptors *desc)
+ struct si_descriptors *desc,
+ struct r600_atom * atom)
 {
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
void *ptr;
@@ -147,7 +148,10 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
desc->list_dirty = false;
desc->pointer_dirty = true;
-   si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
+
+   if (atom)
+   si_mark_atom_dirty(sctx, atom);
+
return true;
 }
 
@@ -1278,7 +1282,8 @@ static void si_emit_shader_pointer(struct si_context 
*sctx,
desc->pointer_dirty = keep_dirty;
 }
 
-void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
+void si_emit_graphics_shader_userdata(struct si_context *sctx,
+  struct r600_atom *atom)
 {
unsigned i;
uint32_t *sh_base = sctx->shader_userdata.sh_base;
@@ -1302,7 +1307,7 @@ void si_emit_shader_userdata(struct si_context *sctx, 
struct r600_atom *atom)
   R_00B130_SPI_SHADER_USER_DATA_VS_0, 
true);
}
 
-   for (i = 0; i < SI_NUM_SHADERS; i++) {
+   for (i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
unsigned base = sh_base[i];
 
if (!base)
@@ -1319,6 +1324,20 @@ void si_emit_shader_userdata(struct si_context *sctx, 
struct r600_atom *atom)
si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
 
+void si_emit_compute_shader_userdata(struct si_context *sctx)
+{
+   unsigned base = R_00B900_COMPUTE_USER_DATA_0;
+
+   si_emit_shader_pointer(sctx, &sctx->const_buffers[PIPE_SHADER_COMPUTE].desc,
+  base, false);
+   si_emit_shader_pointer(sctx, &sctx->shader_buffers[PIPE_SHADER_COMPUTE].desc,
+  base, false);
+   si_emit_shader_pointer(sctx, &sctx->samplers[PIPE_SHADER_COMPUTE].views.desc,
+  base, false);
+   si_emit_shader_pointer(sctx, &sctx->images[PIPE_SHADER_COMPUTE].desc,
+  base, false);
+}
+
 /* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
@@ -1359,7 +1378,7 @@ void si_init_all_descriptors(struct si_context *sctx)
 
/* Shader user data. */
si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
-si_emit_shader_userdata);
+si_emit_graphics_shader_userdata);
 
/* Set default and immutable mappings. */
si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, 
R_00B130_SPI_SHADER_USER_DATA_VS_0);
@@ -1368,21 +1387,41 @@ void si_init_all_descriptors(struct si_context *sctx)
si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, 
R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
-bool si_upload_shader_descriptors(struct si_context *sctx)
+bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
 {
int i;
 
for (i = 0; i < SI_NUM_SHADERS; i++) {
-   if (!si_upload_descriptors(sctx, >const_buffers[i].desc) 
||
-   !si_upload_descriptors(sctx, >rw_buffers[i].desc) ||
-   !si_upload_descriptors(sctx, >shader_buffers[i].desc) 
||
-   !si_upload_descriptors(

[Mesa-dev] [PATCH v2 19/20] mesa/st: enable compute shaders if images are also supported

2016-04-13 Thread Bas Nieuwenhuizen
v2: Also depend on atomic counters.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/mesa/state_tracker/st_extensions.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c
index 6d407d3..939f15d 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -1168,9 +1168,10 @@ void st_init_extensions(struct pipe_screen *screen,
 consts->MaxComputeWorkGroupCount[i] = grid_size[i];
 consts->MaxComputeWorkGroupSize[i] = block_size[i];
  }
- /* XXX: ARB_compute_shader is not enabled by default because images
-  * support is still not implemented yet. */
- /* extensions->ARB_compute_shader = true; */
+
+ extensions->ARB_compute_shader =
+  extensions->ARB_shader_image_load_store 
&&
+  extensions->ARB_shader_atomic_counters;
   }
}
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 18/20] radeonsi: clean up compute flush

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.h   |  3 ---
 src/gallium/drivers/radeonsi/si_state_draw.c | 27 ++-
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 9b2be6f..9f279dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -63,9 +63,6 @@
 #define SI_CONTEXT_CS_PARTIAL_FLUSH(R600_CONTEXT_PRIVATE_FLAG << 10)
 #define SI_CONTEXT_VGT_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 11)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC  (R600_CONTEXT_PRIVATE_FLAG << 12)
-/* Compute only. */
-#define SI_CONTEXT_FLUSH_WITH_INV_L2   (R600_CONTEXT_PRIVATE_FLAG << 13) /* 
TODO: merge with TC? */
-#define SI_CONTEXT_FLAG_COMPUTE(R600_CONTEXT_PRIVATE_FLAG << 
14)
 
 #define SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER (SI_CONTEXT_FLUSH_AND_INV_CB | \
  SI_CONTEXT_FLUSH_AND_INV_CB_META 
| \
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 56d0965..f0a3f75 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -607,8 +607,6 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct 
r600_atom *atom)
struct r600_common_context *sctx = &si_ctx->b;
struct radeon_winsys_cs *cs = sctx->gfx.cs;
uint32_t cp_coher_cntl = 0;
-   uint32_t compute =
-   PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
 
/* SI has a bug that it always flushes ICACHE and KCACHE if either
 * bit is set. An alternative way is to write SQC_CACHES, but that
@@ -646,7 +644,7 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct 
r600_atom *atom)
 
/* Necessary for DCC */
if (sctx->chip_class >= VI) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0) | 
compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
radeon_emit(cs, 
EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
EVENT_INDEX(5));
radeon_emit(cs, 0);
@@ -661,18 +659,13 @@ void si_emit_cache_flush(struct si_context *si_ctx, 
struct r600_atom *atom)
}
 
if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB_META) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | 
EVENT_INDEX(0));
}
if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB_META) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | 
EVENT_INDEX(0));
}
-   if (sctx->flags & SI_CONTEXT_FLUSH_WITH_INV_L2) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
-   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) | 
EVENT_INDEX(7) |
-   EVENT_WRITE_INV_L2);
-}
 
/* FLUSH_AND_INV events must be emitted before PS_PARTIAL_FLUSH.
 * Otherwise, clearing CMASK (CB meta) with CP DMA isn't reliable.
@@ -681,22 +674,22 @@ void si_emit_cache_flush(struct si_context *si_ctx, 
struct r600_atom *atom)
 * and it is PS_PARTIAL_FLUSH that waits for it to complete.
 */
if (sctx->flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | 
EVENT_INDEX(4));
} else if (sctx->flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | 
EVENT_INDEX(4));
}
if (sctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | 
EVENT_INDEX(4)));
}
if (sctx->flags & SI_CONTEXT_VGT_FLUSH) {
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, 

[Mesa-dev] [PATCH v2 04/20] radeonsi: implement shared atomics

2016-04-13 Thread Bas Nieuwenhuizen
v2: - Use single region
- Use get_memory_ptr

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_shader.c | 77 +++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 3607226..72baaac 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3452,7 +3452,7 @@ static void atomic_fetch_args(
 
buffer_append_args(ctx, emit_data, rsrc, 
bld_base->uint_bld.zero,
   offset, true);
-   } else {
+   } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
unsigned target = inst->Memory.Texture;
LLVMValueRef coords;
 
@@ -3473,17 +3473,92 @@ static void atomic_fetch_args(
}
 }
 
+static void atomic_emit_memory(struct si_shader_context *ctx,
+   struct lp_build_emit_data *emit_data) {
+   struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   const struct tgsi_full_instruction * inst = emit_data->inst;
+   LLVMValueRef ptr, result, arg;
+
+   ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
+
+   arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
+   arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+   LLVMValueRef new_data;
+   new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
+  inst, 3, 0);
+
+   new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
+
+#if HAVE_LLVM >= 0x309
+   result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
+  LLVMAtomicOrderingSequentiallyConsistent,
+  LLVMAtomicOrderingSequentiallyConsistent,
+  false);
+#endif
+
+   result = LLVMBuildExtractValue(builder, result, 0, "");
+   } else {
+   LLVMAtomicRMWBinOp op;
+
+   switch(inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+   op = LLVMAtomicRMWBinOpAdd;
+   break;
+   case TGSI_OPCODE_ATOMXCHG:
+   op = LLVMAtomicRMWBinOpXchg;
+   break;
+   case TGSI_OPCODE_ATOMAND:
+   op = LLVMAtomicRMWBinOpAnd;
+   break;
+   case TGSI_OPCODE_ATOMOR:
+   op = LLVMAtomicRMWBinOpOr;
+   break;
+   case TGSI_OPCODE_ATOMXOR:
+   op = LLVMAtomicRMWBinOpXor;
+   break;
+   case TGSI_OPCODE_ATOMUMIN:
+   op = LLVMAtomicRMWBinOpUMin;
+   break;
+   case TGSI_OPCODE_ATOMUMAX:
+   op = LLVMAtomicRMWBinOpUMax;
+   break;
+   case TGSI_OPCODE_ATOMIMIN:
+   op = LLVMAtomicRMWBinOpMin;
+   break;
+   case TGSI_OPCODE_ATOMIMAX:
+   op = LLVMAtomicRMWBinOpMax;
+   break;
+   default:
+   assert(0 && "unknown atomic opcode");
+   }
+
+   result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
+  LLVMAtomicOrderingSequentiallyConsistent,
+  false);
+   }
+   emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, 
emit_data->dst_type, "");
+}
+
 static void atomic_emit(
const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data)
 {
+   struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
const struct tgsi_full_instruction * inst = emit_data->inst;
char intrinsic_name[40];
LLVMValueRef tmp;
 
+   if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
+   atomic_emit_memory(ctx, emit_data);
+   return;
+   }
+
if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
snpri

[Mesa-dev] [PATCH v2 03/20] radeonsi: implement shared memory load/store

2016-04-13 Thread Bas Nieuwenhuizen
v2: - Use single region
- Combine address calculation

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_shader.c | 84 +++-
 1 file changed, 82 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 5a76435..3607226 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3071,7 +3071,7 @@ static void load_fetch_args(
 
buffer_append_args(ctx, emit_data, rsrc, 
bld_base->uint_bld.zero,
   offset, false);
-   } else {
+   } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
LLVMValueRef coords;
 
image_fetch_rsrc(bld_base, >Src[0], false, );
@@ -3124,6 +3124,53 @@ static void load_emit_buffer(struct si_shader_context 
*ctx,
LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
 }
 
+static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
+   const struct tgsi_full_instruction *inst,
+   LLVMTypeRef type, int arg)
+{
+   struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef offset, ptr;
+   int addr_space;
+
+   offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
+   offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
+
+   ptr = ctx->shared_memory;
+   ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
+   addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), 
"");
+
+   return ptr;
+}
+
+static void load_emit_memory(
+   struct si_shader_context *ctx,
+   struct lp_build_emit_data *emit_data)
+{
+   const struct tgsi_full_instruction *inst = emit_data->inst;
+   struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
+   struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned writemask = inst->Dst[0].Register.WriteMask;
+   LLVMValueRef channels[4], ptr, derived_ptr, index;
+   int chan;
+
+   ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
+
+   for (chan = 0; chan < 4; ++chan) {
+   if (!(writemask & (1 << chan))) {
+   channels[chan] = LLVMGetUndef(base->elem_type);
+   continue;
+   }
+
+   index = lp_build_const_int32(gallivm, chan);
+   derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
+   channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
+   }
+   emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, 
channels, 4);
+}
+
 static void load_emit(
const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
@@ -3136,6 +3183,11 @@ static void load_emit(
char intrinsic_name[32];
char coords_type[8];
 
+   if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
+   load_emit_memory(ctx, emit_data);
+   return;
+   }
+
if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
emit_optimization_barrier(ctx);
 
@@ -3201,7 +3253,7 @@ static void store_fetch_args(
 
buffer_append_args(ctx, emit_data, rsrc, 
bld_base->uint_bld.zero,
   offset, false);
-   } else {
+   } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
unsigned target = inst->Memory.Texture;
LLVMValueRef coords;
 
@@ -3297,6 +3349,31 @@ static void store_emit_buffer(
}
 }
 
+static void store_emit_memory(
+   struct si_shader_context *ctx,
+   struct lp_build_emit_data *emit_data)
+{
+   const struct tgsi_full_instruction *inst = emit_data->inst;
+   struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+   struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned writemask = inst->Dst[0].Register.WriteMask;
+   LLVMValueRef ptr, derived_ptr, data, index;
+   int chan;
+
+   ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
+
+   for (chan = 0; chan < 4; ++chan) {
+   if (!(writemask & (1 << chan))) {
+   continue;
+   }
+   data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
+   index = lp_build_const_int32(gallivm, chan);
+   derived_ptr = LLVMBuildGEP(builder, ptr, , 1, "")

[Mesa-dev] [PATCH v2 07/20] radeonsi: implement TGSI compute shader creation

2016-04-13 Thread Bas Nieuwenhuizen
v2: Moved scratch_enabled initialization after compile.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 74 +++
 1 file changed, 56 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 1ec695e..b01b926 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include "tgsi/tgsi_parse.h"
 #include "util/u_memory.h"
 #include "radeon/r600_pipe_common.h"
 #include "radeon/radeon_elf_util.h"
@@ -35,13 +36,11 @@
 #define MAX_GLOBAL_BUFFERS 20
 
 struct si_compute {
-   struct si_context *ctx;
-
+   unsigned ir_type;
unsigned local_size;
unsigned private_size;
unsigned input_size;
struct si_shader shader;
-   unsigned num_user_sgprs;
 
struct r600_resource *input_buffer;
struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
@@ -93,33 +92,72 @@ static void *si_create_compute_state(
const struct pipe_compute_state *cso)
 {
struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
struct si_compute *program = CALLOC_STRUCT(si_compute);
-   const struct pipe_llvm_program_header *header;
-   const char *code;
+   struct si_shader *shader = &program->shader;
 
-   header = cso->prog;
-   code = cso->prog + sizeof(struct pipe_llvm_program_header);
 
-   program->ctx = sctx;
+   program->ir_type = cso->ir_type;
program->local_size = cso->req_local_mem;
program->private_size = cso->req_private_mem;
program->input_size = cso->req_input_mem;
 
-   radeon_elf_read(code, header->num_bytes, &program->shader.binary);
 
-   /* init_scratch_buffer patches the shader code with the scratch address,
-* so we need to call it before si_shader_binary_read() which uploads
-* the shader code to the GPU.
-*/
-   init_scratch_buffer(sctx, program);
-   si_shader_binary_read_config(&program->shader.binary,
-   &program->shader.config, 0);
+   if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
+   struct si_shader_selector sel = {0};
+   bool scratch_enabled;
+
+   sel.tokens = tgsi_dup_tokens(cso->prog);
+   if (!sel.tokens) {
+   return NULL;
+   }
+
+   tgsi_scan_shader(cso->prog, &sel.info);
+   sel.type = PIPE_SHADER_COMPUTE;
+   sel.local_size = cso->req_local_mem;
+
+   p_atomic_inc(&sscreen->b.num_shaders_created);
+
+   program->shader.selector = &sel;
+
+   if (si_compile_tgsi_shader(sscreen, sctx->tm, &program->shader,
+  true, &sctx->b.debug)) {
+   FREE(sel.tokens);
+   return NULL;
+   }
+
+   scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
+
+   shader->config.rsrc2 = S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
+  S_00B84C_SCRATCH_EN(scratch_enabled) |
+  S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
+  S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) |
+  S_00B84C_LDS_SIZE(shader->config.lds_size);
+
+   FREE(sel.tokens);
+   } else {
+   const struct pipe_llvm_program_header *header;
+   const char *code;
+   header = cso->prog;
+   code = cso->prog + sizeof(struct pipe_llvm_program_header);
+
+   radeon_elf_read(code, header->num_bytes, &program->shader.binary);
+   /* init_scratch_buffer patches the shader code with the scratch address,
+   * so we need to call it before si_shader_binary_read() which uploads
+   * the shader code to the GPU.
+   */
+   init_scratch_buffer(sctx, program);
+   si_shader_binary_read_config(&program->shader.binary,
+   &program->shader.config, 0);
+   }
si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
   TGSI_PROCESSOR_COMPUTE, stderr);
si_shader_binary_upload(sctx->screen, &program->shader);
 
-   program->input_buffer = si_resource_create_custom(sctx->b.b.screen,
-   PIPE_USAGE_IMMUTABLE, program->input_size);
+   if (program->input_size) {
+   program->input_buffer = 
si_resource_create_custom(sctx->b.b.screen,
+   PIPE_USAGE_IMMUTABLE, program->input_size);
+   }
 
return program;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 17/20] radeonsi: do not do two full flushes on every compute dispatch

2016-04-13 Thread Bas Nieuwenhuizen
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place with waits on finishing for pixel shaders
also has a write after read hazard with compute shaders.

Invalidating L2 waits implicitly on pixel and compute shaders,
so, we don't need a CS_PARTIAL_FLUSH for switching FBO.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
 src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
 src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
 src/gallium/drivers/radeonsi/si_state.c   |  6 --
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 10b88b3..6803334 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -439,13 +439,8 @@ static void si_launch_grid(
if (!sctx->cs_shader_state.initialized)
si_initialize_compute(sctx);
 
-   sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLUSH_WITH_INV_L2 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
+   if (sctx->b.flags)
+   si_emit_cache_flush(sctx, NULL);
 
if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
return;
@@ -478,14 +473,6 @@ static void si_launch_grid(
si_setup_tgsi_grid(sctx, info);
 
si_emit_dispatch_packets(sctx, info);
-
-   sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
 }
 
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, 
struct pipe_resource *dst,
uint64_t va = r600_resource(dst)->gpu_address + offset;
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
}
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
/* This is the main part doing the copying. Src is always aligned. */
main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index a2c096f..04dada6 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context 
*ctx,
 * start writing to the targets.
 */
if (num_targets)
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH;
 
/* Streamout buffers must be bound in 2 places:
 * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 82ae4c4..a62dc52 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3467,7 +3467,8 @@ static void si_memory_barrier(struct pipe_context *ctx, 
unsigned flags)
 
/* Subsequent commands must wait for all shader invocations to
 * complete. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH;
 
if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3478,8 @@ static void si_memory_barrier(struct pipe_context *ctx, 
unsigned flags)
 PIPE_BARRIER_SHADER_BUFFER |
 PIPE_BARRIER_TEXTURE |
 PIPE_BARRIER_IMAGE |
-PIPE_BARRIER_STREAMOUT_BUFFER)) {
+PIPE_BARRIER_STREAMOUT_BUFFER |
+PIPE_BA

[Mesa-dev] [PATCH v2 20/20] radeonsi: enable TGSI support cap for compute shaders

2016-04-13 Thread Bas Nieuwenhuizen
v2: Use chip_class instead of family.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 docs/GL3.txt  |  4 ++--
 docs/relnotes/11.3.0.html |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.c | 21 -
 src/gallium/drivers/radeonsi/si_pipe.c|  3 ++-
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index dc75cf8..6b5e016 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -167,7 +167,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
   GL_ARB_ES3_compatibility  DONE (all drivers that 
support GLSL 3.30)
   GL_ARB_clear_buffer_objectDONE (all drivers)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_copy_image DONE (i965, nv50, 
nvc0, r600, radeonsi)
   GL_KHR_debug  DONE (all drivers)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
@@ -225,7 +225,7 @@ GL 4.5, GLSL 4.50:
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
   GL_ARB_arrays_of_arrays   DONE (all drivers that 
support GLSL 1.30)
-  GL_ARB_compute_shader DONE (i965)
+  GL_ARB_compute_shader DONE (i965, radeonsi)
   GL_ARB_draw_indirect  DONE (i965, nvc0, 
r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location  DONE (all drivers that 
support GLSL)
   GL_ARB_framebuffer_no_attachments DONE (i965, nvc0, 
r600, radeonsi, softpipe)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 0f9aed8..5a7083c 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with 
certain drivers.
 
 
 OpenGL 4.2 on radeonsi
+GL_ARB_compute_shader on radeonsi
 GL_ARB_framebuffer_no_attachments on nvc0, r600, radeonsi, softpipe
 GL_ARB_internalformat_query2 on all drivers
 GL_ARB_robust_buffer_access_behavior on radeonsi
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a7477ab..64da62f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -645,23 +645,34 @@ static int r600_get_compute_param(struct pipe_screen 
*screen,
uint64_t *grid_size = ret;
grid_size[0] = 65535;
grid_size[1] = 65535;
-   grid_size[2] = 1;
+   grid_size[2] = 65535;
}
return 3 * sizeof(uint64_t) ;
 
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
if (ret) {
uint64_t *block_size = ret;
-   block_size[0] = 256;
-   block_size[1] = 256;
-   block_size[2] = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI) {
+   block_size[0] = 2048;
+   block_size[1] = 2048;
+   block_size[2] = 2048;
+   } else {
+   block_size[0] = 256;
+   block_size[1] = 256;
+   block_size[2] = 256;
+   }
}
return 3 * sizeof(uint64_t);
 
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
if (ret) {
uint64_t *max_threads_per_block = ret;
-   *max_threads_per_block = 256;
+   if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+   ir_type == PIPE_SHADER_IR_TGSI)
+   *max_threads_per_block = 2048;
+   else
+   *max_threads_per_block = 256;
}
return sizeof(uint64_t);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 8db8ca6..d169469 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -464,7 +464,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, 
unsigned shader, enu
return PIPE_SHADER_IR_NATIVE;
 
case PIPE_SHADER_CAP_SUPPORTED_IRS:
-   re

[Mesa-dev] [PATCH v2 12/20] radeonsi: only emit compute shader state when switching shaders

2016-04-13 Thread Bas Nieuwenhuizen
v2: - Do check if anything changed earlier
- Use emitted_program instead of emitted_bo to prevent
  shaders with shader->bo = NULL confusing the check
- Use radeon_set_sh_reg*

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 145 ++
 src/gallium/drivers/radeonsi/si_pipe.h|   2 +
 2 files changed, 88 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 9119286..cdced47 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -183,6 +183,7 @@ static void si_initialize_compute(struct si_context *sctx)
  0x190 /* Default value */);
}
 
+   sctx->cs_shader_state.emitted_program = NULL;
sctx->cs_shader_state.initialized = true;
 }
 
@@ -224,6 +225,83 @@ static bool si_setup_compute_scratch_buffer(struct 
si_context *sctx,
return true;
 }
 
+static bool si_switch_compute_shader(struct si_context *sctx,
+ struct si_compute *program,
+ struct si_shader *shader, unsigned offset)
+{
+   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+   struct si_shader_config inline_config = {0};
+   struct si_shader_config *config;
+   uint64_t shader_va;
+
+   if (sctx->cs_shader_state.emitted_program == program &&
+   sctx->cs_shader_state.offset == offset)
+   return true;
+
+   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
+   config = >config;
+   } else {
+   unsigned lds_blocks;
+
+   config = _config;
+   si_shader_binary_read_config(>binary, config, offset);
+
+   lds_blocks = config->lds_size;
+   /* XXX: We are over allocating LDS.  For SI, the shader reports
+   * LDS in blocks of 256 bytes, so if there are 4 bytes lds
+   * allocated in the shader and 4 bytes allocated by the state
+   * tracker, then we will set LDS_SIZE to 512 bytes rather than 
256.
+   */
+   if (sctx->b.chip_class <= SI) {
+   lds_blocks += align(program->local_size, 256) >> 8;
+   } else {
+   lds_blocks += align(program->local_size, 512) >> 9;
+   }
+
+   assert(lds_blocks <= 0xFF);
+
+   config->rsrc2 &= C_00B84C_LDS_SIZE;
+   config->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+   }
+
+   if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+   return false;
+
+   if (shader->scratch_bo) {
+   COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u 
bytes; "
+   "Total Scratch: %u bytes\n", sctx->scratch_waves,
+   config->scratch_bytes_per_wave,
+   config->scratch_bytes_per_wave *
+   sctx->scratch_waves);
+
+   radeon_add_to_buffer_list(>b, >b.gfx,
+ shader->scratch_bo, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SCRATCH_BUFFER);
+   }
+
+   shader_va = shader->bo->gpu_address + offset;
+
+   radeon_add_to_buffer_list(>b, >b.gfx, shader->bo,
+ RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
+
+   radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+   radeon_emit(cs, shader_va >> 8);
+   radeon_emit(cs, shader_va >> 40);
+
+   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+   radeon_emit(cs, config->rsrc1);
+   radeon_emit(cs, config->rsrc2);
+
+   radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves)
+| S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));
+
+   sctx->cs_shader_state.emitted_program = program;
+   sctx->cs_shader_state.offset = offset;
+
+   return true;
+}
+
 static void si_upload_compute_input(struct si_context *sctx,
   const struct pipe_grid_info *info)
 {
@@ -280,10 +358,7 @@ static void si_launch_grid(
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-   uint64_t shader_va;
unsigned i;
-   struct si_shader *shader = >shader;
-   unsigned lds_blocks;
 
si_need_cs_space(sctx);
 
@@ -300,29 +375,12 @@ static void si_launch_grid(
 
pm4->compute_pkt = true;
 
-   /* Read the config information */
-   si_shader_bin

[Mesa-dev] [PATCH v2 13/20] radeonsi: implement TGSI compute dispatch

2016-04-13 Thread Bas Nieuwenhuizen
v2: - Use radeon_set_sh_reg_seq.
- Set predicate bit for conditional rendering.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 104 ++
 1 file changed, 77 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index cdced47..6a4db3a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -352,13 +352,85 @@ static void si_upload_compute_input(struct si_context 
*sctx,
pipe_resource_reference((struct pipe_resource**)_buffer, NULL);
 }
 
+static void si_setup_tgsi_grid(struct si_context *sctx,
+const struct pipe_grid_info *info)
+{
+   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+   unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
+ 4 * SI_SGPR_GRID_SIZE;
+
+   if (info->indirect) {
+   uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+   uint64_t va = base_va + info->indirect_offset;
+   int i;
+
+   radeon_add_to_buffer_list(>b, >b.gfx,
+(struct r600_resource *)info->indirect,
+RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
+   for (i = 0; i < 3; ++i) {
+   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+   COPY_DATA_DST_SEL(COPY_DATA_REG));
+   radeon_emit(cs, (va +  4 * i));
+   radeon_emit(cs, (va + 4 * i) >> 32);
+   radeon_emit(cs, (grid_size_reg >> 2) + i);
+   radeon_emit(cs, 0);
+   }
+   } else {
+
+   radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+   radeon_emit(cs, info->grid[0]);
+   radeon_emit(cs, info->grid[1]);
+   radeon_emit(cs, info->grid[2]);
+   }
+}
+
+static void si_emit_dispatch_packets(struct si_context *sctx,
+ const struct pipe_grid_info *info)
+{
+   struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+   bool render_cond_bit = sctx->b.render_cond && 
!sctx->b.render_cond_force_off;
+
+   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+   radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+   radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+
+   if (info->indirect) {
+   uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+
+   radeon_add_to_buffer_list(>b, >b.gfx,
+(struct r600_resource *)info->indirect,
+RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
+   radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cs, 1);
+   radeon_emit(cs, base_va);
+   radeon_emit(cs, base_va >> 32);
+
+   radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 
render_cond_bit) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cs, info->indirect_offset);
+   radeon_emit(cs, 1);
+   } else {
+   radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cs, info->grid[0]);
+   radeon_emit(cs, info->grid[1]);
+   radeon_emit(cs, info->grid[2]);
+   radeon_emit(cs, 1);
+   }
+}
+
+
 static void si_launch_grid(
struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
-   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-   unsigned i;
+   int i;
 
si_need_cs_space(sctx);
 
@@ -373,21 +445,12 @@ static void si_launch_grid(
 SI_CONTEXT_FLAG_COMPUTE;
si_emit_cache_flush(sctx, NULL);
 
-   pm4->compute_pkt = true;
-
if (!si_switch_compute_shader(sctx, program, >shader, 
info->pc))
return;
 
if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
si_upload_compute_input(sctx, info);
 
-   si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X,
-   S_00B81C_NUM_THREAD_FULL(info->block[0]));
-   si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y,
-   S_00B820_NUM_THREAD_FULL(i

[Mesa-dev] [PATCH v2 05/20] radeonsi: set maximum work group size based on block size

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 72baaac..05c70e8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5065,6 +5065,18 @@ static void create_function(struct si_shader_context 
*ctx)
  S_0286D0_LINEAR_CENTROID_ENA(1) |
  S_0286D0_FRONT_FACE_ENA(1) |
  S_0286D0_POS_FIXED_PT_ENA(1));
+   } else if (ctx->type == TGSI_PROCESSOR_COMPUTE) {
+   const unsigned *properties = shader->selector->info.properties;
+   unsigned max_work_group_size =
+  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
+  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
+  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
+
+   assert(max_work_group_size);
+
+   radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
+ "amdgpu-max-work-group-size",
+ max_work_group_size);
}
 
shader->info.num_input_sgprs = 0;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 11/20] radeonsi: rework compute scratch buffer

2016-04-13 Thread Bas Nieuwenhuizen
Instead of having a scratch buffer per program, have one per
context.

Also remove the per-kernel wave count calculation; it only
helped when the total number of waves in the dispatch was
smaller than sctx->scratch_waves.

v2: Fix style issue.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 137 ++
 src/gallium/drivers/radeonsi/si_pipe.c|   1 +
 src/gallium/drivers/radeonsi/si_pipe.h|   2 +
 3 files changed, 47 insertions(+), 93 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 4db436e..9119286 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -46,47 +46,6 @@ struct si_compute {
struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
 };
 
-static void init_scratch_buffer(struct si_context *sctx, struct si_compute 
*program)
-{
-   unsigned scratch_bytes = 0;
-   uint64_t scratch_buffer_va;
-   unsigned i;
-
-   /* Compute the scratch buffer size using the maximum number of waves.
-* This way we don't need to recompute it for each kernel launch. */
-   unsigned scratch_waves = 32 * 
sctx->screen->b.info.num_good_compute_units;
-   for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
-   unsigned offset =
-   program->shader.binary.global_symbol_offsets[i];
-   unsigned scratch_bytes_needed;
-
-   si_shader_binary_read_config(>shader.binary,
->shader.config, offset);
-   scratch_bytes_needed = 
program->shader.config.scratch_bytes_per_wave;
-   scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
-   }
-
-   if (scratch_bytes == 0)
-   return;
-
-   program->shader.scratch_bo =
-   si_resource_create_custom(sctx->b.b.screen,
-   PIPE_USAGE_DEFAULT,
-   scratch_bytes * scratch_waves);
-
-   scratch_buffer_va = program->shader.scratch_bo->gpu_address;
-
-   /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
-* to the maximum bytes needed, so it can compute the stride
-* correctly.
-*/
-   program->shader.config.scratch_bytes_per_wave = scratch_bytes;
-
-   /* Patch the shader with the scratch buffer address. */
-   si_shader_apply_scratch_relocs(sctx,
-   >shader, scratch_buffer_va);
-}
-
 static void *si_create_compute_state(
struct pipe_context *ctx,
const struct pipe_compute_state *cso)
@@ -142,11 +101,6 @@ static void *si_create_compute_state(
code = cso->prog + sizeof(struct pipe_llvm_program_header);
 
radeon_elf_read(code, header->num_bytes, 
>shader.binary);
-   /* init_scratch_buffer patches the shader code with the scratch 
address,
-   * so we need to call it before si_shader_binary_read() which 
uploads
-   * the shader code to the GPU.
-   */
-   init_scratch_buffer(sctx, program);
si_shader_binary_read_config(>shader.binary,
 >shader.config, 0);
}
@@ -191,43 +145,6 @@ static void si_set_global_binding(
}
 }
 
-/**
- * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
- * /p block_layout is the number of threads in each work group.
- * /p grid layout is the number of work groups.
- */
-static unsigned compute_num_waves_for_scratch(
-   const struct radeon_info *info,
-   const uint *block_layout,
-   const uint *grid_layout)
-{
-   unsigned num_sh = MAX2(info->max_sh_per_se, 1);
-   unsigned num_se = MAX2(info->max_se, 1);
-   unsigned num_blocks = 1;
-   unsigned threads_per_block = 1;
-   unsigned waves_per_block;
-   unsigned waves_per_sh;
-   unsigned waves;
-   unsigned scratch_waves;
-   unsigned i;
-
-   for (i = 0; i < 3; i++) {
-   threads_per_block *= block_layout[i];
-   num_blocks *= grid_layout[i];
-   }
-
-   waves_per_block = align(threads_per_block, 64) / 64;
-   waves = waves_per_block * num_blocks;
-   waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
-   scratch_waves = waves_per_sh * num_sh * num_se;
-
-   if (waves_per_block > waves_per_sh) {
-   scratch_waves = waves_per_block * num_sh * num_se;
-   }
-
-   return scratch_waves;
-}
-
 static void si_initialize_compute(struct si_context *sctx)
 {
struct radeon_winsys_cs *cs = sctx

[Mesa-dev] [PATCH v2 14/20] radeonsi: update predicate condition for compute dispatches

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c | 6 ++
 src/gallium/drivers/radeonsi/si_pipe.h| 9 +
 2 files changed, 15 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 6a4db3a..0b248cb 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -448,6 +448,12 @@ static void si_launch_grid(
if (!si_switch_compute_shader(sctx, program, >shader, 
info->pc))
return;
 
+   if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
+   sctx->atoms.s.render_cond->emit(>b,
+   sctx->atoms.s.render_cond);
+   si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
+   }
+
if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
si_upload_compute_input(sctx, info);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 3d3d1a9..4a06854 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -414,6 +414,15 @@ si_set_atom_dirty(struct si_context *sctx,
sctx->dirty_atoms &= ~bit;
 }
 
+static inline bool
+si_is_atom_dirty(struct si_context *sctx,
+ struct r600_atom *atom)
+{
+   unsigned bit = 1 << (atom->id - 1);
+
+   return sctx->dirty_atoms & bit;
+}
+
 static inline void
 si_mark_atom_dirty(struct si_context *sctx,
   struct r600_atom *atom)
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/13] gallium/radeon: move ring_type into winsyses

2016-04-13 Thread Bas Nieuwenhuizen
From: Marek Olšák 

Not used by drivers.
---
 src/gallium/drivers/radeon/radeon_winsys.h|  1 -
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c |  8 
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  1 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |  1 +
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 0c03652..aa94df6 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -229,7 +229,6 @@ struct radeon_winsys_cs {
 unsignedcdw;  /* Number of used dwords. */
 unsignedmax_dw; /* Maximum number of dwords. */
 uint32_t*buf; /* The command buffer. */
-enum ring_type  ring_type;
 };
 
 struct radeon_info {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a9fc55f..63c72fc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -348,7 +348,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
cs->ctx = ctx;
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
-   cs->base.ring_type = ring_type;
+   cs->ring_type = ring_type;
 
if (!amdgpu_init_cs_context(cs, ring_type)) {
   FREE(cs);
@@ -570,7 +570,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
cs->request.fence_info.handle = NULL;
if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != 
AMDGPU_HW_IP_VCE) {
cs->request.fence_info.handle = cs->ctx->user_fence_bo;
-   cs->request.fence_info.offset = cs->base.ring_type;
+   cs->request.fence_info.offset = cs->ring_type;
}
 
r = amdgpu_cs_submit(cs->ctx->ctx, 0, >request, 1);
@@ -591,7 +591,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
   amdgpu_fence_submitted(fence, >request, user_fence);
 
   for (i = 0; i < cs->num_buffers; i++)
- amdgpu_fence_reference(>buffers[i].bo->fence[cs->base.ring_type],
+ amdgpu_fence_reference(>buffers[i].bo->fence[cs->ring_type],
 fence);
}
pipe_mutex_unlock(ws->bo_fence_lock);
@@ -613,7 +613,7 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ctx->ws;
 
-   switch (cs->base.ring_type) {
+   switch (cs->ring_type) {
case RING_DMA:
   /* pad DMA ring to 8 DWs */
   while (rcs->cdw & 7)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index a2fb44a..f4709e9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -66,6 +66,7 @@ struct amdgpu_cs {
unsigned used_ib_space;
 
/* amdgpu_cs_submit parameters */
+   enum ring_type  ring_type;
struct amdgpu_cs_requestrequest;
struct amdgpu_cs_ib_infoib;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index b50e19c..6b2694c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -197,8 +197,8 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
 cs->csc = >csc1;
 cs->cst = >csc2;
 cs->base.buf = cs->csc->buf;
-cs->base.ring_type = ring_type;
 cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
+cs->ring_type = ring_type;
 
 p_atomic_inc(>num_cs);
 return >base;
@@ -281,7 +281,7 @@ static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
  * This doesn't have to be done if virtual memory is enabled,
  * because there is no offset patching with virtual memory.
  */
-if (cs->base.ring_type != RING_DMA || cs->ws->info.has_virtual_memory) 
{
+if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
 return i;
 }
 }
@@ -466,7 +466,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 struct radeon_cs_context *tmp;
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RING_DMA:
 /* pad DMA ring to 8 DWs */
 if (cs->ws->info.chip_class <= SI) {
@@ -526,7 +526,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 p_atomic_inc(>cst->relocs_bo[i].bo->num_active_ioctls);
 }
 
-switch (cs->base.ring_type) {
+switch (cs->ring_type) {
 case RING_DMA:
 cs->cst->flags[0] = 0;
 cs->cst->flags[1] = RADEON_CS_RING_DMA;
@@ -566,7 +566,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs 
*rcs,
 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
 cs->cst->cs.num_chunks = 3;

[Mesa-dev] [PATCH 00/13] Use the constant engine in radeonsi

2016-04-13 Thread Bas Nieuwenhuizen
This series implements updating descriptors using the constant engine.
This should result in a 0%-3% improvement for CPU bound applications,
as we only have to upload the changed descriptors from the CPU.

There are very slight performance advantages on the GPU too, as the
CE uploads the data directly to the L2 cache. This series reduces the
number of cycles waited on LGKM counters by about 1/6th across multiple
applications. However, this translates into almost negligible performance
improvements.

Only the amdgpu winsys supports this for now. I think the radeon
winsys can be extended to support it, but I don't have any radeon
hardware so it is probably better that someone else implements that.

This series does not include support for partially updating
uniforms. My gallium interface proof of concept became kind of messy
and the constant engine patches have been requested for descriptors.

Bas Nieuwenhuizen (10):
  winsys/amdgpu: Enlarge const IB size.
  radeonsi: Create CE IB.
  radeonsi: Add dirty_mask to descriptor list.
  radeonsi: Add CE packet definitions.
  radeonsi: Add CE synchronization.
  radeonsi: Allocate chunks of CE ram.
  radeonsi: Add CE uploader.
  radeonsi: Use CE for vertex buffers.
  gallium/util: Add u_bit_scan_consecutive_range64.
  radeonsi: Use CE for all descriptors.

Marek Olšák (3):
  gallium/radeon: move ring_type into winsyses
  winsys/amdgpu: split IB data into a new structure in preparation for
CE
  winsys/amdgpu: add support for const IB

 src/gallium/auxiliary/util/u_math.h   |   8 ++
 src/gallium/drivers/radeon/r600_pipe_common.c |   1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |   1 +
 src/gallium/drivers/radeon/radeon_winsys.h|  19 +++-
 src/gallium/drivers/radeonsi/si_descriptors.c | 149 +-
 src/gallium/drivers/radeonsi/si_hw_context.c  |   4 +-
 src/gallium/drivers/radeonsi/si_pipe.c|  18 
 src/gallium/drivers/radeonsi/si_pipe.h|   6 ++
 src/gallium/drivers/radeonsi/si_state.h   |   4 +
 src/gallium/drivers/radeonsi/si_state_draw.c  |  24 +
 src/gallium/drivers/radeonsi/sid.h|   6 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |   5 -
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |   6 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 125 ++---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  26 +++--
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  10 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   1 +
 17 files changed, 326 insertions(+), 87 deletions(-)

-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/13] winsys/amdgpu: Enlarge const IB size.

2016-04-13 Thread Bas Nieuwenhuizen
Necessary to prevent performance regressions due to extra flushing.

Probably should enlarge it even further when also updating
uniforms through the CE, but this seems large enough for now.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b0c80c6..3ea0f3d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -199,14 +199,19 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 /* COMMAND SUBMISSION */
 
 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
-  struct amdgpu_cs_ib_info *info)
+  struct amdgpu_cs_ib_info *info, unsigned ib_type)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
 *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
 */
-   const unsigned buffer_size = 128 * 1024 * 4;
-   const unsigned ib_size = 20 * 1024 * 4;
+   unsigned buffer_size = 128 * 1024 * 4;
+   unsigned ib_size = 20 * 1024 * 4;
+
+   if (ib_type == IB_CONST) {
+  buffer_size = 512 * 1024 * 4;
+  ib_size = 128 * 1024 * 4;
+   }
 
ib->base.cdw = 0;
ib->base.buf = NULL;
@@ -350,7 +355,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN], 
IB_MAIN)) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
@@ -373,7 +378,7 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
   return NULL;
 
-   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]))
+   if (!amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], 
IB_CONST))
   return NULL;
 
cs->request.number_of_ibs = 2;
@@ -725,9 +730,9 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
 cleanup:
amdgpu_cs_context_cleanup(cs);
 
-   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN]);
+   amdgpu_get_new_ib(>base, >main, >ib[IB_MAIN], IB_MAIN);
if (cs->const_ib.ib_mapped)
-  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST]);
+  amdgpu_get_new_ib(>base, >const_ib, >ib[IB_CONST], IB_CONST);
 
ws->num_cs_flushes++;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/13] radeonsi: Add CE uploader.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++
 src/gallium/drivers/radeonsi/si_pipe.c| 11 +++
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 3 files changed, 37 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 17c9285..46d00b4 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -61,6 +61,7 @@
 #include "sid.h"
 
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
 
 
@@ -130,6 +131,28 @@ static void si_release_descriptors(struct si_descriptors 
*desc)
FREE(desc->list);
 }
 
+static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned 
size,
+unsigned *out_offset, struct r600_resource **out_buf) {
+   uint64_t va;
+
+   u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
+(struct pipe_resource**)out_buf);
+   if (!out_buf)
+   return false;
+
+   va = (*out_buf)->gpu_address + *out_offset;
+
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
+   radeon_emit(sctx->ce_ib, ce_offset);
+   radeon_emit(sctx->ce_ib, size / 4);
+   radeon_emit(sctx->ce_ib, va);
+   radeon_emit(sctx->ce_ib, va >> 32);
+
+   sctx->ce_need_synchronization = true;
+   return true;
+}
+
+
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index e49836d..b2fa272 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -29,6 +29,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "radeon/radeon_uvd.h"
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 #include "vl/vl_decoder.h"
 
 /*
@@ -41,6 +42,9 @@ static void si_destroy_context(struct pipe_context *context)
 
si_release_all_descriptors(sctx);
 
+   if (sctx->ce_suballocator)
+   u_suballocator_destroy(sctx->ce_suballocator);
+
pipe_resource_reference(>esgs_ring, NULL);
pipe_resource_reference(>gsvs_ring, NULL);
pipe_resource_reference(>tf_ring, NULL);
@@ -147,6 +151,13 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
if (!sctx->ce_ib)
goto fail;
+
+   sctx->ce_suballocator =
+   u_suballocator_create(>b.b, 1024 * 1024,
+ 64, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT, 
FALSE);
+   if(!sctx->ce_suballocator)
+   goto fail;
}
 
sctx->b.gfx.flush = si_context_gfx_flush;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 8eee2fe..afad959 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,6 +80,7 @@
 
 struct si_compute;
 struct hash_table;
+struct u_suballocator;
 
 struct si_screen {
struct r600_common_screen   b;
@@ -191,8 +192,10 @@ struct si_context {
void*custom_blend_dcc_decompress;
void*pstipple_sampler_state;
struct si_screen*screen;
+
struct radeon_winsys_cs *ce_ib;
boolce_need_synchronization;
+   struct u_suballocator   *ce_suballocator;
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 05/13] radeonsi: Create CE IB.

2016-04-13 Thread Bas Nieuwenhuizen
Based on work by Marek Olšák.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 1 +
 src/gallium/drivers/radeon/r600_pipe_common.h | 1 +
 src/gallium/drivers/radeonsi/si_hw_context.c  | 4 +++-
 src/gallium/drivers/radeonsi/si_pipe.c| 7 +++
 src/gallium/drivers/radeonsi/si_pipe.h| 2 ++
 5 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
b/src/gallium/drivers/radeon/r600_pipe_common.c
index a7477ab..a8660f2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -402,6 +402,7 @@ static const struct debug_named_value 
common_debug_options[] = {
{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction 
Scheduler." },
{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders 
compiled on demand" },
+   { "noce", DBG_NO_CE, "Disable the constant engine"},
 
DEBUG_NAMED_VALUE_END /* must be last */
 };
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
b/src/gallium/drivers/radeon/r600_pipe_common.h
index a6abe09..dbbd98f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -95,6 +95,7 @@
 #define DBG_NO_RB_PLUS (1llu << 45)
 #define DBG_SI_SCHED   (1llu << 46)
 #define DBG_MONOLITHIC_SHADERS (1llu << 47)
+#define DBG_NO_CE  (1llu << 48)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 #define R600_MAX_VIEWPORTS16
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index b621b55..9c84623 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -30,6 +30,7 @@
 void si_need_cs_space(struct si_context *ctx)
 {
struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+   struct radeon_winsys_cs *ce_ib = ctx->ce_ib;
struct radeon_winsys_cs *dma = ctx->b.dma.cs;
 
/* Flush the DMA IB if it's not empty. */
@@ -53,7 +54,8 @@ void si_need_cs_space(struct si_context *ctx)
/* If the CS is sufficiently large, don't count the space needed
 * and just flush if there is not enough space left.
 */
-   if (unlikely(cs->cdw > cs->max_dw - 2048))
+   if (unlikely(cs->cdw > cs->max_dw - 2048 ||
+ (ce_ib && ce_ib->cdw > ce_ib->max_dw - 10 * 1024)))
ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 6a990ed..e49836d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -142,6 +142,13 @@ static struct pipe_context *si_create_context(struct 
pipe_screen *screen,
 
sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
   si_context_gfx_flush, sctx);
+
+   if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib) {
+   sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
+   if (!sctx->ce_ib)
+   goto fail;
+   }
+
sctx->b.gfx.flush = si_context_gfx_flush;
 
/* Border colors. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 0398b1d..743c782 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -191,6 +191,8 @@ struct si_context {
void*custom_blend_dcc_decompress;
void*pstipple_sampler_state;
struct si_screen*screen;
+   struct radeon_winsys_cs *ce_ib;
+
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
LLVMTargetMachineReftm;
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/13] radeonsi: Add dirty_mask to descriptor list.

2016-04-13 Thread Bas Nieuwenhuizen
We can then upload only the dirty ones with the constant engine.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++
 src/gallium/drivers/radeonsi/si_state.h   |  1 +
 2 files changed, 24 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index b5557d8..d893ab4 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -108,6 +108,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
desc->list_dirty = true; /* upload the list before the next draw */
+   desc->dirty_mask = num_elements == 64 ? ~0llu : (1llu << num_elements) 
- 1;
desc->shader_userdata_offset = shader_userdata_index * 4;
 
/* Initialize the array to NULL descriptors if the element size is 8. */
@@ -188,6 +189,9 @@ static void si_sampler_views_begin_new_cs(struct si_context 
*sctx,
si_sampler_view_add_buffer(sctx, views->views[i]->texture);
}
 
+   views->desc.dirty_mask = views->desc.num_elements == 64 ? ~0llu :
+(1llu << views->desc.num_elements) - 1;
+
if (!views->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx, views->desc.buffer,
@@ -239,6 +243,7 @@ static void si_set_sampler_view(struct si_context *sctx,
views->desc.enabled_mask &= ~(1llu << slot);
}
 
+   views->desc.dirty_mask |= 1llu << slot;
views->desc.list_dirty = true;
 }
 
@@ -345,6 +350,9 @@ si_image_views_begin_new_cs(struct si_context *sctx, struct 
si_images_info *imag
si_sampler_view_add_buffer(sctx, view->resource);
}
 
+   images->desc.dirty_mask = images->desc.num_elements == 64 ? ~0llu :
+   (1llu << images->desc.num_elements) - 1;
+
if (images->desc.buffer) {
radeon_add_to_buffer_list(>b, >b.gfx,
  images->desc.buffer,
@@ -362,6 +370,7 @@ si_disable_shader_image(struct si_images_info *images, 
unsigned slot)
 
memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
images->desc.enabled_mask &= ~(1llu << slot);
+   images->desc.dirty_mask |= 1llu << slot;
images->desc.list_dirty = true;
}
 }
@@ -443,6 +452,7 @@ si_set_shader_images(struct pipe_context *pipe, unsigned 
shader,
}
 
images->desc.enabled_mask |= 1llu << slot;
+   images->desc.dirty_mask |= 1llu << slot;
images->desc.list_dirty = true;
}
 }
@@ -501,6 +511,7 @@ static void si_bind_sampler_states(struct pipe_context 
*ctx, unsigned shader,
continue;
 
memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
+   desc->dirty_mask |= 1llu << slot;
desc->list_dirty = true;
}
 }
@@ -547,6 +558,9 @@ static void si_buffer_resources_begin_new_cs(struct 
si_context *sctx,
  buffers->shader_usage, buffers->priority);
}
 
+   buffers->desc.dirty_mask = buffers->desc.num_elements == 64 ? ~0llu :
+  (1llu << buffers->desc.num_elements) - 1;
+
if (!buffers->desc.buffer)
return;
radeon_add_to_buffer_list(>b, >b.gfx,
@@ -743,6 +757,7 @@ static void si_set_constant_buffer(struct pipe_context 
*ctx, uint shader, uint s
buffers->desc.enabled_mask &= ~(1llu << slot);
}
 
+   buffers->desc.dirty_mask |= 1llu << slot;
buffers->desc.list_dirty = true;
 }
 
@@ -790,6 +805,7 @@ static void si_set_shader_buffers(struct pipe_context *ctx, 
unsigned shader,
radeon_add_to_buffer_list(>b, >b.gfx, buf,
  buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu << slot;
+   buffers->desc.dirty_mask |= 1llu << slot;
}
 
buffers->desc.list_dirty = true;
@@ -887,6 +903,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
shader, uint slot,
buffers->desc.enabled_mask &= ~(1llu << slot);
}
 
+   buffers->desc.dirty_mask |= 1llu << slot;
buffers->desc.list_dirty = true;
 }
 
@@ -985,6 +1002,7 @@ static void si_set_streamout_targets(struct pipe_context 
*ctx,
   

[Mesa-dev] [PATCH 13/13] radeonsi: Use CE for all descriptors.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 46 +--
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5e26760..5ddb168 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -60,6 +60,7 @@
 #include "si_shader.h"
 #include "sid.h"
 
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
@@ -104,7 +105,10 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
 {
int i;
 
-   assert(num_elements <= sizeof(desc->enabled_mask)*8);
+   /* Ensure that desc->enabled_mask covers all descriptors. The + 1 is
+* to ensure that u_bit_scan_consecutive_range64 never shifts the 1
+* out of the variable while creating the clear mask. */
+   assert(num_elements + 1 <= sizeof(desc->enabled_mask) * CHAR_BIT);
 
desc->list = CALLOC(num_elements, element_dw_size * 4);
desc->element_dw_size = element_dw_size;
@@ -157,24 +161,46 @@ static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc)
 {
unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-   void *ptr;
 
if (!desc->list_dirty)
return true;
 
-   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
-  >buffer_offset,
-  (struct pipe_resource**)>buffer, );
-   if (!desc->buffer)
-   return false; /* skip the draw call */
+   if (sctx->ce_ib) {
+   uint32_t const* list = (uint32_t const*)desc->list;
 
-   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   while(desc->dirty_mask) {
+   int begin, count;
+   u_bit_scan_consecutive_range64(>dirty_mask, 
,
+  );
 
-   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   begin *= desc->element_dw_size;
+   count *= desc->element_dw_size;
+
+   radeon_emit(sctx->ce_ib,
+   PKT3(PKT3_WRITE_CONST_RAM, count, 0));
+   radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
+   radeon_emit_array(sctx->ce_ib, list + begin, count);
+   }
+
+   if(!si_ce_upload(sctx, desc->ce_offset, list_size,
+>buffer_offset, >buffer))
+   return false;
+   } else {
+   void *ptr;
+
+   u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
+   >buffer_offset,
+   (struct pipe_resource**)>buffer, );
+   if (!desc->buffer)
+   return false; /* skip the draw call */
 
+   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
+   }
+   radeon_add_to_buffer_list(>b, >b.gfx, desc->buffer,
+   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
desc->list_dirty = false;
desc->pointer_dirty = true;
+   desc->dirty_mask = 0;
si_mark_atom_dirty(sctx, >shader_userdata.atom);
return true;
 }
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 11/13] radeonsi: Use CE for vertex buffers.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 28 ---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 46d00b4..5e26760 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -641,14 +641,18 @@ static bool si_upload_vertex_buffer_descriptors(struct 
si_context *sctx)
 * directly through a staging buffer and don't go through
 * the fine-grained upload path.
 */
-   u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, 
>buffer_offset,
-  (struct pipe_resource**)>buffer, (void**));
-   if (!desc->buffer)
-   return false;
 
-   radeon_add_to_buffer_list(>b, >b.gfx,
- desc->buffer, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
+   if (sctx->ce_ib) {
+   radeon_emit(sctx->ce_ib, PKT3(PKT3_WRITE_CONST_RAM, 4 * count, 
0));
+   radeon_emit(sctx->ce_ib, desc->ce_offset);
+   ptr = sctx->ce_ib->buf + sctx->ce_ib->cdw;
+   sctx->ce_ib->cdw += count * 4;
+   } else {
+   u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, 
>buffer_offset,
+   (struct pipe_resource**)>buffer, (void**));
+   if (!desc->buffer)
+   return false;
+   }
 
assert(count <= SI_NUM_VERTEX_BUFFERS);
 
@@ -697,6 +701,16 @@ static bool si_upload_vertex_buffer_descriptors(struct 
si_context *sctx)
}
}
 
+   if (sctx->ce_ib) {
+   if (!si_ce_upload(sctx, desc->ce_offset, count * 16,
+ >buffer_offset, >buffer))
+   return false;
+   }
+
+   radeon_add_to_buffer_list(>b, >b.gfx,
+ desc->buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
+
/* Don't flush the const cache. It would have a very negative effect
 * on performance (confirmed by testing). New descriptors are always
 * uploaded to a fresh new buffer, so I don't think flushing the const
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/13] winsys/amdgpu: add support for const IB

2016-04-13 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

v2: use the correct IB to update request (Bas Nieuwenhuizen)
---
 src/gallium/drivers/radeon/radeon_winsys.h | 18 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c  | 48 +++---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h  |  9 +-
 3 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index aa94df6..04ce2fb 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -603,6 +603,24 @@ struct radeon_winsys {
   void *flush_ctx);
 
 /**
+ * Add a constant engine IB to a graphics CS. This makes the graphics CS
+ * from "cs_create" a group of two IBs that share a buffer list and are
+ * flushed together.
+ *
+ * The returned constant CS is only a stream for writing packets to the new
+ * IB. Calling other winsys functions with it is not allowed, not even
+ * "cs_destroy".
+ *
+ * In order to add buffers and check memory usage, use the graphics CS.
+ * In order to flush it, use the graphics CS, which will flush both IBs.
+ * Destroying the graphics CS will destroy both of them.
+ *
+ * \param cs  The graphics CS from "cs_create" that will hold the buffer
+ *list and will be used for flushing.
+ */
+struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs);
+
+/**
  * Destroy a command stream.
  *
  * \param csA command stream to destroy.
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b0fe8b9..b0c80c6 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -350,19 +350,39 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(>ws->base, >main, >ib)) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib[IB_MAIN])) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
}
 
cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
+   cs->request.ibs = >ib[IB_MAIN];
 
p_atomic_inc(>ws->num_cs);
return >main.base;
 }
 
+/**
+ * Attach a constant-engine IB to an existing graphics CS.
+ *
+ * Implements radeon_winsys::cs_add_const_ib. Fails (returns NULL) if the
+ * CS is not on the GFX ring, if a const IB has already been added, or if
+ * no IB space could be obtained. On success the submission request is
+ * switched to two IBs and the const IB is flagged AMDGPU_IB_FLAG_CE.
+ *
+ * \param rcs  the graphics CS returned by cs_create
+ * \return the command stream used to write packets into the const IB,
+ *         or NULL on failure
+ */
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   /* only one const IB can be added */
+   if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
+      return NULL;
+
+   if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST]))
+      return NULL;
+
+   /* Submit both IBs from now on.
+    * NOTE(review): pointing request.ibs at ib[IB_CONST] with a count of 2
+    * presumably relies on IB_CONST preceding IB_MAIN in cs->ib - confirm
+    * against the enum in amdgpu_cs.h. */
+   cs->request.number_of_ibs = 2;
+   cs->request.ibs = &cs->ib[IB_CONST];
+   cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
+
+   return &cs->const_ib.base;
+}
+
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
 int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
@@ -621,6 +641,12 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
   /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
   while (rcs->cdw & 7)
  OUT_CS(rcs, 0x1000); /* type3 nop packet */
+
+  /* Also pad the const IB. */
+  /* TODO: is this the correct packet for the const IB? */
+  if (cs->const_ib.ib_mapped)
+ while (!cs->const_ib.base.cdw || (cs->const_ib.base.cdw & 7))
+OUT_CS(>const_ib.base, 0x1000); /* type3 nop packet */
   break;
case RING_UVD:
   while (rcs->cdw & 15)
@@ -637,6 +663,10 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
amdgpu_cs_add_buffer(rcs, cs->main.big_ib_buffer,
 RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
 
+   if (cs->const_ib.ib_mapped)
+  amdgpu_cs_add_buffer(rcs, cs->const_ib.big_ib_buffer,
+   RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
+
/* If the CS is not empty or overflowed */
if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && 
!debug_get_option_noop()) {
   int r;
@@ -677,9 +707,14 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
 goto cleanup;
   }
 
-  cs->ib.size = cs->main.base.cdw;
+  cs->ib[IB_MAIN].size = cs->main.base.cdw;
   cs->main.used_ib_space += cs->main.base.cdw * 4;
 
+  if (cs->const_ib.ib_mapped) {
+ cs->ib[IB_CONST].size = cs->const_ib.base.cdw;
+ cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
+  }
+
   amdgpu_cs_do_submission(cs, fence);
 
   /* Cleanup. */
@@ -689,7 +724,10 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
 
 cleanup:
amdgpu_cs_context_cleanup(cs);
-   amdgpu_get_new_ib(

[Mesa-dev] [PATCH 09/13] radeonsi: Allocate chunks of CE ram.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 29 ++-
 src/gallium/drivers/radeonsi/si_pipe.h|  1 -
 src/gallium/drivers/radeonsi/si_state.h   |  3 +++
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index d893ab4..17c9285 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -98,7 +98,8 @@ static void si_init_descriptors(struct si_descriptors *desc,
unsigned shader_userdata_index,
unsigned element_dw_size,
unsigned num_elements,
-   const uint32_t *null_descriptor)
+   const uint32_t *null_descriptor,
+   unsigned *ce_offset)
 {
int i;
 
@@ -110,6 +111,9 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->list_dirty = true; /* upload the list before the next draw */
desc->dirty_mask = num_elements == 64 ? ~0llu : (1llu << num_elements) 
- 1;
desc->shader_userdata_offset = shader_userdata_index * 4;
+   desc->ce_offset = *ce_offset;
+
+   *ce_offset += element_dw_size * num_elements * 4;
 
/* Initialize the array to NULL descriptors if the element size is 8. */
if (null_descriptor) {
@@ -522,14 +526,15 @@ static void si_init_buffer_resources(struct 
si_buffer_resources *buffers,
 unsigned num_buffers,
 unsigned shader_userdata_index,
 enum radeon_bo_usage shader_usage,
-enum radeon_bo_priority priority)
+enum radeon_bo_priority priority,
+unsigned *ce_offset)
 {
buffers->shader_usage = shader_usage;
buffers->priority = priority;
buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
si_init_descriptors(>desc, shader_userdata_index, 4,
-   num_buffers, NULL);
+   num_buffers, NULL, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -1347,29 +1352,35 @@ void si_emit_shader_userdata(struct si_context *sctx, 
struct r600_atom *atom)
 void si_init_all_descriptors(struct si_context *sctx)
 {
int i;
+   unsigned ce_offset = 0;
 
for (i = 0; i < SI_NUM_SHADERS; i++) {
si_init_buffer_resources(>const_buffers[i],
 SI_NUM_CONST_BUFFERS, 
SI_SGPR_CONST_BUFFERS,
-RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER);
+RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER,
+_offset);
si_init_buffer_resources(>rw_buffers[i],
 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_RINGS_STREAMOUT,
+_offset);
si_init_buffer_resources(>shader_buffers[i],
 SI_NUM_SHADER_BUFFERS, 
SI_SGPR_SHADER_BUFFERS,
-RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER);
+RADEON_USAGE_READWRITE, 
RADEON_PRIO_SHADER_RW_BUFFER,
+_offset);
 
si_init_descriptors(>samplers[i].views.desc,
SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
-   null_texture_descriptor);
+   null_texture_descriptor, _offset);
 
si_init_descriptors(>images[i].desc,
SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
-   null_image_descriptor);
+   null_image_descriptor, _offset);
}
 
si_init_descriptors(>vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-   4, SI_NUM_VERTEX_BUFFERS, NULL);
+   4, SI_NUM_VERTEX_BUFFERS, NULL, _offset);
+
+   assert(ce_offset <= 32768);
 
/* Set pipe_context functions. */
sctx->b.b.bind_sampler_states = si_bind_sampler_states;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index d9dfb59..8eee2fe 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/s

[Mesa-dev] [PATCH 02/13] winsys/amdgpu: split IB data into a new structure in preparation for CE

2016-04-13 Thread Bas Nieuwenhuizen
From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  5 ---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  6 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 68 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 16 
 4 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 1b2793a..036301e 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -38,11 +38,6 @@
 #include 
 #include 
 
-static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
-{
-   return (struct amdgpu_winsys_bo *)bo;
-}
-
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 54f5dbd..69ada10 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -69,6 +69,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
 static inline
+struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
+{
+   return (struct amdgpu_winsys_bo *)bo; /* plain pointer cast; bo is not validated */
+}
+
+static inline
 void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
 struct amdgpu_winsys_bo *src)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 63c72fc..b0fe8b9 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -198,7 +198,8 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 
 /* COMMAND SUBMISSION */
 
-static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
+  struct amdgpu_cs_ib_info *info)
 {
/* Small IBs are better than big IBs, because the GPU goes idle quicker
 * and there is less waiting for buffers and fences. Proof:
@@ -207,39 +208,36 @@ static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
const unsigned buffer_size = 128 * 1024 * 4;
const unsigned ib_size = 20 * 1024 * 4;
 
-   cs->base.cdw = 0;
-   cs->base.buf = NULL;
+   ib->base.cdw = 0;
+   ib->base.buf = NULL;
 
/* Allocate a new buffer for IBs if the current buffer is all used. */
-   if (!cs->big_ib_buffer ||
-   cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
-  struct radeon_winsys *ws = >ctx->ws->base;
+   if (!ib->big_ib_buffer ||
+   ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
 
-  pb_reference(>big_ib_buffer, NULL);
-  cs->big_ib_winsys_buffer = NULL;
-  cs->ib_mapped = NULL;
-  cs->used_ib_space = 0;
+  pb_reference(>big_ib_buffer, NULL);
+  ib->ib_mapped = NULL;
+  ib->used_ib_space = 0;
 
-  cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
+  ib->big_ib_buffer = ws->buffer_create(ws, buffer_size,
 4096, true,
 RADEON_DOMAIN_GTT,
 RADEON_FLAG_CPU_ACCESS);
-  if (!cs->big_ib_buffer)
+  if (!ib->big_ib_buffer)
  return false;
 
-  cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL,
+  ib->ib_mapped = ws->buffer_map(ib->big_ib_buffer, NULL,
  PIPE_TRANSFER_WRITE);
-  if (!cs->ib_mapped) {
- pb_reference(>big_ib_buffer, NULL);
+  if (!ib->ib_mapped) {
+ pb_reference(>big_ib_buffer, NULL);
  return false;
   }
-
-  cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer;
}
 
-   cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
-   cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
-   cs->base.max_dw = ib_size / 4;
+   info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
+ ib->used_ib_space;
+   ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+   ib->base.max_dw = ib_size / 4;
return true;
 }
 
@@ -271,9 +269,6 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
   break;
}
 
-   cs->request.number_of_ibs = 1;
-   cs->request.ibs = >ib;
-
cs->max_num_buffers = 512;
cs->buffers = (struct amdgpu_cs_buffer*)
   CALLOC(1, cs->max_num_buffers * sizeof(struct 
amdgpu_cs_buffer));
@@ -355,14 +350,17 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
   return NULL;
}
 
-   if (!amdgpu_get_new_ib(cs)) {
+   if (!amdgpu_get_new_ib(>ws->base, >main, >ib)) {
   amdgpu_destroy_cs_context(cs);
   FREE(cs);
   return NULL;
}
 
+   cs->request.number_of_ibs = 1;
+   cs->request.ibs = >ib;
+
p_atomic_inc(>ws->num_cs);
-   return >base;
+   return 

[Mesa-dev] [PATCH 07/13] radeonsi: Add CE packet definitions.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/sid.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/sid.h 
b/src/gallium/drivers/radeonsi/sid.h
index f0aa605..1072e0a 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -145,6 +145,12 @@
 #define PKT3_SET_SH_REG0x76
 #define PKT3_SET_SH_REG_OFFSET 0x77
 #define PKT3_SET_UCONFIG_REG   0x79 /* new for CIK */
+#define PKT3_LOAD_CONST_RAM0x80
+#define PKT3_WRITE_CONST_RAM   0x81
+#define PKT3_DUMP_CONST_RAM0x83
+#define PKT3_INCREMENT_CE_COUNTER  0x84
+#define PKT3_INCREMENT_DE_COUNTER  0x85
+#define PKT3_WAIT_ON_CE_COUNTER0x86
 
 #define PKT_TYPE_S(x)   (((x) & 0x3) << 30)
 #define PKT_TYPE_G(x)   (((x) >> 30) & 0x3)
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 12/13] gallium/util: Add u_bit_scan_consecutive_range64.

2016-04-13 Thread Bas Nieuwenhuizen
For use by radeonsi.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/auxiliary/util/u_math.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_math.h 
b/src/gallium/auxiliary/util/u_math.h
index b4ac0db..3a468e4 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -523,6 +523,14 @@ u_bit_scan_consecutive_range(unsigned *mask, int *start, 
int *count)
*mask &= ~(((1 << *count) - 1) << *start);
 }
 
+/**
+ * 64-bit variant of u_bit_scan_consecutive_range.
+ *
+ * Finds the lowest run of consecutive set bits in \p *mask, returns its
+ * first bit index in \p *start and its length in \p *count, and clears
+ * that run from \p *mask.
+ *
+ * \p *mask must be non-zero on entry (ffsll(0) yields 0, which would make
+ * \p *start negative); callers are expected to loop "while (mask)".
+ */
+static inline void
+u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
+{
+   /* A completely set mask has no zero bit to terminate the run:
+    * ffsll(~mask) would return 0, giving count == -1 and an undefined
+    * shift below. Handle the full 64-bit run explicitly. */
+   if (*mask == ~0llu) {
+      *start = 0;
+      *count = 64;
+      *mask = 0;
+      return;
+   }
+   *start = ffsll(*mask) - 1;
+   /* After the shift the run begins at bit 0; its length is the index of
+    * the first zero bit. */
+   *count = ffsll(~(*mask >> *start)) - 1;
+   *mask &= ~(((1llu << *count) - 1) << *start);
+}
+
 /**
  * Return float bits.
  */
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/13] radeonsi: Add CE synchronization.

2016-04-13 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_pipe.h   |  2 ++
 src/gallium/drivers/radeonsi/si_state_draw.c | 24 
 2 files changed, 26 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 743c782..d9dfb59 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -192,6 +192,8 @@ struct si_context {
void*pstipple_sampler_state;
struct si_screen*screen;
struct radeon_winsys_cs *ce_ib;
+   boolce_need_synchronization;
+
 
struct pipe_fence_handle*last_gfx_fence;
struct si_shader_ctx_state  fixed_func_tcs_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 40cad50..dd13d51 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -753,6 +753,25 @@ static void si_get_draw_start_count(struct si_context 
*sctx,
}
 }
 
+/* Emitted right before the draw packets: when CE synchronization has been
+ * requested, write INCREMENT_CE_COUNTER to the CE IB and a matching
+ * WAIT_ON_CE_COUNTER to the gfx CS. Does nothing otherwise. */
+static void si_ce_pre_draw_synchronization(struct si_context *sctx)
+{
+   struct radeon_winsys_cs *ce_cs;
+   struct radeon_winsys_cs *gfx_cs;
+
+   if (!sctx->ce_need_synchronization)
+      return;
+
+   ce_cs = sctx->ce_ib;
+   gfx_cs = sctx->b.gfx.cs;
+
+   radeon_emit(ce_cs, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
+   radeon_emit(ce_cs, 1);
+
+   radeon_emit(gfx_cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0));
+   radeon_emit(gfx_cs, 1);
+}
+
+/* Emitted right after the draw packets: when CE synchronization has been
+ * requested, write INCREMENT_DE_COUNTER to the gfx CS and clear the
+ * request flag. Does nothing otherwise. */
+static void si_ce_post_draw_synchronization(struct si_context *sctx)
+{
+   struct radeon_winsys_cs *gfx_cs;
+
+   if (!sctx->ce_need_synchronization)
+      return;
+
+   gfx_cs = sctx->b.gfx.cs;
+
+   radeon_emit(gfx_cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
+   radeon_emit(gfx_cs, 0);
+
+   sctx->ce_need_synchronization = false;
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
struct si_context *sctx = (struct si_context *)ctx;
@@ -882,8 +901,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
si_emit_scratch_reloc(sctx);
si_emit_rasterizer_prim_state(sctx);
si_emit_draw_registers(sctx, info);
+
+   si_ce_pre_draw_synchronization(sctx);
+
si_emit_draw_packets(sctx, info, );
 
+   si_ce_post_draw_synchronization(sctx);
+
if (sctx->trace_buf)
si_trace_emit(sctx);
 
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 2/2] radeonsi: Enable loading into CE RAM.

2016-04-21 Thread Bas Nieuwenhuizen
We need to enable a bit in the CONTEXT_CONTROL packet for the
loads to work.

v2: Style issues.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 7 +++
 src/gallium/drivers/radeonsi/si_hw_context.c  | 5 +
 src/gallium/drivers/radeonsi/si_state.h   | 1 +
 3 files changed, 13 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 1580e61..9ad630b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -185,6 +185,13 @@ static void si_reinitialize_ce_ram(struct si_context *sctx,
desc->ce_ram_dirty = false;
 }
 
+/* Write a CONTEXT_CONTROL packet to the given IB whose load-control dword
+ * has both the LOAD_ENABLE and LOAD_CE_RAM fields set and whose
+ * shadow-enable dword has SHADOW_ENABLE set. */
+void si_ce_enable_loads(struct radeon_winsys_cs *ib)
+{
+   const uint32_t load_control = LOAD_CONTROL_LOAD_ENABLE(1) |
+                                 LOAD_CONTROL_LOAD_CE_RAM(1);
+   const uint32_t shadow_enable = SHADOW_ENABLE_SHADOW_ENABLE(1);
+
+   radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+   radeon_emit(ib, load_control);
+   radeon_emit(ib, shadow_enable);
+}
+
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc,
  struct r600_atom * atom)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index e3abb7f..e6018f3 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -202,6 +202,11 @@ void si_begin_new_cs(struct si_context *ctx)
if (ctx->init_config_gs_rings)
si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
+   if (ctx->ce_preamble_ib)
+   si_ce_enable_loads(ctx->ce_preamble_ib);
+   else if (ctx->ce_ib)
+   si_ce_enable_loads(ctx->ce_ib);
+
ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
ctx->framebuffer.dirty_zsbuf = true;
si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index c4b2b45..cbe91dd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -251,6 +251,7 @@ struct si_buffer_resources {
} while(0)
 
 /* si_descriptors.c */
+void si_ce_enable_loads(struct radeon_winsys_cs *ib);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 1/2] radeonsi: Use defines for CONTEXT_CONTROL instead of magic values.

2016-04-21 Thread Bas Nieuwenhuizen
v2: Use field names provided by Nicolai.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---

I am sending this with the changed names, although they seem redundant to me.
Should I just drop the register name, and optionally add a CONTEXT_CONTROL prefix?

 src/gallium/drivers/radeonsi/si_state.c | 4 ++--
 src/gallium/drivers/radeonsi/sid.h  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 305a70b..e195eaf 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3821,8 +3821,8 @@ static void si_init_config(struct si_context *sctx)
return;
 
si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
-   si_pm4_cmd_add(pm4, 0x80000000);
-   si_pm4_cmd_add(pm4, 0x80000000);
+   si_pm4_cmd_add(pm4, LOAD_CONTROL_LOAD_ENABLE(1));
+   si_pm4_cmd_add(pm4, SHADOW_ENABLE_SHADOW_ENABLE(1));
si_pm4_cmd_end(pm4, false);
 
si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
diff --git a/src/gallium/drivers/radeonsi/sid.h 
b/src/gallium/drivers/radeonsi/sid.h
index 516e114..34ced0e 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -88,6 +88,9 @@
 #define PKT3_INDEX_BASE0x26
 #define PKT3_DRAW_INDEX_2  0x27
 #define PKT3_CONTEXT_CONTROL   0x28
+#define LOAD_CONTROL_LOAD_ENABLE(x)(((x) & 0x1) << 31)
+#define LOAD_CONTROL_LOAD_CE_RAM(x)(((x) & 0x1) << 28)
+#define SHADOW_ENABLE_SHADOW_ENABLE(x) (((x) & 0x1) << 31)
 #define PKT3_INDEX_TYPE0x2A
 #define PKT3_DRAW_INDIRECT_MULTI   0x2C
 #define PKT3_DRAW_INDEX_AUTO   0x2D
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] radeonsi: Enable loading into CE RAM.

2016-04-21 Thread Bas Nieuwenhuizen
Hi Marek,

The hang was most likely caused by something else I had tried in order to
fix the loads, which only started having an effect once the loads were
enabled. With the kernel patch I had the hang too, until I cleaned up the
leftovers of all my previous attempts at fixes in mesa.

- Bas



On Thu, Apr 21, 2016 at 11:44 AM, Marek Olšák <mar...@gmail.com> wrote:
> On Thu, Apr 21, 2016 at 1:49 AM, Bas Nieuwenhuizen
> <b...@basnieuwenhuizen.nl> wrote:
>> We need to enable a bit in the CONTEXT_CONTROL packet for the
>> loads to work.
>>
>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeonsi/si_descriptors.c | 6 ++
>>  src/gallium/drivers/radeonsi/si_hw_context.c  | 5 +
>>  src/gallium/drivers/radeonsi/si_state.h   | 1 +
>>  3 files changed, 12 insertions(+)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 1580e61..30e65a9 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -185,6 +185,12 @@ static void si_reinitialize_ce_ram(struct si_context 
>> *sctx,
>> desc->ce_ram_dirty = false;
>>  }
>>
>> +void si_ce_enable_loads(struct radeon_winsys_cs *ib) {
>> +   radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
>> +   radeon_emit(ib, LOAD_CONTROL_UPDATE(1) | LOAD_CONTROL_CE_RAM_EN(1));
>> +   radeon_emit(ib, SHADOW_ENABLE_UPDATE(1));
>> +}
>> +
>>  static bool si_upload_descriptors(struct si_context *sctx,
>>   struct si_descriptors *desc,
>>   struct r600_atom * atom)
>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
>> b/src/gallium/drivers/radeonsi/si_hw_context.c
>> index e3abb7f..e6018f3 100644
>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>> @@ -202,6 +202,11 @@ void si_begin_new_cs(struct si_context *ctx)
>> if (ctx->init_config_gs_rings)
>> si_pm4_emit(ctx, ctx->init_config_gs_rings);
>>
>> +   if (ctx->ce_preamble_ib)
>> +   si_ce_enable_loads(ctx->ce_preamble_ib);
>> +   else if (ctx->ce_ib)
>> +   si_ce_enable_loads(ctx->ce_ib);
>
> So what was the cause of hangs you were seeing? Does it not hang when
> there is CONTEXT_CONTROL in the CE IB?
>
> Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] radeonsi: Enable loading into CE RAM.

2016-04-20 Thread Bas Nieuwenhuizen
We need to enable a bit in the CONTEXT_CONTROL packet for the
loads to work.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 6 ++
 src/gallium/drivers/radeonsi/si_hw_context.c  | 5 +
 src/gallium/drivers/radeonsi/si_state.h   | 1 +
 3 files changed, 12 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 1580e61..30e65a9 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -185,6 +185,12 @@ static void si_reinitialize_ce_ram(struct si_context *sctx,
desc->ce_ram_dirty = false;
 }
 
+void si_ce_enable_loads(struct radeon_winsys_cs *ib) {
+   radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+   radeon_emit(ib, LOAD_CONTROL_UPDATE(1) | LOAD_CONTROL_CE_RAM_EN(1));
+   radeon_emit(ib, SHADOW_ENABLE_UPDATE(1));
+}
+
 static bool si_upload_descriptors(struct si_context *sctx,
  struct si_descriptors *desc,
  struct r600_atom * atom)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index e3abb7f..e6018f3 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -202,6 +202,11 @@ void si_begin_new_cs(struct si_context *ctx)
if (ctx->init_config_gs_rings)
si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
+   if (ctx->ce_preamble_ib)
+   si_ce_enable_loads(ctx->ce_preamble_ib);
+   else if (ctx->ce_ib)
+   si_ce_enable_loads(ctx->ce_ib);
+
ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
ctx->framebuffer.dirty_zsbuf = true;
si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index c4b2b45..cbe91dd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -251,6 +251,7 @@ struct si_buffer_resources {
} while(0)
 
 /* si_descriptors.c */
+void si_ce_enable_loads(struct radeon_winsys_cs *ib);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
-- 
2.8.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


<    1   2   3   4   5   6   7   8   9   10   >