date:20190212

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/Makefile.sources |1 +
 src/gallium/drivers/radeonsi/meson.build  |1 +
 .../radeonsi/si_compute_prim_discard.c| 1537 +
 src/gallium/drivers/radeonsi/si_debug.c   |   32 +-
 src/gallium/drivers/radeonsi/si_fence.c   |8 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c  |   69 +
 src/gallium/drivers/radeonsi/si_pipe.c|   11 +-
 src/gallium/drivers/radeonsi/si_pipe.h|   61 +-
 src/gallium/drivers/radeonsi/si_query.c   |6 +
 src/gallium/drivers/radeonsi/si_shader.c  |   63 +
 src/gallium/drivers/radeonsi/si_shader.h  |   14 +
 src/gallium/drivers/radeonsi/si_state.c   |9 +
 src/gallium/drivers/radeonsi/si_state.h   |4 +
 src/gallium/drivers/radeonsi/si_state_draw.c  |  238 ++-
 src/gallium/drivers/radeonsi/si_state_msaa.c  |4 +
 .../drivers/radeonsi/si_state_shaders.c   |   23 +-
 .../drivers/radeonsi/si_state_viewport.c  |6 +
 17 files changed, 2061 insertions(+), 26 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/si_compute_prim_discard.c

diff --git a/src/gallium/drivers/radeonsi/Makefile.sources 
b/src/gallium/drivers/radeonsi/Makefile.sources
index 713629c6e87..62747f57b87 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -10,6 +10,7 @@ C_SOURCES := \
si_build_pm4.h \
si_clear.c \
si_compute.c \
+   si_compute_prim_discard.c \
si_compute.h \
si_compute_blit.c \
si_cp_dma.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build 
b/src/gallium/drivers/radeonsi/meson.build
index cf3b24cd358..ae216bc1858 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -26,6 +26,7 @@ files_libradeonsi = files(
   'si_build_pm4.h',
   'si_clear.c',
   'si_compute.c',
+  'si_compute_prim_discard.c',
   'si_compute.h',
   'si_compute_blit.c',
   'si_cp_dma.c',
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c 
b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
new file mode 100644
index 000..661407cb648
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -0,0 +1,1537 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "sid.h"
+#include "si_build_pm4.h"
+#include "ac_llvm_cull.h"
+
+#include "util/u_prim.h"
+#include "util/u_suballoc.h"
+#include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
+
+/* Based on:
+ * 
https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
+ */
+
+/* This file implements primitive culling using asynchronous compute.
+ * It's written to be GL conformant.
+ *
+ * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
+ * in a compute shader. The shader processes 1 primitive/thread by invoking
+ * the VS for each vertex to get the positions, decomposes strips and fans
+ * into triangles (if needed), eliminates primitive restart (if needed),
+ * does (W<0) culling, face culling, view XY culling, zero-area and
+ * small-primitive culling, and generates a new index buffer that doesn't
+ * contain culled primitives.
+ *
+ * The index buffer is generated using the Ordered Count feature of GDS,
+ * which is an atomic counter that is incremented in the wavefront launch
+ * order, so that the original primitive order is preserved.
+ *
+ * Another GDS ordered counter is used to eliminate primitive restart indices.
+ * If a restart index lands on an even thread ID, the compute shader has to 
flip
+ * the primitive orientation of the whole following triangle strip. The 
primitive
+ * orientation has to be correct after strip

[Mesa-dev] [PATCH 08/26] winsys/amdgpu: add a parallel compute IB coupled with a gfx IB

From: Marek Olšák 

---
 src/amd/common/ac_gpu_info.c  |   6 +
 src/amd/common/ac_gpu_info.h  |   2 +
 src/gallium/drivers/r600/r600_pipe_common.c   |   4 +-
 src/gallium/drivers/radeon/radeon_winsys.h|  36 ++-
 src/gallium/drivers/radeonsi/si_fence.c   |   4 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 214 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  13 ++
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |   3 +-
 8 files changed, 272 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 6971e4f0a8e..4d9f6afca01 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -398,6 +398,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
info->drm_minor >= 13;
info->has_2d_tiling = true;
info->has_read_registers_query = true;
+   info->has_scheduled_fence_dependency = info->drm_minor >= 28;
 
info->num_render_backends = amdinfo->rb_pipes;
/* The value returned by the kernel driver was wrong. */
@@ -463,6 +464,9 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
assert(ib_align);
info->ib_start_alignment = ib_align;
 
+   info->has_gds_ordered_append = info->chip_class >= CIK &&
+  info->drm_minor >= 29 &&
+  HAVE_LLVM >= 0x0800;
return true;
 }
 
@@ -562,6 +566,8 @@ void ac_print_gpu_info(struct radeon_info *info)
printf("has_sparse_vm_mappings = %u\n", 
info->has_sparse_vm_mappings);
printf("has_2d_tiling = %u\n", info->has_2d_tiling);
printf("has_read_registers_query = %u\n", 
info->has_read_registers_query);
+   printf("has_gds_ordered_append = %u\n", 
info->has_gds_ordered_append);
+   printf("has_scheduled_fence_dependency = %u\n", 
info->has_scheduled_fence_dependency);
 
printf("Shader core info:\n");
printf("max_shader_clock = %i\n", info->max_shader_clock);
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 2c2389eaaa7..bb6984451e7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -113,6 +113,8 @@ struct radeon_info {
boolhas_sparse_vm_mappings;
boolhas_2d_tiling;
boolhas_read_registers_query;
+   boolhas_gds_ordered_append;
+   boolhas_scheduled_fence_dependency;
 
/* Shader cores. */
uint32_tr600_max_quad_pipes; /* wave size / 16 */
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c 
b/src/gallium/drivers/r600/r600_pipe_common.c
index abfa250435d..3c00ad691ac 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -355,8 +355,8 @@ static void r600_add_fence_dependency(struct 
r600_common_context *rctx,
struct radeon_winsys *ws = rctx->ws;
 
if (rctx->dma.cs)
-   ws->cs_add_fence_dependency(rctx->dma.cs, fence);
-   ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
+   ws->cs_add_fence_dependency(rctx->dma.cs, fence, 0);
+   ws->cs_add_fence_dependency(rctx->gfx.cs, fence, 0);
 }
 
 static void r600_fence_server_sync(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index aec91c8d002..c04c014bd2f 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -67,6 +67,16 @@ enum radeon_bo_flag { /* bitfield */
 RADEON_FLAG_32BIT =(1 << 6),
 };
 
+enum radeon_dependency_flag {
+/* Add the dependency to the parallel compute IB only. */
+RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0,
+
+/* Instead of waiting for a job to finish execution, the dependency will
+ * be signaled when the job starts execution.
+ */
+RADEON_DEPENDENCY_START_FENCE = 1 << 1,
+};
+
 enum radeon_bo_usage { /* bitfield */
 RADEON_USAGE_READ = 2,
 RADEON_USAGE_WRITE = 4,
@@ -486,6 +496,27 @@ struct radeon_winsys {
void *flush_ctx,
bool stop_exec_on_failure);
 
+/**
+ * Add a parallel compute IB to a gfx IB. It will share the buffer list
+ * and fence dependencies with the gfx IB. The gfx flush call will submit
+ * both IBs at the same time.
+ *
+ * The compute IB doesn't have an output fence, so the primary IB has
+ * to use a wait packet for synchronization.
+ *
+ * The returned IB is only a stream for writing packets to the new
+ * IB. Calling other winsys functions with it is not allowed, not even
+ * "cs_destroy". Use the gfx IB instead.
+ *
+ * \param cs  Gfx IB
+ * \param gds_size

[Mesa-dev] [PATCH 24/26] radeonsi: add si_vs_prolog_bits::unpack_instance_id_from_vertex_id:1

From: Marek Olšák 

The prim discard compute shader bakes InstanceID into the output index buffer.
---
 src/gallium/drivers/radeonsi/si_shader.c | 25 ++--
 src/gallium/drivers/radeonsi/si_shader.h |  1 +
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 6e3019a9f6c..ba43f0ff902 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5788,6 +5788,8 @@ static void si_dump_shader_key_vs(const struct 
si_shader_key *key,
prefix, prolog->instance_divisor_is_one);
fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
prefix, prolog->instance_divisor_is_fetched);
+   fprintf(f, "  %s.unpack_instance_id_from_vertex_id = %u\n",
+   prefix, prolog->unpack_instance_id_from_vertex_id);
fprintf(f, "  %s.ls_vgpr_fix = %u\n",
prefix, prolog->ls_vgpr_fix);
 
@@ -7214,8 +7216,21 @@ static void si_build_vs_prolog_function(struct 
si_shader_context *ctx,
}
}
 
-   ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
-   ctx->abi.instance_id = input_vgprs[first_vs_vgpr + 
(key->vs_prolog.as_ls ? 2 : 1)];
+   unsigned vertex_id_vgpr = first_vs_vgpr;
+   unsigned instance_id_vgpr = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 
1);
+
+   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
+   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
+
+   /* InstanceID = VertexID >> 16;
+* VertexID   = VertexID & 0xffff;
+*/
+   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
+   ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, 
ctx->abi.vertex_id,
+LLVMConstInt(ctx->i32, 16, 
0), "");
+   ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, 
ctx->abi.vertex_id,
+ LLVMConstInt(ctx->i32, 
0xffff, 0), "");
+   }
 
/* Copy inputs to outputs. This should be no-op, as the registers match,
 * but it will prevent the compiler from overwriting them 
unintentionally.
@@ -7227,6 +7242,12 @@ static void si_build_vs_prolog_function(struct 
si_shader_context *ctx,
}
for (i = 0; i < num_input_vgprs; i++) {
LLVMValueRef p = input_vgprs[i];
+
+   if (i == vertex_id_vgpr)
+   p = ctx->abi.vertex_id;
+   else if (i == instance_id_vgpr)
+   p = ctx->abi.instance_id;
+
p = ac_to_float(&ctx->ac, p);
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
   key->vs_prolog.num_input_sgprs + i, 
"");
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index f9f81a7bc1e..28f32be30c8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -412,6 +412,7 @@ struct si_vs_prolog_bits {
uint16_tinstance_divisor_is_one; /* bitmask of inputs */
uint16_tinstance_divisor_is_fetched; /* bitmask of inputs */
unsignedls_vgpr_fix:1;
+   unsignedunpack_instance_id_from_vertex_id:1;
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 20/26] radeonsi: allow si_shader_select_with_key to return an optimized shader or fail

From: Marek Olšák 

If a prim discard compute shader hasn't finished compilation, we don't want
to select any shader.
---
 src/gallium/drivers/radeonsi/si_state.h   |  7 
 .../drivers/radeonsi/si_state_shaders.c   | 38 +--
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index aed5ea63d8b..8f3a3224edf 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -41,6 +41,7 @@
 
 struct si_screen;
 struct si_shader;
+struct si_shader_ctx_state;
 struct si_shader_selector;
 struct si_texture;
 struct si_qbo_state;
@@ -555,6 +556,12 @@ void si_schedule_initial_compile(struct si_context *sctx, 
unsigned processor,
 void si_get_active_slot_masks(const struct tgsi_shader_info *info,
  uint32_t *const_and_shader_buffers,
  uint64_t *samplers_and_images);
+int si_shader_select_with_key(struct si_screen *sscreen,
+ struct si_shader_ctx_state *state,
+ struct si_compiler_ctx_state *compiler_state,
+ struct si_shader_key *key,
+ int thread_index,
+ bool optimized_or_none);
 
 /* si_state_draw.c */
 void si_emit_cache_flush(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 9e052e1efce..146e0f87693 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1771,12 +1771,19 @@ static bool si_check_missing_main_part(struct si_screen 
*sscreen,
return true;
 }
 
-/* Select the hw shader variant depending on the current state. */
-static int si_shader_select_with_key(struct si_screen *sscreen,
-struct si_shader_ctx_state *state,
-struct si_compiler_ctx_state 
*compiler_state,
-struct si_shader_key *key,
-int thread_index)
+/**
+ * Select a shader variant according to the shader key.
+ *
+ * \param optimized_or_none  If the key describes an optimized shader variant 
and
+ *   the compilation isn't finished, don't select any
+ *   shader and return an error.
+ */
+int si_shader_select_with_key(struct si_screen *sscreen,
+ struct si_shader_ctx_state *state,
+ struct si_compiler_ctx_state *compiler_state,
+ struct si_shader_key *key,
+ int thread_index,
+ bool optimized_or_none)
 {
struct si_shader_selector *sel = state->cso;
struct si_shader_selector *previous_stage_sel = NULL;
@@ -1792,6 +1799,9 @@ again:
   memcmp(&current->key, key, sizeof(*key)) == 0)) {
if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
if (current->is_optimized) {
+   if (optimized_or_none)
+   return -1;
+
memset(&key->opt, 0, sizeof(key->opt));
goto current_not_ready;
}
@@ -1828,6 +1838,8 @@ current_not_ready:
 * shader so as not to cause a stall due to 
compilation.
 */
if (iter->is_optimized) {
+   if (optimized_or_none)
+   return -1;
memset(&key->opt, 0, sizeof(key->opt));
goto again;
}
@@ -1869,12 +1881,13 @@ current_not_ready:
util_queue_fence_wait(&previous_stage_sel->ready);
}
 
-   /* Compile the main shader part if it doesn't exist. This can happen
-* if the initial guess was wrong. */
bool is_pure_monolithic =
sscreen->use_monolithic_shaders ||
memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
 
+   /* Compile the main shader part if it doesn't exist. This can happen
+* if the initial guess was wrong.
+*/
if (!is_pure_monolithic) {
bool ok;
 
@@ -1931,9 +1944,7 @@ current_not_ready:
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
 
/* If it's an optimized shader, compile it asynchronously. */
-   if (shader->is_optimized &&
-   !is_pure_monolithic &&
-   thread_index < 0) {
+   if (shader->is_optimized && thread_index < 0) {
/* Compile it asynchronously. */
util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,

[Mesa-dev] [PATCH 18/26] radeonsi: add threadgroups_per_cu param into si_get_compute_resource_limits

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_compute.c | 9 ++---
 src/gallium/drivers/radeonsi/si_pipe.h| 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 52a62dcb7fa..dc6f647d9a8 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -750,7 +750,8 @@ static void si_setup_tgsi_user_data(struct si_context *sctx,
 
 unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
unsigned waves_per_threadgroup,
-   unsigned max_waves_per_sh)
+   unsigned max_waves_per_sh,
+   unsigned threadgroups_per_cu)
 {
unsigned compute_resource_limits =
S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
@@ -766,7 +767,9 @@ unsigned si_get_compute_resource_limits(struct si_screen 
*sscreen,
if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
 
-   compute_resource_limits |= 
S_00B854_WAVES_PER_SH(max_waves_per_sh);
+   assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
+   compute_resource_limits |= 
S_00B854_WAVES_PER_SH(max_waves_per_sh) |
+  
S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
} else {
/* SI */
if (max_waves_per_sh) {
@@ -788,7 +791,7 @@ static void si_emit_dispatch_packets(struct si_context 
*sctx,
 
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
  si_get_compute_resource_limits(sscreen, 
waves_per_threadgroup,
-
sctx->cs_max_waves_per_sh));
+
sctx->cs_max_waves_per_sh, 1));
 
unsigned dispatch_initiator =
S_00B800_COMPUTE_SHADER_EN(1) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index e4e731e913b..330cdfa0c12 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1294,7 +1294,8 @@ unsigned si_end_counter(struct si_screen *sscreen, 
unsigned type,
 void si_emit_initial_compute_regs(struct si_context *sctx, struct 
radeon_cmdbuf *cs);
 unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
unsigned waves_per_threadgroup,
-   unsigned max_waves_per_sh);
+   unsigned max_waves_per_sh,
+   unsigned threadgroups_per_cu);
 void si_init_compute_functions(struct si_context *sctx);
 
 /* si_perfcounters.c */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 23/26] radeonsi: add helper si_get_minimum_num_gfx_cs_dwords

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_gfx_cs.c |  8 +---
 src/gallium/drivers/radeonsi/si_pipe.h   | 11 +++
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c 
b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 3d64587fa2b..cd7c921bcda 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -53,13 +53,7 @@ void si_need_gfx_cs_space(struct si_context *ctx)
ctx->gtt = 0;
ctx->vram = 0;
 
-   /* If the IB is sufficiently large, don't count the space needed
-* and just flush if there is not enough space left.
-*
-* Also reserve space for stopping queries at the end of IB, because
-* the number of active queries is mostly unlimited.
-*/
-   unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
+   unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
if (!ctx->ws->cs_check_space(cs, need_dwords))
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, 
NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index c96adbb84bf..95636dd804e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1426,6 +1426,17 @@ si_tile_mode_index(struct si_texture *tex, unsigned 
level, bool stencil)
return tex->surface.u.legacy.tiling_index[level];
 }
 
+static inline unsigned
+si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
+{
+   /* Don't count the needed CS space exactly and just use an upper bound.
+*
+* Also reserve space for stopping queries at the end of IB, because
+* the number of active queries is unlimited in theory.
+*/
+   return 2048 + sctx->num_cs_dw_queries_suspend;
+}
+
 static inline void
 si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
 {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 22/26] radeonsi: add a cs parameter into si_cp_copy_data

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_compute.c | 2 +-
 src/gallium/drivers/radeonsi/si_cp_dma.c  | 9 -
 src/gallium/drivers/radeonsi/si_perfcounter.c | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.h| 2 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  | 2 +-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index dc6f647d9a8..0d622db91b0 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -721,7 +721,7 @@ static void si_setup_tgsi_user_data(struct si_context *sctx,
if (info->indirect) {
if (program->uses_grid_size) {
for (unsigned i = 0; i < 3; ++i) {
-   si_cp_copy_data(sctx,
+   si_cp_copy_data(sctx, sctx->gfx_cs,
COPY_DATA_REG, NULL, 
(grid_size_reg >> 2) + i,
COPY_DATA_SRC_MEM, 
si_resource(info->indirect),
info->indirect_offset + 4 * i);
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 4e7a89b77b9..2048d52cd3c 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -607,18 +607,17 @@ void si_cp_write_data(struct si_context *sctx, struct 
si_resource *buf,
radeon_emit_array(cs, (const uint32_t*)data, size/4);
 }
 
-void si_cp_copy_data(struct si_context *sctx,
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
 unsigned dst_sel, struct si_resource *dst, unsigned 
dst_offset,
 unsigned src_sel, struct si_resource *src, unsigned 
src_offset)
 {
-   struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
+   /* cs can point to the compute IB, which has the buffer list in gfx_cs. 
*/
if (dst) {
-   radeon_add_to_buffer_list(sctx, cs, dst,
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst,
  RADEON_USAGE_WRITE, 
RADEON_PRIO_CP_DMA);
}
if (src) {
-   radeon_add_to_buffer_list(sctx, cs, src,
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src,
  RADEON_USAGE_READ, 
RADEON_PRIO_CP_DMA);
}
 
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c 
b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 7cf3b86c73a..2c3e410daa6 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -671,7 +671,7 @@ static void si_pc_emit_start(struct si_context *sctx,
 {
struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-   si_cp_copy_data(sctx,
+   si_cp_copy_data(sctx, sctx->gfx_cs,
COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
COPY_DATA_IMM, NULL, 1);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index e1f7588da41..c96adbb84bf 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1228,7 +1228,7 @@ void si_test_gds(struct si_context *sctx);
 void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
  unsigned offset, unsigned size, unsigned dst_sel,
  unsigned engine, const void *data);
-void si_cp_copy_data(struct si_context *sctx,
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
 unsigned dst_sel, struct si_resource *dst, unsigned 
dst_offset,
 unsigned src_sel, struct si_resource *src, unsigned 
src_offset);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 3d00d5aef0e..8836574130f 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -691,7 +691,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 
radeon_set_context_reg(cs, 
R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
   t->stride_in_dw);
-   si_cp_copy_data(sctx,
+   si_cp_copy_data(sctx, sctx->gfx_cs,
COPY_DATA_REG, NULL,

R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2,
COPY_DATA_SRC_MEM, t->buf_filled_size,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 25/26] ac: add LLVM code for triangle culling

From: Marek Olšák 

---
 src/amd/Makefile.sources  |   2 +
 src/amd/common/ac_llvm_cull.c | 275 ++
 src/amd/common/ac_llvm_cull.h |  59 
 src/amd/common/meson.build|   2 +
 4 files changed, 338 insertions(+)
 create mode 100644 src/amd/common/ac_llvm_cull.c
 create mode 100644 src/amd/common/ac_llvm_cull.h

diff --git a/src/amd/Makefile.sources b/src/amd/Makefile.sources
index 58e0008ee62..e1557ff5365 100644
--- a/src/amd/Makefile.sources
+++ b/src/amd/Makefile.sources
@@ -39,6 +39,8 @@ AMD_COMPILER_FILES = \
common/ac_exp_param.h \
common/ac_llvm_build.c \
common/ac_llvm_build.h \
+   common/ac_llvm_cull.c \
+   common/ac_llvm_cull.h \
common/ac_llvm_helper.cpp \
common/ac_llvm_util.c \
common/ac_llvm_util.h \
diff --git a/src/amd/common/ac_llvm_cull.c b/src/amd/common/ac_llvm_cull.c
new file mode 100644
index 000..1c2da3e0418
--- /dev/null
+++ b/src/amd/common/ac_llvm_cull.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ */
+
+#include "ac_llvm_cull.h"
+#include <llvm-c/Core.h>
+
+struct ac_position_w_info {
+   /* If a primitive intersects the W=0 plane, it causes a reflection
+* of the determinant used for face culling. Every vertex behind
+* the W=0 plane negates the determinant, so having 2 vertices behind
+* the plane has no effect. This is i1 true if the determinant should be
+* negated.
+*/
+   LLVMValueRef w_reflection;
+
+   /* If we simplify the "-w <= p <= w" view culling equation, we get
+* "-w <= w", which can't be satisfied when w is negative.
+* In perspective projection, a negative W means that the primitive
+* is behind the viewer, but the equation is independent of the type
+* of projection.
+*
+* w_accepted is false when all W are negative and therefore
+* the primitive is invisible.
+*/
+   LLVMValueRef w_accepted;
+
+   LLVMValueRef all_w_positive;
+   LLVMValueRef any_w_negative;
+};
+
+static void ac_analyze_position_w(struct ac_llvm_context *ctx,
+ LLVMValueRef pos[3][4],
+ struct ac_position_w_info *w)
+{
+   LLVMBuilderRef builder = ctx->builder;
+   LLVMValueRef all_w_negative = ctx->i1true;
+
+   w->w_reflection = ctx->i1false;
+   w->any_w_negative = ctx->i1false;
+
+   for (unsigned i = 0; i < 3; i++) {
+   LLVMValueRef neg_w;
+
+   neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], 
ctx->f32_0, "");
+   /* If neg_w is true, negate w_reflection. */
+   w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, 
"");
+   w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, 
neg_w, "");
+   all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, 
"");
+   }
+   w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, "");
+   w->w_accepted = LLVMBuildNot(builder, all_w_negative, "");
+}
+
+/* Perform front/back face culling and return true if the primitive is 
accepted. */
+static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx,
+LLVMValueRef pos[3][4],
+struct ac_position_w_info *w,
+bool cull_front,
+bool cull_back,
+bool cull_zero_area)
+{
+   LLVMBuilderRef builder = ctx->builder;
+
+   if (cull_front && cull_back)
+   return ctx->i1false;
+
+   if (!cull_front && !cull_back && !cull_zero_area)
+   return ctx->i1true;
+
+   /* Front/back face culling. Also if the determinant == 0, the triangle
+

[Mesa-dev] [PATCH 07/26] winsys/amdgpu: reorder chunks, make BO_HANDLES first, IB and FENCE last

From: Marek Olšák 

---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 36 +++
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 319741f7d0b..4a588d52930 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1367,20 +1367,11 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
   struct drm_amdgpu_cs_chunk chunks[6];
   unsigned num_chunks = 0;
 
-  /* Convert from dwords to bytes. */
-  cs->ib[IB_MAIN].ib_bytes *= 4;
-
-  /* IB */
-  chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
-  chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
-  chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
-  num_chunks++;
-
-  /* Fence */
-  if (has_user_fence) {
- chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
- chunks[num_chunks].length_dw = sizeof(struct 
drm_amdgpu_cs_chunk_fence) / 4;
- chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
+  /* BO list */
+  if (!use_bo_list_create) {
+ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
+ chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 
4;
+ chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
  num_chunks++;
   }
 
@@ -1448,14 +1439,21 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
  num_chunks++;
   }
 
-  /* BO list */
-  if (!use_bo_list_create) {
- chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
- chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 
4;
- chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
+  /* Fence */
+  if (has_user_fence) {
+ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
+ chunks[num_chunks].length_dw = sizeof(struct 
drm_amdgpu_cs_chunk_fence) / 4;
+ chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
  num_chunks++;
   }
 
+  /* IB */
+  cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
+  chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
+  chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
+  chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
+  num_chunks++;
+
   assert(num_chunks <= ARRAY_SIZE(chunks));
 
   r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 12/26] radeonsi: return the last part's return value from @wrapper

From: Marek Olšák 

The primitive discard compute shader will get the position output this way.
---
 src/gallium/drivers/radeonsi/si_shader.c | 29 +---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index efae02ee91c..b376a14a2fc 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -6514,7 +6514,26 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
gprs += size;
}
 
-   si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
+   /* Prepare the return type. */
+   unsigned num_returns = 0;
+   LLVMTypeRef returns[32], last_func_type, return_type;
+
+   last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
+   return_type = LLVMGetReturnType(last_func_type);
+
+   switch (LLVMGetTypeKind(return_type)) {
+   case LLVMStructTypeKind:
+   num_returns = LLVMCountStructElementTypes(return_type);
+   assert(num_returns <= ARRAY_SIZE(returns));
+   LLVMGetStructElementTypes(return_type, returns);
+   break;
+   case LLVMVoidTypeKind:
+   break;
+   default:
+   unreachable("unexpected type");
+   }
+
+   si_create_function(ctx, "wrapper", returns, num_returns, &fninfo,
   si_get_max_workgroup_size(ctx->shader));
 
if (is_merged_shader(ctx))
@@ -6566,9 +6585,9 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
initial_num_out_sgpr = num_out_sgpr;
 
/* Now chain the parts. */
+   LLVMValueRef ret;
for (unsigned part = 0; part < num_parts; ++part) {
LLVMValueRef in[48];
-   LLVMValueRef ret;
LLVMTypeRef ret_type;
unsigned out_idx = 0;
unsigned num_params = LLVMCountParams(parts[part]);
@@ -6680,7 +6699,11 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
}
}
 
-   LLVMBuildRetVoid(builder);
+   /* Return the value from the last part. */
+   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+   LLVMBuildRetVoid(builder);
+   else
+   LLVMBuildRet(builder, ret);
 }
 
 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 17/26] radeonsi: use pipe_draw_info::instance_count indirectly

From: Marek Olšák 

It will be modified by compute shader culling.
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 36 
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index bf091827828..ba77fb68dcc 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -494,6 +494,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context 
*sctx,
  const struct pipe_draw_info *info,
  enum pipe_prim_type prim,
  unsigned num_patches,
+ unsigned instance_count,
  bool primitive_restart)
 {
union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
@@ -509,10 +510,10 @@ static unsigned si_get_ia_multi_vgt_param(struct 
si_context *sctx,
}
 
key.u.prim = prim;
-   key.u.uses_instancing = info->indirect || info->instance_count > 1;
+   key.u.uses_instancing = info->indirect || instance_count > 1;
key.u.multi_instances_smaller_than_primgroup =
info->indirect ||
-   (info->instance_count > 1 &&
+   (instance_count > 1 &&
 (info->count_from_stream_output ||
  si_num_prims_for_vertices(info, prim) < primgroup_size));
key.u.primitive_restart = primitive_restart;
@@ -534,7 +535,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context 
*sctx,
if (sctx->family == CHIP_HAWAII &&
G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
(info->indirect ||
-(info->instance_count > 1 &&
+(instance_count > 1 &&
  (info->count_from_stream_output ||
   si_num_prims_for_vertices(info, prim) <= 1))))
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
@@ -618,6 +619,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
   const struct pipe_draw_info *info,
   enum pipe_prim_type prim,
   unsigned num_patches,
+  unsigned instance_count,
   bool primitive_restart)
 {
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -625,7 +627,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
unsigned ia_multi_vgt_param;
 
ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, 
num_patches,
-  primitive_restart);
+  instance_count, 
primitive_restart);
 
/* Draw state. */
if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
@@ -673,7 +675,8 @@ static void si_emit_draw_packets(struct si_context *sctx,
 const struct pipe_draw_info *info,
 struct pipe_resource *indexbuf,
 unsigned index_size,
-unsigned index_offset)
+unsigned index_offset,
+unsigned instance_count)
 {
struct pipe_draw_indirect_info *indirect = info->indirect;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -815,7 +818,6 @@ static void si_emit_draw_packets(struct si_context *sctx,
radeon_emit(cs, di_src_sel);
}
} else {
-   unsigned instance_count = info->instance_count;
int base_vertex;
 
if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
@@ -1204,8 +1206,8 @@ static void si_get_draw_start_count(struct si_context 
*sctx,
 }
 
 static void si_emit_all_states(struct si_context *sctx, const struct 
pipe_draw_info *info,
-  enum pipe_prim_type prim, bool primitive_restart,
-  unsigned skip_atom_mask)
+  enum pipe_prim_type prim, unsigned 
instance_count,
+  bool primitive_restart, unsigned skip_atom_mask)
 {
unsigned num_patches = 0;
/* Vega10/Raven scissor bug workaround. When any context register is
@@ -1258,7 +1260,8 @@ static void si_emit_all_states(struct si_context *sctx, 
const struct pipe_draw_i
 
/* Emit draw states. */
si_emit_vs_state(sctx, info);
-   si_emit_draw_registers(sctx, info, prim, num_patches, 
primitive_restart);
+   si_emit_draw_registers(sctx, info, prim, num_patches, instance_count,
+  primitive_restart);
 }
 
 static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info 
*info)
@@ -1270,6 +1273,7 @@

[Mesa-dev] [PATCH 19/26] radeonsi: add a cs parameter into si_cp_release_mem

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_fence.c   | 5 ++---
 src/gallium/drivers/radeonsi/si_perfcounter.c | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.h| 2 +-
 src/gallium/drivers/radeonsi/si_query.c   | 6 +++---
 src/gallium/drivers/radeonsi/si_state_draw.c  | 4 ++--
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_fence.c 
b/src/gallium/drivers/radeonsi/si_fence.c
index e3c1e0959fd..509b22fa4e4 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -66,13 +66,12 @@ struct si_multi_fence {
  * \param old_valuePrevious fence value (for a bug workaround)
  * \param new_valueFence value to write for this event.
  */
-void si_cp_release_mem(struct si_context *ctx,
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
   unsigned event, unsigned event_flags,
   unsigned dst_sel, unsigned int_sel, unsigned data_sel,
   struct si_resource *buf, uint64_t va,
   uint32_t new_fence, unsigned query_type)
 {
-   struct radeon_cmdbuf *cs = ctx->gfx_cs;
unsigned op = EVENT_TYPE(event) |
  EVENT_INDEX(event == V_028A90_CS_DONE ||
  event == V_028A90_PS_DONE ? 6 : 5) |
@@ -269,7 +268,7 @@ static void si_fine_fence_set(struct si_context *ctx,
 
radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
  RADEON_USAGE_WRITE, 
RADEON_PRIO_QUERY);
-   si_cp_release_mem(ctx,
+   si_cp_release_mem(ctx, ctx->gfx_cs,
  V_028A90_BOTTOM_OF_PIPE_TS, 0,
  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
  EOP_DATA_SEL_VALUE_32BIT,
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c 
b/src/gallium/drivers/radeonsi/si_perfcounter.c
index c15c444cc40..7cf3b86c73a 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -690,7 +690,7 @@ static void si_pc_emit_stop(struct si_context *sctx,
 {
struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-   si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
  EOP_DATA_SEL_VALUE_32BIT,
  buffer, va, 0, SI_NOT_QUERY);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 330cdfa0c12..e1f7588da41 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1262,7 +1262,7 @@ void si_screen_clear_buffer(struct si_screen *sscreen, 
struct pipe_resource *dst
uint64_t offset, uint64_t size, unsigned value);
 
 /* si_fence.c */
-void si_cp_release_mem(struct si_context *ctx,
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
   unsigned event, unsigned event_flags,
   unsigned dst_sel, unsigned int_sel, unsigned data_sel,
   struct si_resource *buf, uint64_t va,
diff --git a/src/gallium/drivers/radeonsi/si_query.c 
b/src/gallium/drivers/radeonsi/si_query.c
index c115e7787b2..59039875c4d 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -826,7 +826,7 @@ static void si_query_hw_do_emit_start(struct si_context 
*sctx,
emit_sample_streamout(cs, va + 32 * stream, stream);
break;
case PIPE_QUERY_TIME_ELAPSED:
-   si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
  EOP_DATA_SEL_TIMESTAMP, NULL, va,
  0, query->b.type);
@@ -902,7 +902,7 @@ static void si_query_hw_do_emit_stop(struct si_context 
*sctx,
va += 8;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
-   si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
  EOP_DATA_SEL_TIMESTAMP, NULL, va,
  0, query->b.type);
@@ -927,7 +927,7 @@ static void si_query_hw_do_emit_stop(struct si_context 
*sctx,
  RADEON_PRIO_QUERY);
 
if (fence_va) {
-   si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,

[Mesa-dev] [PATCH 21/26] radeonsi: make some functions non-static

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_state.c   | 20 ++-
 src/gallium/drivers/radeonsi/si_state.h   | 15 ++
 .../drivers/radeonsi/si_state_shaders.c   |  8 
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index b49a1b3695e..5a9597bdd8d 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4399,21 +4399,13 @@ static void si_delete_sampler_state(struct pipe_context 
*ctx, void *state)
  * Vertex elements & buffers
  */
 
-struct util_fast_udiv_info32 {
-   unsigned multiplier; /* the "magic number" multiplier */
-   unsigned pre_shift; /* shift for the dividend before multiplying */
-   unsigned post_shift; /* shift for the dividend after multiplying */
-   int increment; /* 0 or 1; if set then increment the numerator, using one of
- the two strategies */
-};
-
-static struct util_fast_udiv_info32
-util_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+struct si_fast_udiv_info32
+si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
 {
struct util_fast_udiv_info info =
util_compute_fast_udiv_info(D, num_bits, 32);
 
-   struct util_fast_udiv_info32 result = {
+   struct si_fast_udiv_info32 result = {
info.multiplier,
info.pre_shift,
info.post_shift,
@@ -4429,8 +4421,8 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
struct si_screen *sscreen = (struct si_screen*)ctx->screen;
struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
bool used[SI_NUM_VERTEX_BUFFERS] = {};
-   struct util_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
-   STATIC_ASSERT(sizeof(struct util_fast_udiv_info32) == 16);
+   struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+   STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
@@ -4466,7 +4458,7 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
} else {
v->instance_divisor_is_fetched |= 1u << i;
divisor_factors[i] =
-   
util_compute_fast_udiv_info32(instance_divisor, 32);
+   
si_compute_fast_udiv_info32(instance_divisor, 32);
}
}
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 8f3a3224edf..b3ed1517d85 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -535,6 +535,17 @@ void si_save_qbo_state(struct si_context *sctx, struct 
si_qbo_state *st);
 void si_set_occlusion_query_state(struct si_context *sctx,
  bool old_perfect_enable);
 
+struct si_fast_udiv_info32 {
+   unsigned multiplier; /* the "magic number" multiplier */
+   unsigned pre_shift; /* shift for the dividend before multiplying */
+   unsigned post_shift; /* shift for the dividend after multiplying */
+   int increment; /* 0 or 1; if set then increment the numerator, using one of
+ the two strategies */
+};
+
+struct si_fast_udiv_info32
+si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits);
+
 /* si_state_binning.c */
 void si_emit_dpbb_state(struct si_context *sctx);
 
@@ -562,6 +573,10 @@ int si_shader_select_with_key(struct si_screen *sscreen,
  struct si_shader_key *key,
  int thread_index,
  bool optimized_or_none);
+void si_shader_selector_key_vs(struct si_context *sctx,
+  struct si_shader_selector *vs,
+  struct si_shader_key *key,
+  struct si_vs_prolog_bits *prolog_key);
 
 /* si_state_draw.c */
 void si_emit_cache_flush(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 146e0f87693..80d4f7222da 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1370,10 +1370,10 @@ static unsigned si_get_alpha_test_func(struct 
si_context *sctx)
return PIPE_FUNC_ALWAYS;
 }
 
-static void si_shader_selector_key_vs(struct si_context *sctx,
- struct si_shader_selector *vs,
- struct si_shader_key *key,
- struct si_vs_prolog_bits *prolog_key)
+void si_shader_selector_key_vs(struct si_context *sctx,
+

[Mesa-dev] [PATCH 16/26] radeonsi: make functions for creating LLVM functions non-static

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_shader.c  | 30 +--
 .../drivers/radeonsi/si_shader_internal.h | 25 
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index b376a14a2fc..6e3019a9f6c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -51,22 +51,6 @@ struct si_shader_output_values
ubyte vertex_stream[4];
 };
 
-/**
- * Used to collect types and other info about arguments of the LLVM function
- * before the function is created.
- */
-struct si_function_info {
-   LLVMTypeRef types[100];
-   LLVMValueRef *assign[100];
-   unsigned num_sgpr_params;
-   unsigned num_params;
-};
-
-enum si_arg_regfile {
-   ARG_SGPR,
-   ARG_VGPR
-};
-
 static void si_init_shader_ctx(struct si_shader_context *ctx,
   struct si_screen *sscreen,
   struct ac_llvm_compiler *compiler);
@@ -114,13 +98,13 @@ static bool is_merged_shader(struct si_shader_context *ctx)
   ctx->type == PIPE_SHADER_GEOMETRY;
 }
 
-static void si_init_function_info(struct si_function_info *fninfo)
+void si_init_function_info(struct si_function_info *fninfo)
 {
fninfo->num_params = 0;
fninfo->num_sgpr_params = 0;
 }
 
-static unsigned add_arg_assign(struct si_function_info *fninfo,
+unsigned add_arg_assign(struct si_function_info *fninfo,
enum si_arg_regfile regfile, LLVMTypeRef type,
LLVMValueRef *assign)
 {
@@ -4352,11 +4336,11 @@ static void si_llvm_emit_barrier(const struct 
lp_build_tgsi_action *action,
ac_build_s_barrier(&ctx->ac);
 }
 
-static void si_create_function(struct si_shader_context *ctx,
-  const char *name,
-  LLVMTypeRef *returns, unsigned num_returns,
-  struct si_function_info *fninfo,
-  unsigned max_workgroup_size)
+void si_create_function(struct si_shader_context *ctx,
+   const char *name,
+   LLVMTypeRef *returns, unsigned num_returns,
+   struct si_function_info *fninfo,
+   unsigned max_workgroup_size)
 {
int i;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h 
b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 235c46ecf92..6e21bc7c26b 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -45,6 +45,22 @@ struct ac_shader_binary;
 #define RADEON_LLVM_MAX_SYSTEM_VALUES 11
 #define RADEON_LLVM_MAX_ADDRS 16
 
+enum si_arg_regfile {
+   ARG_SGPR,
+   ARG_VGPR
+};
+
+/**
+ * Used to collect types and other info about arguments of the LLVM function
+ * before the function is created.
+ */
+struct si_function_info {
+   LLVMTypeRef types[100];
+   LLVMValueRef *assign[100];
+   unsigned num_sgpr_params;
+   unsigned num_params;
+};
+
 struct si_shader_context {
struct lp_build_tgsi_context bld_base;
struct gallivm_state gallivm;
@@ -218,6 +234,15 @@ si_shader_context_from_abi(struct ac_shader_abi *abi)
return container_of(abi, ctx, abi);
 }
 
+void si_init_function_info(struct si_function_info *fninfo);
+unsigned add_arg_assign(struct si_function_info *fninfo,
+   enum si_arg_regfile regfile, LLVMTypeRef type,
+   LLVMValueRef *assign);
+void si_create_function(struct si_shader_context *ctx,
+   const char *name,
+   LLVMTypeRef *returns, unsigned num_returns,
+   struct si_function_info *fninfo,
+   unsigned max_workgroup_size);
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 struct ac_llvm_compiler *compiler,
 struct pipe_debug_callback *debug,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 14/26] radeonsi: make si_initialize_compute reusable

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_compute.c | 14 +++---
 src/gallium/drivers/radeonsi/si_pipe.h|  1 +
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 5ec0c0a5699..52a62dcb7fa 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -316,9 +316,8 @@ static void si_set_global_binding(
}
 }
 
-static void si_initialize_compute(struct si_context *sctx)
+void si_emit_initial_compute_regs(struct si_context *sctx, struct 
radeon_cmdbuf *cs)
 {
-   struct radeon_cmdbuf *cs = sctx->gfx_cs;
uint64_t bc_va;
 
radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
@@ -362,9 +361,6 @@ static void si_initialize_compute(struct si_context *sctx)
  bc_va >> 8);
}
}
-
-   sctx->cs_shader_state.emitted_program = NULL;
-   sctx->cs_shader_state.initialized = true;
 }
 
 static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
@@ -908,8 +904,12 @@ static void si_launch_grid(
 
si_need_gfx_cs_space(sctx);
 
-   if (!sctx->cs_shader_state.initialized)
-   si_initialize_compute(sctx);
+   if (!sctx->cs_shader_state.initialized) {
+   si_emit_initial_compute_regs(sctx, sctx->gfx_cs);
+
+   sctx->cs_shader_state.emitted_program = NULL;
+   sctx->cs_shader_state.initialized = true;
+   }
 
if (sctx->flags)
si_emit_cache_flush(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 81faf4c66e8..e4e731e913b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1291,6 +1291,7 @@ unsigned si_end_counter(struct si_screen *sscreen, 
unsigned type,
uint64_t begin);
 
 /* si_compute.c */
+void si_emit_initial_compute_regs(struct si_context *sctx, struct 
radeon_cmdbuf *cs);
 unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
unsigned waves_per_threadgroup,
unsigned max_waves_per_sh);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 15/26] radeonsi: move si_*_descriptors_idx functions into si_state.h

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_descriptors.c | 14 --
 src/gallium/drivers/radeonsi/si_state.h   | 14 ++
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 21d4ca946d3..e638ee77a3f 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -220,13 +220,6 @@ si_get_sampler_view_priority(struct si_resource *res)
return RADEON_PRIO_SAMPLER_TEXTURE;
 }
 
-static unsigned
-si_sampler_and_image_descriptors_idx(unsigned shader)
-{
-   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
-}
-
 static struct si_descriptors *
 si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader)
 {
@@ -1176,13 +1169,6 @@ bool si_upload_vertex_buffer_descriptors(struct 
si_context *sctx)
 
 /* CONSTANT BUFFERS */
 
-static unsigned
-si_const_and_shader_buffer_descriptors_idx(unsigned shader)
-{
-   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
-}
-
 static struct si_descriptors *
 si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned 
shader)
 {
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 767e789276a..aed5ea63d8b 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -374,6 +374,20 @@ enum {
  PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \
  SI_NUM_SHADER_DESCS)
 
+static inline unsigned
+si_const_and_shader_buffer_descriptors_idx(unsigned shader)
+{
+   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
+}
+
+static inline unsigned
+si_sampler_and_image_descriptors_idx(unsigned shader)
+{
+   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
+}
+
 /* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 01/26] tgsi/scan: add uses_drawid

From: Marek Olšák 

---
 src/gallium/auxiliary/tgsi/tgsi_scan.c   | 3 +++
 src/gallium/auxiliary/tgsi/tgsi_scan.h   | 1 +
 src/gallium/drivers/radeonsi/si_shader_nir.c | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c 
b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index bfb415e439a..4cb1a3152c3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -683,6 +683,9 @@ scan_declaration(struct tgsi_shader_info *info,
  case TGSI_SEMANTIC_BASEVERTEX:
 info->uses_basevertex = TRUE;
 break;
+ case TGSI_SEMANTIC_DRAWID:
+info->uses_drawid = TRUE;
+break;
  case TGSI_SEMANTIC_PRIMID:
 info->uses_primid = TRUE;
 break;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h 
b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 64f2598a259..580c73a2814 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -118,6 +118,7 @@ struct tgsi_shader_info
boolean uses_vertexid;
boolean uses_vertexid_nobase;
boolean uses_basevertex;
+   boolean uses_drawid;
boolean uses_primid;
boolean uses_frontface;
boolean uses_invocationid;
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c 
b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 7554f5b9f8b..d81fb5449bf 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -133,6 +133,9 @@ static void scan_instruction(struct tgsi_shader_info *info,
case nir_intrinsic_load_base_vertex:
info->uses_basevertex = 1;
break;
+   case nir_intrinsic_load_draw_id:
+   info->uses_drawid = 1;
+   break;
case nir_intrinsic_load_primitive_id:
info->uses_primid = 1;
break;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 04/26] ac: add ac_get_i1_sgpr_mask

From: Marek Olšák 

---
 src/amd/common/ac_llvm_build.c | 16 
 src/amd/common/ac_llvm_build.h |  2 ++
 2 files changed, 18 insertions(+)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 9395bd1bbda..7adca596943 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -453,6 +453,22 @@ ac_build_ballot(struct ac_llvm_context *ctx,
  AC_FUNC_ATTR_CONVERGENT);
 }
 
+LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
+LLVMValueRef value)
+{
+   LLVMValueRef args[3] = {
+   value,
+   ctx->i1false,
+   LLVMConstInt(ctx->i32, LLVMIntNE, 0),
+   };
+
+   assert(HAVE_LLVM >= 0x0800);
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3,
+ AC_FUNC_ATTR_NOUNWIND |
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+}
+
 LLVMValueRef
 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index f218eaf2832..6dd15dda3e8 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -149,6 +149,8 @@ void ac_build_optimization_barrier(struct ac_llvm_context 
*ctx,
 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx);
 
 LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value);
+LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
+LLVMValueRef value);
 
 LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef 
value);
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 10/26] radeonsi: add si_cp_copy_data

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_compute.c | 21 --
 src/gallium/drivers/radeonsi/si_cp_dma.c  | 28 +++
 src/gallium/drivers/radeonsi/si_perfcounter.c | 13 ++---
 src/gallium/drivers/radeonsi/si_pipe.h|  3 ++
 src/gallium/drivers/radeonsi/si_state_draw.c  | 20 -
 5 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 1a62b3e0844..42caac66884 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -724,22 +724,11 @@ static void si_setup_tgsi_user_data(struct si_context 
*sctx,
 
if (info->indirect) {
if (program->uses_grid_size) {
-   uint64_t base_va = 
si_resource(info->indirect)->gpu_address;
-   uint64_t va = base_va + info->indirect_offset;
-   int i;
-
-   radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-si_resource(info->indirect),
-RADEON_USAGE_READ, 
RADEON_PRIO_DRAW_INDIRECT);
-
-   for (i = 0; i < 3; ++i) {
-   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-   radeon_emit(cs, 
COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
-   
COPY_DATA_DST_SEL(COPY_DATA_REG));
-   radeon_emit(cs, (va + 4 * i));
-   radeon_emit(cs, (va + 4 * i) >> 32);
-   radeon_emit(cs, (grid_size_reg >> 2) + i);
-   radeon_emit(cs, 0);
+   for (unsigned i = 0; i < 3; ++i) {
+   si_cp_copy_data(sctx,
+   COPY_DATA_REG, NULL, 
(grid_size_reg >> 2) + i,
+   COPY_DATA_SRC_MEM, 
si_resource(info->indirect),
+   info->indirect_offset + 4 * i);
}
}
} else {
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 5993369d2da..4e7a89b77b9 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -606,3 +606,31 @@ void si_cp_write_data(struct si_context *sctx, struct 
si_resource *buf,
radeon_emit(cs, va >> 32);
radeon_emit_array(cs, (const uint32_t*)data, size/4);
 }
+
+void si_cp_copy_data(struct si_context *sctx,
+unsigned dst_sel, struct si_resource *dst, unsigned 
dst_offset,
+unsigned src_sel, struct si_resource *src, unsigned 
src_offset)
+{
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   if (dst) {
+   radeon_add_to_buffer_list(sctx, cs, dst,
+ RADEON_USAGE_WRITE, 
RADEON_PRIO_CP_DMA);
+   }
+   if (src) {
+   radeon_add_to_buffer_list(sctx, cs, src,
+ RADEON_USAGE_READ, 
RADEON_PRIO_CP_DMA);
+   }
+
+   uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
+   uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
+
+   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+   radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) |
+   COPY_DATA_DST_SEL(dst_sel) |
+   COPY_DATA_WR_CONFIRM);
+   radeon_emit(cs, src_va);
+   radeon_emit(cs, src_va >> 32);
+   radeon_emit(cs, dst_va);
+   radeon_emit(cs, dst_va >> 32);
+}
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c 
b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 4ce71f9500d..c15c444cc40 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -671,16 +671,9 @@ static void si_pc_emit_start(struct si_context *sctx,
 {
struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
- RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
-
-   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
-   COPY_DATA_DST_SEL(COPY_DATA_DST_MEM));
-   radeon_emit(cs, 1); /* immediate */
-   radeon_emit(cs, 0); /* unused */
-   radeon_emit(cs, va);
-   radeon_emit(cs, va >> 32);
+   si_cp_copy_data(sctx,
+   COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
+   COPY_DATA_IMM, NULL, 1);
 
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
   
S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h

[Mesa-dev] [PATCH 09/26] winsys/amdgpu: retry after ENOMEM to work around a GDS memory management bug

From: Marek Olšák 

If GDS/GWS/OA resources are being used by IBs that are currently busy,
the kernel driver returns -ENOMEM instead of waiting until those resources
are idle.
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 1438b1ffe76..edb90c1a734 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1596,8 +1596,13 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
  chunks[num_chunks].chunk_data = 
(uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
  num_chunks++;
 
- r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
-   num_chunks, chunks, NULL);
+ /* The memory manager can return -ENOMEM for GDS when all GDS 
resources
+  * are busy. The workaround is to wait until they are idle.
+  */
+ while ((r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
+   num_chunks, chunks, NULL)) == 
-ENOMEM)
+usleep(2000);
+
  if (r)
 goto finalize;
 
@@ -1643,8 +1648,12 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
 
   assert(num_chunks <= ARRAY_SIZE(chunks));
 
-  r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
-num_chunks, chunks, &seq_no);
+  /* The memory manager can return -ENOMEM for GDS when all GDS resources
+   * are busy. The workaround is to wait until they are idle.
+   */
+  while ((r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
+num_chunks, chunks, &seq_no)) == 
-ENOMEM)
+ usleep(2000);
}
 finalize:
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 13/26] radeonsi: extract COMPUTE_RESOURCE_LIMITS code into a helper

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_compute.c | 32 ++-
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +++
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 42caac66884..5ec0c0a5699 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -752,18 +752,14 @@ static void si_setup_tgsi_user_data(struct si_context 
*sctx,
}
 }
 
-static void si_emit_dispatch_packets(struct si_context *sctx,
- const struct pipe_grid_info *info)
+unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
+   unsigned waves_per_threadgroup,
+   unsigned max_waves_per_sh)
 {
-   struct si_screen *sscreen = sctx->screen;
-   struct radeon_cmdbuf *cs = sctx->gfx_cs;
-   bool render_cond_bit = sctx->render_cond && 
!sctx->render_cond_force_off;
-   unsigned waves_per_threadgroup =
-   DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 
64);
unsigned compute_resource_limits =
S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
 
-   if (sctx->chip_class >= CIK) {
+   if (sscreen->info.chip_class >= CIK) {
unsigned num_cu_per_se = sscreen->info.num_good_compute_units /
 sscreen->info.max_se;
 
@@ -774,17 +770,29 @@ static void si_emit_dispatch_packets(struct si_context 
*sctx,
if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
 
-   compute_resource_limits |= 
S_00B854_WAVES_PER_SH(sctx->cs_max_waves_per_sh);
+   compute_resource_limits |= 
S_00B854_WAVES_PER_SH(max_waves_per_sh);
} else {
/* SI */
-   if (sctx->cs_max_waves_per_sh) {
-   unsigned limit_div16 = 
DIV_ROUND_UP(sctx->cs_max_waves_per_sh, 16);
+   if (max_waves_per_sh) {
+   unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 
16);
compute_resource_limits |= 
S_00B854_WAVES_PER_SH_SI(limit_div16);
}
}
+   return compute_resource_limits;
+}
+
+static void si_emit_dispatch_packets(struct si_context *sctx,
+ const struct pipe_grid_info *info)
+{
+   struct si_screen *sscreen = sctx->screen;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   bool render_cond_bit = sctx->render_cond && 
!sctx->render_cond_force_off;
+   unsigned waves_per_threadgroup =
+   DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 
64);
 
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
- compute_resource_limits);
+ si_get_compute_resource_limits(sscreen, 
waves_per_threadgroup,
+
sctx->cs_max_waves_per_sh));
 
unsigned dispatch_initiator =
S_00B800_COMPUTE_SHADER_EN(1) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index bd39e13b381..81faf4c66e8 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1291,6 +1291,9 @@ unsigned si_end_counter(struct si_screen *sscreen, 
unsigned type,
uint64_t begin);
 
 /* si_compute.c */
+unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
+   unsigned waves_per_threadgroup,
+   unsigned max_waves_per_sh);
 void si_init_compute_functions(struct si_context *sctx);
 
 /* si_perfcounters.c */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 06/26] winsys/amdgpu: make IBs writable and expose their address

From: Marek Olšák 

---
 src/gallium/drivers/radeon/radeon_winsys.h | 1 +
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c  | 5 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 822d2e88ee2..aec91c8d002 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -193,6 +193,7 @@ struct radeon_cmdbuf {
 /* Memory usage of the buffer list. These are always 0 for preamble IBs. */
 uint64_t  used_vram;
 uint64_t  used_gart;
+uint64_t  gpu_address;
 };
 
 /* Tiling info for display code, DRI sharing, and other data. */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index dd5193c003d..319741f7d0b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -692,10 +692,11 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys 
*ws, struct amdgpu_ib *ib,
ws->info.gart_page_size,
RADEON_DOMAIN_GTT,
RADEON_FLAG_NO_INTERPROCESS_SHARING |
+   RADEON_FLAG_32BIT |
(ring_type == RING_GFX ||
 ring_type == RING_COMPUTE ||
 ring_type == RING_DMA ?
-   RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC 
: 0));
+   RADEON_FLAG_GTT_WC : 0));
if (!pb)
   return false;
 
@@ -789,6 +790,7 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
struct amdgpu_cs *cs,
ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
ib->base.current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
assert(ib->base.current.max_dw >= ib->max_check_space_size / 4);
+   ib->base.gpu_address = info->va_start;
return true;
 }
 
@@ -1060,6 +1062,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf 
*rcs, unsigned dw)
ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
ib->base.current.max_dw = ib->big_ib_buffer->size / 4 - cs_epilog_dw;
assert(ib->base.current.max_dw >= ib->max_check_space_size / 4);
+   ib->base.gpu_address = va;
 
amdgpu_cs_add_buffer(>main.base, ib->big_ib_buffer,
 RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 11/26] radeonsi: use pipe_draw_info::prim and primitive_restart indirectly

From: Marek Olšák 

so that the fields can be changed by the driver.
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 73 +++-
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2feb504fb42..bf091827828 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -310,9 +310,10 @@ static bool si_emit_derived_tess_state(struct si_context 
*sctx,
return false;
 }
 
-static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info,
+ enum pipe_prim_type prim)
 {
-   switch (info->mode) {
+   switch (prim) {
case PIPE_PRIM_PATCHES:
return info->count / info->vertices_per_patch;
case PIPE_PRIM_POLYGON:
@@ -320,7 +321,7 @@ static unsigned si_num_prims_for_vertices(const struct 
pipe_draw_info *info)
case SI_PRIM_RECTANGLE_LIST:
return info->count / 3;
default:
-   return u_decomposed_prims_for_vertices(info->mode, info->count);
+   return u_decomposed_prims_for_vertices(prim, info->count);
}
 }
 
@@ -491,7 +492,9 @@ static void si_init_ia_multi_vgt_param_table(struct 
si_context *sctx)
 
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
  const struct pipe_draw_info *info,
- unsigned num_patches)
+ enum pipe_prim_type prim,
+ unsigned num_patches,
+ bool primitive_restart)
 {
union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
unsigned primgroup_size;
@@ -505,14 +508,14 @@ static unsigned si_get_ia_multi_vgt_param(struct 
si_context *sctx,
primgroup_size = 128; /* recommended without a GS and tess */
}
 
-   key.u.prim = info->mode;
+   key.u.prim = prim;
key.u.uses_instancing = info->indirect || info->instance_count > 1;
key.u.multi_instances_smaller_than_primgroup =
info->indirect ||
(info->instance_count > 1 &&
 (info->count_from_stream_output ||
- si_num_prims_for_vertices(info) < primgroup_size));
-   key.u.primitive_restart = info->primitive_restart;
+ si_num_prims_for_vertices(info, prim) < primgroup_size));
+   key.u.primitive_restart = primitive_restart;
key.u.count_from_stream_output = info->count_from_stream_output != NULL;
 
ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] |
@@ -533,7 +536,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context 
*sctx,
(info->indirect ||
 (info->instance_count > 1 &&
  (info->count_from_stream_output ||
-  si_num_prims_for_vertices(info) <= 1
+  si_num_prims_for_vertices(info, prim) <= 1
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
}
 
@@ -603,22 +606,26 @@ static void si_emit_vs_state(struct si_context *sctx,
 }
 
 static inline bool si_prim_restart_index_changed(struct si_context *sctx,
-const struct pipe_draw_info 
*info)
+bool primitive_restart,
+unsigned restart_index)
 {
-   return info->primitive_restart &&
-  (info->restart_index != sctx->last_restart_index ||
+   return primitive_restart &&
+  (restart_index != sctx->last_restart_index ||
sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
 }
 
 static void si_emit_draw_registers(struct si_context *sctx,
   const struct pipe_draw_info *info,
-  unsigned num_patches)
+  enum pipe_prim_type prim,
+  unsigned num_patches,
+  bool primitive_restart)
 {
struct radeon_cmdbuf *cs = sctx->gfx_cs;
-   unsigned prim = si_conv_pipe_prim(info->mode);
+   unsigned vgt_prim = si_conv_pipe_prim(prim);
unsigned ia_multi_vgt_param;
 
-   ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
+   ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, 
num_patches,
+  primitive_restart);
 
/* Draw state. */
if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
@@ -633,29 +640,29 @@ static void si_emit_draw_registers(struct si_context 
*sctx,
 
sctx->last_multi_vgt_param =

[Mesa-dev] [PATCH 05/26] ac: add REWIND and GDS registers to register headers

From: Marek Olšák 

---
 src/amd/common/sid.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index 5c8eee0124d..3c0b7001d2b 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -206,6 +206,7 @@
 #define PKT3_RELEASE_MEM   0x49 /* GFX9+ [any ring] or 
GFX8 [compute ring only] */
 #define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */
 #define PKT3_ACQUIRE_MEM   0x58 /* new for CIK */
+#define PKT3_REWIND0x59 /* VI+ [any ring] or CIK 
[compute ring only] */
 #define PKT3_SET_CONFIG_REG0x68
 #define PKT3_SET_CONTEXT_REG   0x69
 #define PKT3_SET_SH_REG0x76
@@ -2623,6 +2624,21 @@
 #define   S_030FFC_COUNT_HI(x)
(((unsigned)(x) & 0x7FFF) << 0)
 #define   G_030FFC_COUNT_HI(x)(((x) >> 
0) & 0x7FFF)
 #define   C_030FFC_COUNT_HI   
0x8000
+#define R_031074_GDS_OA_CNTL   0x031074
+#define   S_031074_INDEX(x)   
(((unsigned)(x) & 0xF) << 0)
+#define R_031078_GDS_OA_COUNTER
0x031078
+#define   S_031078_SPACE_AVAILABLE(x) 
(((unsigned)(x) & 0x) << 0)
+#define R_03107C_GDS_OA_ADDRESS
0x03107C
+#define   S_03107C_DS_ADDRESS(x)  
(((unsigned)(x) & 0x) << 0)
+#define   S_03107C_CRAWLER(x) 
(((unsigned)(x) & 0xF) << 16)
+#define   S_03107C_CRAWLER_TYPE(x)
(((unsigned)(x) & 0x3) << 20)
+#define   S_03107C_NO_ALLOC(x)
(((unsigned)(x) & 0x1) << 30)
+#define   S_03107C_ENABLE(x)  
(((unsigned)(x) & 0x1) << 31)
+#define R_031080_GDS_OA_INCDEC 0x031080
+#define   S_031080_VALUE(x)   
(((unsigned)(x) & 0x7FFF) << 0)
+#define   S_031080_INCDEC(x)  
(((unsigned)(x) & 0x1) << 31)
+#define R_031084_GDS_OA_RING_SIZE  0x031084
+#define   S_031084_RING_SIZE(x)   
(((unsigned)(x) & 0x) << 0)
 #define R_009100_SPI_CONFIG_CNTL
0x009100
 #define   S_009100_GPR_WRITE_PRIORITY(x)  
(((unsigned)(x) & 0x1F) << 0)
 #define   G_009100_GPR_WRITE_PRIORITY(x)  (((x) >> 
0) & 0x1F)
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 02/26] ac: add radeon_info::marketing_name, replacing the winsys callback

From: Marek Olšák 

---
 src/amd/common/ac_gpu_info.c  |  2 ++
 src/amd/common/ac_gpu_info.h  |  1 +
 src/gallium/drivers/r600/r600_pipe_common.c   | 13 +
 src/gallium/drivers/radeon/radeon_winsys.h|  2 --
 src/gallium/drivers/radeonsi/si_get.c | 15 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c |  7 ---
 6 files changed, 7 insertions(+), 33 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index fc8c6a09d2f..4e3190015e7 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -333,6 +333,8 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
return false;
}
 
+   info->marketing_name = amdgpu_get_marketing_name(dev);
+
/* Set which chips have dedicated VRAM. */
info->has_dedicated_vram =
!(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index b1ef9c53734..8971d51dfd7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -48,6 +48,7 @@ struct radeon_info {
 
/* Device info. */
const char  *name;
+   const char  *marketing_name;
uint32_tpci_id;
enum radeon_family  family;
enum chip_class chip_class;
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c 
b/src/gallium/drivers/r600/r600_pipe_common.c
index 19ba09ae82a..abfa250435d 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -809,13 +809,6 @@ static const char* r600_get_device_vendor(struct 
pipe_screen* pscreen)
return "AMD";
 }
 
-static const char *r600_get_marketing_name(struct radeon_winsys *ws)
-{
-   if (!ws->get_chip_name)
-   return NULL;
-   return ws->get_chip_name(ws);
-}
-
 static const char *r600_get_family_name(const struct r600_common_screen 
*rscreen)
 {
switch (rscreen->info.family) {
@@ -1278,11 +1271,7 @@ bool r600_common_screen_init(struct r600_common_screen 
*rscreen,
ws->query_info(ws, >info);
rscreen->ws = ws;
 
-   if ((chip_name = r600_get_marketing_name(ws)))
-   snprintf(family_name, sizeof(family_name), "%s / ",
-r600_get_family_name(rscreen) + 4);
-   else
-   chip_name = r600_get_family_name(rscreen);
+   chip_name = r600_get_family_name(rscreen);
 
if (uname(_data) == 0)
snprintf(kernel_version, sizeof(kernel_version),
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 82feef39487..822d2e88ee2 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -677,8 +677,6 @@ struct radeon_winsys {
 
 bool (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
unsigned num_registers, uint32_t *out);
-
-const char* (*get_chip_name)(struct radeon_winsys *ws);
 };
 
 static inline bool radeon_emitted(struct radeon_cmdbuf *cs, unsigned num_dw)
diff --git a/src/gallium/drivers/radeonsi/si_get.c 
b/src/gallium/drivers/radeonsi/si_get.c
index f8ca02d4fcf..416eab1a3eb 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -48,13 +48,6 @@ static const char *si_get_device_vendor(struct pipe_screen 
*pscreen)
return "AMD";
 }
 
-static const char *si_get_marketing_name(struct radeon_winsys *ws)
-{
-   if (!ws->get_chip_name)
-   return NULL;
-   return ws->get_chip_name(ws);
-}
-
 static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
struct si_screen *sscreen = (struct si_screen *)pscreen;
@@ -941,14 +934,12 @@ static struct disk_cache *si_get_disk_shader_cache(struct 
pipe_screen *pscreen)
 
 static void si_init_renderer_string(struct si_screen *sscreen)
 {
-   struct radeon_winsys *ws = sscreen->ws;
char first_name[256], second_name[32] = {}, kernel_version[128] = {};
struct utsname uname_data;
 
-   const char *marketing_name = si_get_marketing_name(ws);
-
-   if (marketing_name) {
-   snprintf(first_name, sizeof(first_name), "%s", marketing_name);
+   if (sscreen->info.marketing_name) {
+   snprintf(first_name, sizeof(first_name), "%s",
+sscreen->info.marketing_name);
snprintf(second_name, sizeof(second_name), "%s, ",
 sscreen->info.name);
} else {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 79d2c1345ef..88f9d3180b1 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -269,12 +269,6 @@ static bool amdgpu_winsys_unref(struct

[Mesa-dev] [PATCH 00/26] RadeonSI: Primitive culling with async compute

Hi,

This patch series uses async compute to do primitive culling before
the vertex shader. It significantly improves performance for applications
that use a lot of geometry that is invisible because primitives don't
intersect sample points or there are a lot of back faces, etc.

It passes 99.9999% of all tests (GL CTS, dEQP, piglit) and is 100% stable.
It supports all chips all the way from Sea Islands to Radeon VII.

As you can see in the results marked (ENABLED) in the picture below,
it destroys our competition (The GeForce results are from a Phoronix
article from 2017, the latest ones I could find):

Benchmark: ParaView - Many Spheres - 2560x1440
https://people.freedesktop.org/~mareko/prim-discard-cs-results.png


The last patch describes the implementation and functional limitations
if you can find the huge code comment, so I'm not gonna do that here.

I decided to enable this optimization on all Pro graphics cards.
The reason is that I haven't had time to benchmark games.
This decision may be changed based on community feedback, etc.

People using the Pro graphics cards can disable this by setting
AMD_DEBUG=nopd, and people using consumer graphics cards can enable
this by setting AMD_DEBUG=pd. So you always have a choice.

Eventually we might also enable this on consumer graphics cards for those
games that benefit. It might decrease performance if there is not enough
invisible geometry.

Branch:
https://cgit.freedesktop.org/~mareko/mesa/log/?h=prim-discard-cs

Please review.

Thanks,
Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 03/26] ac: add radeon_info::is_pro_graphics

From: Marek Olšák 

---
 src/amd/common/ac_gpu_info.c | 3 +++
 src/amd/common/ac_gpu_info.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 4e3190015e7..6971e4f0a8e 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -334,6 +334,9 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
}
 
info->marketing_name = amdgpu_get_marketing_name(dev);
+   info->is_pro_graphics = info->marketing_name &&
+   (!strcmp(info->marketing_name, "Pro") ||
+!strcmp(info->marketing_name, "PRO"));
 
/* Set which chips have dedicated VRAM. */
info->has_dedicated_vram =
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 8971d51dfd7..2c2389eaaa7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -49,6 +49,7 @@ struct radeon_info {
/* Device info. */
const char  *name;
const char  *marketing_name;
+   boolis_pro_graphics;
uint32_tpci_id;
enum radeon_family  family;
enum chip_class chip_class;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 2/3] st/mesa: enable GL_EXT_float_blend when possible

On Tue, Feb 12, 2019 at 11:13 PM Jason Ekstrand  wrote:
>
> On February 12, 2019 21:40:49 Ilia Mirkin  wrote:
>
> > If the driver supports PIPE_BIND_BLENABLE on RGBA32F, flip
> > EXT_float_blend on (which will affect ES3 contexts).
> >
> > Signed-off-by: Ilia Mirkin 
> > ---
> > src/mesa/state_tracker/st_extensions.c | 10 ++
> > 1 file changed, 10 insertions(+)
> >
> > diff --git a/src/mesa/state_tracker/st_extensions.c
> > b/src/mesa/state_tracker/st_extensions.c
> > index d2660099fc1..528e6b74a54 100644
> > --- a/src/mesa/state_tracker/st_extensions.c
> > +++ b/src/mesa/state_tracker/st_extensions.c
> > @@ -820,6 +820,12 @@ void st_init_extensions(struct pipe_screen *screen,
> >   PIPE_FORMAT_R16G16B16A16_SNORM } },
> >};
> >
> > +   /* Required: render target, sampler, and blending */
> > +   static const struct st_extension_format_mapping rt_blendable[] = {
> > +  { { o(EXT_float_blend) },
> > +{ PIPE_FORMAT_R32G32B32A32_FLOAT } },
>
> Any particular reason you're only checking the one format? Seems like it
> should check R and RG too.
>
> With that resolved, the series is
>
> Reviewed-by: Jason Ekstrand 
>
> But take that with a huge grain of salt. I haven't implemented an OpenGL
> extension in 4 years so it'd be good to get another sanity check.


Well, it's a bit tricky. I guess you could have a weird driver, but
it's hard to check for every possibility.

I don't want to just start checking for e.g. R32_FLOAT, since a driver
might not support that at all, in which case we'd fall back to
RGBA32_FLOAT. The assumption is that if you support blending for
RGBA32_FLOAT, then you either support both blending + rendering or
neither on the other 32_FLOAT variants.

These checks aren't designed to handle EVERY possible driver-reported
permutation -- I think this is a reasonable compromise.

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 2/3] st/mesa: enable GL_EXT_float_blend when possible


On February 12, 2019 21:40:49 Ilia Mirkin  wrote:


If the driver supports PIPE_BIND_BLENDABLE on RGBA32F, flip
EXT_float_blend on (which will affect ES3 contexts).

Signed-off-by: Ilia Mirkin 
---
src/mesa/state_tracker/st_extensions.c | 10 ++
1 file changed, 10 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c

index d2660099fc1..528e6b74a54 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -820,6 +820,12 @@ void st_init_extensions(struct pipe_screen *screen,
  PIPE_FORMAT_R16G16B16A16_SNORM } },
   };

+   /* Required: render target, sampler, and blending */
+   static const struct st_extension_format_mapping rt_blendable[] = {
+  { { o(EXT_float_blend) },
+{ PIPE_FORMAT_R32G32B32A32_FLOAT } },


Any particular reason you're only checking the one format? Seems like it 
should check R and RG too.


With that resolved, the series is

Reviewed-by: Jason Ekstrand 

But take that with a huge grain of salt. I haven't implemented an OpenGL 
extension in 4 years so it'd be good to get another sanity check. 


--Jason


+   };
+
   /* Required: depth stencil and sampler support */
   static const struct st_extension_format_mapping depthstencil_mapping[] = {
  { { o(ARB_depth_buffer_float) },
@@ -1025,6 +1031,10 @@ void st_init_extensions(struct pipe_screen *screen,
   init_format_extensions(screen, extensions, rendertarget_mapping,
  ARRAY_SIZE(rendertarget_mapping), PIPE_TEXTURE_2D,
  PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW);
+   init_format_extensions(screen, extensions, rt_blendable,
+  ARRAY_SIZE(rt_blendable), PIPE_TEXTURE_2D,
+  PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW |
+  PIPE_BIND_BLENDABLE);
   init_format_extensions(screen, extensions, depthstencil_mapping,
  ARRAY_SIZE(depthstencil_mapping), PIPE_TEXTURE_2D,
  PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_SAMPLER_VIEW);
--
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev




___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 3/3] i965: always enable EXT_float_blend

From the table in isl_format.c, it appears that all generations
support blending on 32-bit float surfaces.

Signed-off-by: Ilia Mirkin 
---
 src/mesa/drivers/dri/i965/intel_extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c 
b/src/mesa/drivers/dri/i965/intel_extensions.c
index 3a95be58a63..865b4c29da1 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -97,6 +97,7 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.EXT_blend_func_separate = true;
ctx->Extensions.EXT_blend_minmax = true;
ctx->Extensions.EXT_draw_buffers2 = true;
+   ctx->Extensions.EXT_float_blend = true;
ctx->Extensions.EXT_framebuffer_sRGB = true;
ctx->Extensions.EXT_gpu_program_parameters = true;
ctx->Extensions.EXT_packed_float = true;
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 1/3] mesa: add explicit enable for EXT_float_blend, and error condition

If EXT_float_blend is not supported, error out on blending of FP32
attachments in an ES2 context.

Signed-off-by: Ilia Mirkin 
---
 src/mesa/main/draw_validate.c| 19 +++
 src/mesa/main/extensions_table.h |  2 +-
 src/mesa/main/fbobject.c |  4 
 src/mesa/main/mtypes.h   |  2 ++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/draw_validate.c b/src/mesa/main/draw_validate.c
index b715a27f8b7..779cd1c12c7 100644
--- a/src/mesa/main/draw_validate.c
+++ b/src/mesa/main/draw_validate.c
@@ -304,6 +304,25 @@ check_valid_to_render(struct gl_context *ctx, const char 
*function)
  "%s(tess ctrl shader is missing)", function);
  return false;
   }
+
+  /* From GL_EXT_color_buffer_float:
+   *
+   * "Blending applies only if the color buffer has a fixed-point or
+   * floating-point format. If the color buffer has an integer
+   * format, proceed to the next operation.  Furthermore, an
+   * INVALID_OPERATION error is generated by DrawArrays and the other
+   * drawing commands defined in section 2.8.3 (10.5 in ES 3.1) if
+   * blending is enabled (see below) and any draw buffer has 32-bit
+   * floating-point format components."
+   *
+   * However GL_EXT_float_blend removes this text.
+   */
+  if (!ctx->Extensions.EXT_float_blend &&
+  (ctx->DrawBuffer->_FP32Buffers & ctx->Color.BlendEnabled)) {
+ _mesa_error(ctx, GL_INVALID_OPERATION,
+ "%s(32-bit float output + blending)", function);
+ return false;
+  }
   break;
 
case API_OPENGL_CORE:
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 0d6bb452ffa..b0492fed698 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -226,7 +226,7 @@ EXT(EXT_draw_buffers_indexed, 
ARB_draw_buffers_blend
 EXT(EXT_draw_elements_base_vertex   , ARB_draw_elements_base_vertex
  ,  x ,  x ,  x , ES2, 2014)
 EXT(EXT_draw_instanced  , ARB_draw_instanced   
  , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_draw_range_elements , dummy_true   
  , GLL,  x ,  x ,  x , 1997)
-EXT(EXT_float_blend , dummy_true   
  ,  x ,  x ,  x ,  30, 2015)
+EXT(EXT_float_blend , EXT_float_blend  
  ,  x ,  x ,  x ,  30, 2015)
 EXT(EXT_fog_coord   , dummy_true   
  , GLL,  x ,  x ,  x , 1999)
 EXT(EXT_frag_depth  , dummy_true   
  ,  x ,  x ,  x , ES2, 2010)
 EXT(EXT_framebuffer_blit, dummy_true   
  , GLL, GLC,  x ,  x , 2005)
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 87c33be7854..21e3496593c 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1004,6 +1004,7 @@ _mesa_test_framebuffer_completeness(struct gl_context 
*ctx,
fb->_HasAttachments = true;
fb->_IntegerBuffers = 0;
fb->_RGBBuffers = 0;
+   fb->_FP32Buffers = 0;
 
/* Start at -2 to more easily loop over all attachment points.
 *  -2: depth buffer
@@ -1153,6 +1154,9 @@ _mesa_test_framebuffer_completeness(struct gl_context 
*ctx,
  if (f == GL_RGB)
 fb->_RGBBuffers |= (1 << i);
 
+ if (type == GL_FLOAT && _mesa_get_format_max_bits(attFormat) > 16)
+fb->_FP32Buffers |= (1 << i);
+
  fb->_AllColorBuffersFixedPoint =
 fb->_AllColorBuffersFixedPoint &&
 (type == GL_UNSIGNED_NORMALIZED || type == GL_SIGNED_NORMALIZED);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index dda96cd2f19..ca00de7dc63 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3506,6 +3506,7 @@ struct gl_framebuffer
 
GLbitfield _IntegerBuffers;  /**< Which color buffers are integer valued */
GLbitfield _RGBBuffers;  /**< Which color buffers have baseformat == RGB */
+   GLbitfield _FP32Buffers; /**< Which color buffers are FP32 */
 
/* ARB_color_buffer_float */
GLboolean _AllColorBuffersFixedPoint; /* no integer, no float */
@@ -4248,6 +4249,7 @@ struct gl_extensions
GLboolean EXT_depth_bounds_test;
GLboolean EXT_disjoint_timer_query;
GLboolean EXT_draw_buffers2;
+   GLboolean EXT_float_blend;
GLboolean EXT_framebuffer_multisample;
GLboolean EXT_framebuffer_multisample_blit_scaled;
GLboolean EXT_framebuffer_sRGB;
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/3] st/mesa: enable GL_EXT_float_blend when possible

If the driver supports PIPE_BIND_BLENDABLE on RGBA32F, flip
EXT_float_blend on (which will affect ES3 contexts).

Signed-off-by: Ilia Mirkin 
---
 src/mesa/state_tracker/st_extensions.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c
index d2660099fc1..528e6b74a54 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -820,6 +820,12 @@ void st_init_extensions(struct pipe_screen *screen,
   PIPE_FORMAT_R16G16B16A16_SNORM } },
};
 
+   /* Required: render target, sampler, and blending */
+   static const struct st_extension_format_mapping rt_blendable[] = {
+  { { o(EXT_float_blend) },
+{ PIPE_FORMAT_R32G32B32A32_FLOAT } },
+   };
+
/* Required: depth stencil and sampler support */
static const struct st_extension_format_mapping depthstencil_mapping[] = {
   { { o(ARB_depth_buffer_float) },
@@ -1025,6 +1031,10 @@ void st_init_extensions(struct pipe_screen *screen,
init_format_extensions(screen, extensions, rendertarget_mapping,
   ARRAY_SIZE(rendertarget_mapping), PIPE_TEXTURE_2D,
   PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW);
+   init_format_extensions(screen, extensions, rt_blendable,
+  ARRAY_SIZE(rt_blendable), PIPE_TEXTURE_2D,
+  PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW |
+  PIPE_BIND_BLENDABLE);
init_format_extensions(screen, extensions, depthstencil_mapping,
   ARRAY_SIZE(depthstencil_mapping), PIPE_TEXTURE_2D,
   PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_SAMPLER_VIEW);
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109107] gallium/st/va: change va max_profiles when using Radeon VCN Hardware

https://bugs.freedesktop.org/show_bug.cgi?id=109107

zhoulei  changed:

   What|Removed |Added

 Resolution|--- |FIXED
 Status|NEW |RESOLVED

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: fix example in opt_peel_loop_initial_if description


Rb

On February 12, 2019 12:48:58 Caio Marcelo de Oliveira Filho 
 wrote:



---
src/compiler/nir/nir_opt_if.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index 9afb901be14..170caaad89d 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -96,7 +96,7 @@ 
phi_has_constant_from_outside_and_one_from_inside_loop(nir_phi_instr *phi,

 *block block_1:
 *vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5
 *vec1 32 ssa_3 = phi block_0: ssa_0, block_7: ssa_1
- *if ssa_2 {
+ *if ssa_3 {
 *   block block_2:
 *   vec1 32 ssa_4 = load_const (0x0001)
 *   vec1 32 ssa_5 = iadd ssa_2, ssa_4
@@ -121,9 +121,9 @@ 
phi_has_constant_from_outside_and_one_from_inside_loop(nir_phi_instr *phi,

 * // Stuff from block 3
 * loop {
 *block block_1:
- *vec1 32 ssa_3 = phi block_0: ssa_0, block_7: ssa_1
+ *vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5
 *vec1 32 ssa_6 = load_const (0x0004)
- *vec1 32 ssa_7 = ilt ssa_5, ssa_6
+ *vec1 32 ssa_7 = ilt ssa_2, ssa_6
 *if ssa_7 {
 *   block block_5:
 *} else {
--
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev




___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: move ALU instruction before the jump instruction


On February 12, 2019 18:22:36 Ian Romanick  wrote:


On 2/12/19 12:58 AM, Juan A. Suarez Romero wrote:

opt_split_alu_of_phi moves ALU instruction to the end of continue block.

But if the continue block ends with a jump instruction (an explicit
"continue" instruction) then the ALU must be inserted before the jump,
as it is illegal to add instructions after the jump.


I'm assuming you found this by inspection?  Since this pass only
operates when the first block of the loop only has two predecessors (the
block before the loop and the implicit continue at the end of the loop),
this shouldn't be a problem in practice... or were you able to trigger
it somehow?


What if you have

loop {
  if () {
  /* Stuff*/
  continue;
  }
  break;
}

Or, for that matter, if the break and continue are flipped and this runs 
before the pass that gets rid of trivial continues.


Sorry, on my phone and not looking at the code so I could be way off base.




CC: Ian Romanick 
Fixes: 0881e90c099 ("nir: Split ALU instructions in loops that read phis")
---
src/compiler/nir/nir_opt_if.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index 9afb901be14..932af9e37ab 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -488,7 +488,7 @@ opt_split_alu_of_phi(nir_builder *b, nir_loop *loop)
  *
  * Insert the new instruction at the end of the continue block.
  */
- b->cursor = nir_after_block(continue_block);
+ b->cursor = nir_after_block_before_jump(continue_block);

 nir_ssa_def *const alu_copy =
clone_alu_and_replace_src_defs(b, alu, continue_srcs);

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev




___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: move ALU instruction before the jump instruction

2019-02-12 Thread Ian Romanick

On 2/12/19 12:58 AM, Juan A. Suarez Romero wrote:
> opt_split_alu_of_phi moves ALU instruction to the end of continue block.
> 
> But if the continue block ends with a jump instruction (an explicit
> "continue" instruction) then the ALU must be inserted before the jump,
> as it is illegal to add instructions after the jump.

I'm assuming you found this by inspection?  Since this pass only
operates when the first block of the loop only has two predecessors (the
block before the loop and the implicit continue at the end of the loop),
this shouldn't be a problem in practice... or were you able to trigger
it somehow?

> CC: Ian Romanick 
> Fixes: 0881e90c099 ("nir: Split ALU instructions in loops that read phis")
> ---
>  src/compiler/nir/nir_opt_if.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
> index 9afb901be14..932af9e37ab 100644
> --- a/src/compiler/nir/nir_opt_if.c
> +++ b/src/compiler/nir/nir_opt_if.c
> @@ -488,7 +488,7 @@ opt_split_alu_of_phi(nir_builder *b, nir_loop *loop)
>*
>* Insert the new instruction at the end of the continue block.
>*/
> - b->cursor = nir_after_block(continue_block);
> + b->cursor = nir_after_block_before_jump(continue_block);
>  
>   nir_ssa_def *const alu_copy =
>  clone_alu_and_replace_src_defs(b, alu, continue_srcs);
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109617] [oland, clover, llvm5] While-If Problem with Booleans

https://bugs.freedesktop.org/show_bug.cgi?id=109617

--- Comment #3 from Jan Vesely  ---
This looks like clang/llvm kernel miscompile.
Can you reproduce using more recent llvm version (ideally git)?

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109617] [oland, clover, llvm5] While-If Problem with Booleans

https://bugs.freedesktop.org/show_bug.cgi?id=109617

Jan Vesely  changed:

   What|Removed |Added

 Blocks||99553
Summary|Gallium OpenCL: While-If|[oland, clover, llvm5]
   |Problem with Booleans   |While-If Problem with
   ||Booleans

--- Comment #1 from Jan Vesely  ---
please don't use text upload sites. either post in a comment or add as an
attachment.

kernel code:

#ifndef TEST_KERNEL_CL
#define TEST_KERNEL_CL

__kernel void
test_kernel_1(__global float4 *result)
{
bool flag = false;
//uint flag = false;
uint gid = get_global_id(0);
float rnd;
uint i = 0;

#define VARIANT 1
#if (VARIANT == 1)
while ((i < 2) && (flag == false)){
rnd = sin((float)(gid + i)) + 0.1;

if ((rnd * rnd)<=0.5){flag=true;}
i++;
}
#elif (VARIANT == 2)
for (i = 0; i < 2; i++){
if (flag == false){
rnd = sin((float)(gid + i)) + 0.1;

if ((rnd * rnd)<=0.5){flag=true; i++; break;}
}
}
#endif

if(flag){
result[gid].x = rnd;
}

result[gid].y = (float)i;
}

#endif


Referenced Bugs:

https://bugs.freedesktop.org/show_bug.cgi?id=99553
[Bug 99553] Tracker bug for runnning OpenCL applications on Clover
-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 99553] Tracker bug for runnning OpenCL applications on Clover

https://bugs.freedesktop.org/show_bug.cgi?id=99553

Jan Vesely  changed:

   What|Removed |Added

 Depends on||109617


Referenced Bugs:

https://bugs.freedesktop.org/show_bug.cgi?id=109617
[Bug 109617] [oland, clover, llvm5] While-If Problem with Booleans
-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109617] [oland, clover, llvm5] While-If Problem with Booleans

https://bugs.freedesktop.org/show_bug.cgi?id=109617

--- Comment #2 from Jan Vesely  ---
results:

xxx
Correct executionxx
xxx
---
bool flag = false;
#define VARIANT 2
---
:
Platform: Clover
Device: AMD OLAND (DRM 2.50.0 / 4.12.14-lp150.12.45-default, LLVM 5.0.1) (GPU)
:
Number of test_kernels: 1
--
[ 0.10,  1.00,  0.00,  0.00]
[ 0.00,  2.00,  0.00,  0.00]
[ 0.241120,  2.00,  0.00,  0.00]

---
uint flag = false;
#define VARIANT 1
---
:
Platform: Clover
Device: AMD OLAND (DRM 2.50.0 / 4.12.14-lp150.12.45-default, LLVM 5.0.1) (GPU)
:
Number of test_kernels: 1
--
[ 0.10,  1.00,  0.00,  0.00]
[ 0.00,  2.00,  0.00,  0.00]
[ 0.241120,  2.00,  0.00,  0.00]

xxx
Incorrect execution
xxx
---
bool flag = false;
#define VARIANT 1
---
:
Platform: Clover
Device: AMD OLAND (DRM 2.50.0 / 4.12.14-lp150.12.45-default, LLVM 5.0.1) (GPU)
:
Number of test_kernels: 1
--
[ 0.00,  1.00,  0.00,  0.00]
[ 0.00,  2.00,  0.00,  0.00]
[ 0.241120,  2.00,  0.00,  0.00]

xxx

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109617] Gallium OpenCL: While-If Problem with Booleans

https://bugs.freedesktop.org/show_bug.cgi?id=109617

Bug ID: 109617
   Summary: Gallium OpenCL: While-If Problem with Booleans
   Product: Mesa
   Version: unspecified
  Hardware: x86-64 (AMD64)
OS: Linux (All)
Status: NEW
  Severity: normal
  Priority: medium
 Component: Other
  Assignee: mesa-dev@lists.freedesktop.org
  Reporter: rk...@mail.ru
QA Contact: mesa-dev@lists.freedesktop.org

OpenCL kernel containing While loop and If statement does not work correctly
with boolean variable, while everything is good with integer variable and with
For loop.

The kernel is the following: https://textuploader.com/15szh.

The results are the following: https://textuploader.com/15szu.

This is tested at openSUSE 15.0 notebook with AMD Radeon HD 8750M GPU.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [ANNOUNCE] mesa 19.0.0-rc3

2019-02-12 Thread Dylan Baker

Hi List,

Mesa 19.0-rc3 is now available.

Due to a bug I discovered in the script that scrapes for stable nominations
(after uploading the tarball) there is basically nothing in the -rc3 release. As
a result I'm planning to make a -rc4 tomorrow. You can see the staging/19.0
branch to see the additional patches present.

Dylan

git tag: mesa-19.0.0-rc3

https://mesa.freedesktop.org/archive/mesa-19.0.0-rc3.tar.gz
MD5:  d70c6895d2bb7a2d9a55335b9c9d47f0  mesa-19.0.0-rc3.tar.gz
SHA1: 714077b0cbe4074525722ce3ad73b63595af1b44  mesa-19.0.0-rc3.tar.gz
SHA256: 0a7fc12a8648349bb8c502ce3d4d27c4193dfd1c0d71fe1691c315756143317f  
mesa-19.0.0-rc3.tar.gz
SHA512: 
bd7352fdf10e3d3f367d41d583181c752c5254c102b06f1ba04ab0fb7704a0dd42e6c68ec2f842ce2855b634aa7d80d981287defb8a631464578fb70b0cc5ba4
  mesa-19.0.0-rc3.tar.gz
PGP:  https://mesa.freedesktop.org/archive/mesa-19.0.0-rc3.tar.gz.sig

https://mesa.freedesktop.org/archive/mesa-19.0.0-rc3.tar.xz
MD5:  25d93df06f8bf7532ab2f7dd2aaed43b  mesa-19.0.0-rc3.tar.xz
SHA1: 211a7cdae7cdb920873ee3ae35a71f2ab4051bee  mesa-19.0.0-rc3.tar.xz
SHA256: ef32a89df305863acbf8d8910ce228c5f11d14216cb5602f4ef0f33b1be5f1a6  
mesa-19.0.0-rc3.tar.xz
SHA512: 
adcb4cdcda0ef5c43e2257368a674328cb7533c484e631db6ff80d4a6d15ffba4f3f18a063cb63102a7ae89a5954f5e2f45ac8fe295064513f2656198a12307f
  mesa-19.0.0-rc3.tar.xz
PGP:  https://mesa.freedesktop.org/archive/mesa-19.0.0-rc3.tar.xz.sig




signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radeonsi: Fix guardband computation for large render targets

Pushed, thanks!

Marek

On Tue, Feb 12, 2019 at 2:55 PM Oscar Blumberg  wrote:

> Stop using 12.12 quantization for viewports that are not contained in
> the lower 4k corner of the render target as the hardware needs to keep
> both absolute and relative coordinates representable.
> ---
>  .../drivers/radeonsi/si_state_viewport.c  | 30 +--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c
> b/src/gallium/drivers/radeonsi/si_state_viewport.c
> index dac90df1c4f..64bb956b200 100644
> --- a/src/gallium/drivers/radeonsi/si_state_viewport.c
> +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
> @@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx)
> const unsigned hw_screen_offset_alignment =
> ctx->chip_class >= VI ? 16 :
> MAX2(ctx->screen->se_tile_repeat, 16);
>
> +   /* Indexed by quantization modes */
> +   static unsigned max_viewport_size[] = {65535, 16383, 4095};
> +
> +   /* Ensure that the whole viewport stays representable in
> +* absolute coordinates.
> +* See comment in si_set_viewport_states.
> +*/
> +   assert(vp_as_scissor.maxx <=
> max_viewport_size[vp_as_scissor.quant_mode] &&
> +  vp_as_scissor.maxy <=
> max_viewport_size[vp_as_scissor.quant_mode]);
> +
> hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0,
> MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
> hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0,
> MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
>
> @@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx)
>  *
>  * The viewport range is [-max_viewport_size/2,
> max_viewport_size/2].
>  */
> -   static unsigned max_viewport_size[] = {65535, 16383, 4095};
> assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
> max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
> left   = (-max_range - vp.translate[0]) / vp.scale[0];
> @@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context
> *pctx,
> unsigned h = scissor->maxy - scissor->miny;
> unsigned max_extent = MAX2(w, h);
>
> +   int max_corner = MAX2(scissor->maxx, scissor->maxy);
> +
> unsigned center_x = (scissor->maxx + scissor->minx) / 2;
> unsigned center_y = (scissor->maxy + scissor->miny) / 2;
> unsigned max_center = MAX2(center_x, center_y);
> @@ -358,7 +369,22 @@ static void si_set_viewport_states(struct
> pipe_context *pctx,
> if (ctx->family == CHIP_RAVEN)
> max_extent = 16384; /* Use QUANT_MODE == 16_8. */
>
> -   if (max_extent <= 1024) /* 4K scanline area for guardband
> */
> +   /* Another constraint is that all coordinates in the
> viewport
> +* are representable in fixed point with respect to the
> +* surface origin.
> +*
> +* It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be
> given
> +* an offset that would make the upper corner of the
> viewport
> +* greater than the maximum representable number post
> +* quantization, ie 2^quant_bits.
> +*
> +* This does not matter for 14.10 and 16.8 formats since
> the
> +* offset is already limited at 8k, but it means we can't
> use
> +* 12.12 if we are drawing to some pixels outside the lower
> +* 4k x 4k of the render target.
> +*/
> +
> +   if (max_extent <= 1024 && max_corner < 4096) /* 4K
> scanline area for guardband */
> scissor->quant_mode =
> SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
> else if (max_extent <= 4096) /* 16K scanline area for
> guardband */
> scissor->quant_mode =
> SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
> --
> 2.20.1
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 6/6] glsl/linker: check for xfb_offset aliasing

2019-02-12 Thread Andres Gomez

From page 76 (page 80 of the PDF) of the GLSL 4.60 v.5 spec:

  " No aliasing in output buffers is allowed: It is a compile-time or
link-time error to specify variables with overlapping transform
feedback offsets."

Currently, this is expected to fail, but it succeeds:

  "

...

layout (xfb_offset = 0) out vec2 a;
layout (xfb_offset = 0) out vec4 b;

...

  "

v2: use a data structure to track the used components instead of a
nested loop (Ilia).

Cc: Timothy Arceri 
Cc: Ilia Mirkin 
Signed-off-by: Andres Gomez 
---
 src/compiler/glsl/link_varyings.cpp | 89 ++---
 src/mesa/main/mtypes.h  |  3 +
 2 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp 
b/src/compiler/glsl/link_varyings.cpp
index 8c7d6c14c8f..95e9ae895d2 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -1173,6 +1173,73 @@ tfeedback_decl::store(struct gl_context *ctx, struct 
gl_shader_program *prog,
   unsigned location = this->location;
   unsigned location_frac = this->location_frac;
   unsigned num_components = this->num_components();
+
+  /* From GL_EXT_transform_feedback:
+   *   A program will fail to link if:
+   *
+   * * the total number of components to capture is greater than the
+   *   constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT and
+   *   the buffer mode is INTERLEAVED_ATTRIBS_EXT.
+   *
+   * From GL_ARB_enhanced_layouts:
+   *
+   *   "The resulting stride (implicit or explicit) must be less than or
+   *equal to the implementation-dependent constant
+   *gl_MaxTransformFeedbackInterleavedComponents."
+   */
+  if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
+   has_xfb_qualifiers) &&
+  xfb_offset + num_components >
+  ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+ linker_error(prog,
+  "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+  "limit has been exceeded.");
+ return false;
+  }
+
+  {
+ /* From the OpenGL 4.60.5 spec, section 4.4.2. Output Layout
+  * Qualifiers, Page 76, (Transform Feedback Layout Qualifiers):
+  *
+  * "No aliasing in output buffers is allowed: It is a compile-time or
+  *  link-time error to specify variables with overlapping transform
+  *  feedback offsets."
+  */
+ const unsigned max_components =
+ctx->Const.MaxTransformFeedbackInterleavedComponents;
+ const unsigned first_component = xfb_offset;
+ const unsigned last_component = xfb_offset + num_components - 1;
+ const unsigned start_word = BITSET_BITWORD(first_component);
+ const unsigned end_word = BITSET_BITWORD(last_component);
+ assert(last_component < max_components);
+
+ if (!info->Buffers[buffer].UsedComponents) {
+info->Buffers[buffer].UsedComponents =
+   rzalloc_array(info, BITSET_WORD, BITSET_WORDS(max_components));
+ }
+ BITSET_WORD *used = info->Buffers[buffer].UsedComponents;
+
+ for (unsigned word = start_word; word <= end_word; word++) {
+unsigned start_range = 0;
+unsigned end_range = BITSET_WORDBITS - 1;
+
+if (word == start_word)
+   start_range = first_component % BITSET_WORDBITS;
+
+if (word == end_word)
+   end_range = last_component % BITSET_WORDBITS;
+
+if (used[word] & BITSET_RANGE(start_range, end_range)) {
+   linker_error(prog,
+"variable '%s', xfb_offset (%d) "
+"is causing aliasing.",
+this->orig_name, xfb_offset * 4);
+   return false;
+}
+used[word] |= BITSET_RANGE(start_range, end_range);
+ }
+  }
+
   while (num_components > 0) {
  unsigned output_size = MIN2(num_components, 4 - location_frac);
  assert((info->NumOutputs == 0 && max_outputs == 0) ||
@@ -1223,28 +1290,6 @@ tfeedback_decl::store(struct gl_context *ctx, struct 
gl_shader_program *prog,
   info->Buffers[buffer].Stride = xfb_offset;
}
 
-   /* From GL_EXT_transform_feedback:
-*   A program will fail to link if:
-*
-* * the total number of components to capture is greater than
-*   the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
-*   and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
-*
-* From GL_ARB_enhanced_layouts:
-*
-*   "The resulting stride (implicit or explicit) must be less than or
-*   equal to the implementation-dependent constant
-*   gl_MaxTransformFeedbackInterleavedComponents."
-*/
-   if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
-

[Mesa-dev] radeonsi: NIR - Polaris triangle sprinkling running UH SOLVED - finally

2019-02-12 Thread Dieter Nützel


Hello Marek, Timo, Nicolai,

Timo SOLVED this long-standing NIR corruption on Polaris with his 'nir: 
rewrite varying component packing' commit.


It was triggered with

commit 86b52d42368ac496fe24bc6674e754c323381635
Author: Marek Olšák 
Date:   Fri Jul 13 00:23:36 2018 -0400

radeonsi: reduce LDS stalls by 40% for tessellation

40% is the decrease in the LGKM counter (which includes SMEM too)
for the GFX9 LSHS stage.

This will make the LDS size slightly larger, but I wasn't able to 
increase
the patch stride without corruption, so I'm increasing the vertex 
stride.


and now finally SOLVED with

commit 26aa460940f6222565ad5eb40a21c2377c59c3a6
Author: Timothy Arceri 
Date:   Mon Dec 10 10:23:51 2018 +1100

nir: rewrite varying component packing

There are a number of reasons for the rewrite.

1. Adding support for packing tess patch varyings in a sane way.

2. Making use of qsort allowing the code to be much easier to
   follow.

3. Fixes a bug where different interp types caused component
   packing to be skipped for all varyings in some scenarios.

4. Allows us to add a crude live range analysis for deciding
   which components should be packed together. This support can
   optionally be added in a future patch.

Reviewed-by: Jason Ekstrand 

Maybe it should be backported (Cc: mesa-stable) for 19.0?


I hope my bisect helps to bring some more understanding of this Polaris 
NIR bug.


Now, hunting for the (last) 19.0+ EQAA regression (DiRT Rally, black 
squares like  radv/DXVK corruption, NOT NIR related) and 'meson' OpenCL 
(Clover) build error.


Greetings,
Dieter
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109107] gallium/st/va: change va max_profiles when using Radeon VCN Hardware

https://bugs.freedesktop.org/show_bug.cgi?id=109107

--- Comment #6 from Michael Eagle  ---
I can confirm that now chrome is no longer printing

ERROR:vaapi_wrapper.cc(568)] : vaQueryConfigProfiles returned: 14

in terminal, and chrome://media-internals/ reports:
video_decoder GpuVideoDecoder

So, GPU acceleration is working.

And, indeed, without allow_rgb10_configs=false , the video is messed up.

Thank you very much for fixing this!

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radeonsi: Fix guardband computation for large render targets

2019-02-12 Thread Oscar Blumberg

Stop using 12.12 quantization for viewports that are not contained in
the lower 4k corner of the render target as the hardware needs to keep
both absolute and relative coordinates representable.
---
 .../drivers/radeonsi/si_state_viewport.c  | 30 +--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c 
b/src/gallium/drivers/radeonsi/si_state_viewport.c
index dac90df1c4f..64bb956b200 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx)
const unsigned hw_screen_offset_alignment =
ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 
16);
 
+   /* Indexed by quantization modes */
+   static unsigned max_viewport_size[] = {65535, 16383, 4095};
+
+   /* Ensure that the whole viewport stays representable in
+* absolute coordinates.
+* See comment in si_set_viewport_states.
+*/
+   assert(vp_as_scissor.maxx <= 
max_viewport_size[vp_as_scissor.quant_mode] &&
+  vp_as_scissor.maxy <= 
max_viewport_size[vp_as_scissor.quant_mode]);
+
hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, 
MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, 
MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
 
@@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx)
 *
 * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
 */
-   static unsigned max_viewport_size[] = {65535, 16383, 4095};
assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
left   = (-max_range - vp.translate[0]) / vp.scale[0];
@@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context 
*pctx,
unsigned h = scissor->maxy - scissor->miny;
unsigned max_extent = MAX2(w, h);
 
+   int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
unsigned center_x = (scissor->maxx + scissor->minx) / 2;
unsigned center_y = (scissor->maxy + scissor->miny) / 2;
unsigned max_center = MAX2(center_x, center_y);
@@ -358,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context 
*pctx,
if (ctx->family == CHIP_RAVEN)
max_extent = 16384; /* Use QUANT_MODE == 16_8. */
 
-   if (max_extent <= 1024) /* 4K scanline area for guardband */
+   /* Another constraint is that all coordinates in the viewport
+* are representable in fixed point with respect to the
+* surface origin.
+*
+* It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+* an offset that would make the upper corner of the viewport
+* greater than the maximum representable number post
+* quantization, ie 2^quant_bits.
+*
+* This does not matter for 14.10 and 16.8 formats since the
+* offset is already limited at 8k, but it means we can't use
+* 12.12 if we are drawing to some pixels outside the lower
+* 4k x 4k of the render target.
+*/
+
+   if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline 
area for guardband */
scissor->quant_mode = 
SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
else if (max_extent <= 4096) /* 16K scanline area for guardband 
*/
scissor->quant_mode = 
SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v6 5/5] gallium/auxiliary/vl: Add video compositor compute shader render

2019-02-12 Thread Zhu, James

Add compute shader initialization, assignment and cleanup in vl_compositor API.
Set video compositor compute shader render as default when pipe supports it.

Signed-off-by: James Zhu 
Reviewed-by: Christian König 
---
 src/gallium/auxiliary/vl/vl_compositor.c | 106 +++
 src/gallium/auxiliary/vl/vl_compositor.h |   5 ++
 2 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_compositor.c 
b/src/gallium/auxiliary/vl/vl_compositor.c
index 4509913..8731ad9 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -28,6 +28,7 @@
 #include "util/u_sampler.h"
 
 #include "vl_compositor_gfx.h"
+#include "vl_compositor_cs.h"
 
 static bool
 init_shaders(struct vl_compositor *c)
@@ -40,18 +41,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
-   c->fs_video_buffer = create_frag_shader_video_buffer(c);
-   if (!c->fs_video_buffer) {
-  debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
-  return false;
-   }
-
-   c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
-   if (!c->fs_weave_rgb) {
-  debug_printf("Unable to create YCbCr-to-RGB weave fragment shader.\n");
-  return false;
-   }
-
c->fs_yuv.weave.y = create_frag_shader_deint_yuv(c, true, true);
c->fs_yuv.weave.uv = create_frag_shader_deint_yuv(c, false, true);
c->fs_yuv.bob.y = create_frag_shader_deint_yuv(c, true, false);
@@ -74,12 +63,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
-   c->fs_rgba = create_frag_shader_rgba(c);
-   if (!c->fs_rgba) {
-  debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
-  return false;
-   }
-
c->fs_rgb_yuv.y = create_frag_shader_rgb_yuv(c, true);
c->fs_rgb_yuv.uv = create_frag_shader_rgb_yuv(c, false);
if (!c->fs_rgb_yuv.y || !c->fs_rgb_yuv.uv) {
@@ -87,6 +70,44 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
+   if (c->pipe_compute_supported) {
+  c->cs_video_buffer = vl_compositor_cs_create_shader(c, 
compute_shader_video_buffer);
+  if (!c->cs_video_buffer) {
+ debug_printf("Unable to create video_buffer compute shader.\n");
+ return false;
+  }
+
+  c->cs_weave_rgb = vl_compositor_cs_create_shader(c, 
compute_shader_weave);
+  if (!c->cs_weave_rgb) {
+ debug_printf("Unable to create weave_rgb compute shader.\n");
+ return false;
+  }
+
+  c->cs_rgba = vl_compositor_cs_create_shader(c, compute_shader_rgba);
+  if (!c->cs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB compute shader.\n");
+ return false;
+  }
+   } else {
+  c->fs_video_buffer = create_frag_shader_video_buffer(c);
+  if (!c->fs_video_buffer) {
+ debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
+ return false;
+  }
+
+  c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
+  if (!c->fs_weave_rgb) {
+ debug_printf("Unable to create YCbCr-to-RGB weave fragment 
shader.\n");
+ return false;
+  }
+
+  c->fs_rgba = create_frag_shader_rgba(c);
+  if (!c->fs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
+ return false;
+  }
+   }
+
return true;
 }
 
@@ -95,17 +116,24 @@ static void cleanup_shaders(struct vl_compositor *c)
assert(c);
 
c->pipe->delete_vs_state(c->pipe, c->vs);
-   c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
-   c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.yuv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.rgb);
-   c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.y);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.uv);
+
+   if (c->pipe_compute_supported) {
+  c->pipe->delete_compute_state(c->pipe, c->cs_video_buffer);
+  c->pipe->delete_compute_state(c->pipe, c->cs_weave_rgb);
+  c->pipe->delete_compute_state(c->pipe, c->cs_rgba);
+   } else {
+  c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
+  c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
+  c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
+   }
 }
 
 static bool
@@ -409,6 +437,7 @@ vl_compositor_clear_layers(struct vl_compositor_state *s)
   s->layers[i].clearing = i ? false : true;
   s->layers[i].blend = NULL;
   s->layers[i].fs = NULL;
+  s->layers[i].cs = NULL;
   s->layers[i].viewport.scale[2] = 1;
   s->layers[i].viewport.translate[2] = 0;
   s->layers[i].rotate = VL_COMPOSITOR_ROTATE_0;
@@ -532,26 +561,39 @@ vl_compositor_set_buffer_layer(struct vl_compositor_state 
*s,
   float half_a_line =

[Mesa-dev] [PATCH] nir/xfb: Properly align 64-bit values

Fixes: 19064b8c "nir: Add a pass for gathering transform feedback info"
Cc: Alejandro Piñeiro 
---
 src/compiler/nir/nir_gather_xfb_info.c | 44 ++
 1 file changed, 44 insertions(+)

diff --git a/src/compiler/nir/nir_gather_xfb_info.c 
b/src/compiler/nir/nir_gather_xfb_info.c
index 96f0ece5e75..fb736dfeb17 100644
--- a/src/compiler/nir/nir_gather_xfb_info.c
+++ b/src/compiler/nir/nir_gather_xfb_info.c
@@ -72,6 +72,50 @@ add_var_xfb_outputs(nir_xfb_info *xfb,
   assert(var->data.location_frac + comp_slots <= 8);
   uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
 
+  /* From version 4.60 of the GLSL spec:
+   *
+   *"Variables and block members qualified with xfb_offset can be
+   *scalars, vectors, matrices, structures, and (sized) arrays of
+   *these. The offset must be a multiple of the size of the first
+   *component of the first qualified variable or block member, or a
+   *compile-time error results. Further, if applied to an aggregate
+   *containing a double, the offset must also be a multiple of 8, and
+   *the space taken in the buffer will be a multiple of 8. The given
+   *offset applies to the first component of the first member of the
+   *qualified entity. Then, within the qualified entity, subsequent
+   *components are each assigned, in order, to the next available
+   *offset aligned to a multiple of that component's size. Aggregate
+   *types are flattened down to the component level to get this
+   *sequence of components."
+   *
+   * We need to align each element to the component size in order to get
+   * the correct layout.  We do this at the component level and don't try
+   * to align entire aggregate types such as structs because of the last
+   * sentence which says that aggregate types are treated as flattened to
+   * components.  In other words, if we have
+   *
+   *struct A {
+   *   int a;
+   *   double b;
+   *};
+   *
+   *struct B {
+   *   int b
+   *   A a;
+   *};
+   *
+   *layout (...) out B o;
+   *
+   * then we treat it as if struct A was embedded struct B and o.a.b has
+   * an offset of 8.  If we tried to apply the alignment rule to nested
+   * structs and didn't flatten, o.a would have an offset of 8 because it
+   * contains a double and o.a.b would then have an offset of 16.
+   * However, thanks to the above GLSL rule, o.b and o.a.a are tightly
+   * packed and there is no gap.
+   */
+  if (glsl_type_is_64bit(type))
+ *offset = ALIGN_POT(*offset, 8);
+
   assert(attrib_slots <= 2);
   for (unsigned s = 0; s < attrib_slots; s++) {
  nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] nir: fix example in opt_peel_loop_initial_if description

---
 src/compiler/nir/nir_opt_if.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index 9afb901be14..170caaad89d 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -96,7 +96,7 @@ 
phi_has_constant_from_outside_and_one_from_inside_loop(nir_phi_instr *phi,
  *block block_1:
  *vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5
  *vec1 32 ssa_3 = phi block_0: ssa_0, block_7: ssa_1
- *if ssa_2 {
+ *if ssa_3 {
  *   block block_2:
  *   vec1 32 ssa_4 = load_const (0x0001)
  *   vec1 32 ssa_5 = iadd ssa_2, ssa_4
@@ -121,9 +121,9 @@ 
phi_has_constant_from_outside_and_one_from_inside_loop(nir_phi_instr *phi,
  * // Stuff from block 3
  * loop {
  *block block_1:
- *vec1 32 ssa_3 = phi block_0: ssa_0, block_7: ssa_1
+ *vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5
  *vec1 32 ssa_6 = load_const (0x0004)
- *vec1 32 ssa_7 = ilt ssa_5, ssa_6
+ *vec1 32 ssa_7 = ilt ssa_2, ssa_6
  *if ssa_7 {
  *   block block_5:
  *} else {
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109201] Deep Rock Galactic: GPU Hang (Steam Play) (DXVK)

https://bugs.freedesktop.org/show_bug.cgi?id=109201

--- Comment #15 from Alexander  ---
unfortunately still the same

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: remove jump from two merging jump-ending blocks

On Tue, Feb 12, 2019 at 04:38:04PM +0100, Juan A. Suarez Romero wrote:
> In opt_peel_initial_if optimization, when moving the continue list to
> end of the continue block, before the jump, could happen that the
> continue list itself also ends with a jump.
> 
> This would mean that we would have two jump instructions in a row: the
> first one from the continue list and the second one from the continue
> block.
> 
> As inserting an instruction after a jump is not allowed (and it does not
> make sense, as it will not be executed), remove the jump from the
> continue block and keep the one from continue list, as it will be
> executed first.
> 
> CC: Jason Ekstrand 
> ---
>  src/compiler/nir/nir_opt_if.c | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)


Reviewed-by: Caio Marcelo de Oliveira Filho 



> diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
> index 932af9e37ab..a011401b3b4 100644
> --- a/src/compiler/nir/nir_opt_if.c
> +++ b/src/compiler/nir/nir_opt_if.c
> @@ -241,12 +241,29 @@ opt_peel_loop_initial_if(nir_loop *loop)
> nir_cf_reinsert(,
> nir_after_block_before_jump(find_continue_block(loop)));
>  
> +   bool continue_list_jumps =
> +  nir_block_ends_in_jump(exec_node_data(nir_block,
> +
> exec_list_get_tail(continue_list),
> +cf_node.node));
> +
> nir_cf_extract(, nir_before_cf_list(continue_list),
>  nir_after_cf_list(continue_list));
>  
> -   /* Get continue block again as the previous reinsert might have removed 
> the block. */
> +   /* Get continue block again as the previous reinsert might have removed 
> the
> +* block.  Also, if both the continue list and the continue block ends in
> +* jump instructions, removes the jump from the later, as it will not be

"latter"

> +* executed if we insert the continue list before it */

"...before it."


> +
> +   nir_block *continue_block = find_continue_block(loop);
> +
> +   if (continue_list_jumps) {
> +  nir_instr *last_instr = nir_block_last_instr(continue_block);
> +  if (last_instr && last_instr->type == nir_instr_type_jump)
> + nir_instr_remove(last_instr);
> +   }
> +
> nir_cf_reinsert(,
> -   nir_after_block_before_jump(find_continue_block(loop)));
> +   nir_after_block_before_jump(continue_block));
>  
> nir_cf_node_remove(>cf_node);
>  
> -- 
> 2.20.1
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Caio
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: allow stitching of non-empty block

Just saw your patch.  I'll review that one then :-)

On Tue, Feb 12, 2019 at 09:38:32AM -0800, Caio Marcelo de Oliveira Filho wrote:
> Hi Juan,
> 
> On Tue, Feb 12, 2019 at 04:37:23PM +0100, Juan A. Suarez Romero wrote:
> > On Fri, 2019-02-08 at 15:39 -0600, Jason Ekstrand wrote:
> > > I had a chat with Caio about this and I'm skeptical.  In general, users 
> > > of the CF manipulation code shouldn't be stitching two blocks together 
> > > where the first contains a jump and the second is non-empty.  If the 
> > > caller knows that this case is ok, then they can check for it and empty 
> > > out the one block before stitching.  Also, I'm not really seeing how 
> > > peel_initial_if would hit this case from your example.
> > > 
> > > 
> > The problem happens when moving the continue list to the end of continue 
> > block in loop; the former ends in a jump ("break") and the latter also ends 
> > in a jump ("continue"), so stitch block complains because there will be an 
> > instruction (the "continue") after the jump (the "break").
> 
> I was investigating this yesterday and attempted to write a MR, could
> you take a look?
> 
> https://gitlab.freedesktop.org/mesa/mesa/merge_requests/238
> 
> 
>   Caio


Caio
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/2] gallium/u_tests: use a compute-only context to test GCN compute ring

From: Marek Olšák 

---
 src/gallium/auxiliary/util/u_tests.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_tests.c 
b/src/gallium/auxiliary/util/u_tests.c
index 365d4fa8f17..7b87337bb98 100644
--- a/src/gallium/auxiliary/util/u_tests.c
+++ b/src/gallium/auxiliary/util/u_tests.c
@@ -783,25 +783,23 @@ test_texture_barrier(struct pipe_context *ctx, bool 
use_fbfetch,
ctx->delete_fs_state(ctx, fs);
pipe_sampler_view_reference(, NULL);
pipe_resource_reference(, NULL);
 
util_report_result_helper(pass, name);
 }
 
 static void
 test_compute_clear_image(struct pipe_context *ctx)
 {
-   struct cso_context *cso;
struct pipe_resource *cb;
const char *text;
 
-   cso = cso_create_context(ctx, 0);
cb = util_create_texture2d(ctx->screen, 256, 256,
   PIPE_FORMAT_R8G8B8A8_UNORM, 1);
 
/* Compute shader. */
text = "COMP\n"
   "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
   "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
   "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
   "DCL SV[0], THREAD_ID\n"
   "DCL SV[1], BLOCK_ID\n"
@@ -820,21 +818,21 @@ test_compute_clear_image(struct pipe_context *ctx)
   assert(0);
   util_report_result(FAIL);
   return;
}
 
struct pipe_compute_state state = {0};
state.ir_type = PIPE_SHADER_IR_TGSI;
state.prog = tokens;
 
void *compute_shader = ctx->create_compute_state(ctx, );
-   cso_set_compute_shader_handle(cso, compute_shader);
+   ctx->bind_compute_state(ctx, compute_shader);
 
/* Bind the image. */
struct pipe_image_view image = {0};
image.resource = cb;
image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ_WRITE;
image.format = cb->format;
 
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, );
 
/* Dispatch compute. */
@@ -847,21 +845,20 @@ test_compute_clear_image(struct pipe_context *ctx)
info.grid[2] = 1;
 
ctx->launch_grid(ctx, );
 
/* Check pixels. */
static const float expected[] = {1.0, 0.0, 0.0, 0.0};
bool pass = util_probe_rect_rgba(ctx, cb, 0, 0,
 cb->width0, cb->height0, expected);
 
/* Cleanup. */
-   cso_destroy_context(cso);
ctx->delete_compute_state(ctx, compute_shader);
pipe_resource_reference(, NULL);
 
util_report_result(pass);
 }
 
 /**
  * Run all tests. This should be run with a clean context after
  * context_create.
  */
@@ -874,18 +871,19 @@ util_run_tests(struct pipe_screen *screen)
tgsi_vs_window_space_position(ctx);
null_sampler_view(ctx, TGSI_TEXTURE_2D);
null_sampler_view(ctx, TGSI_TEXTURE_BUFFER);
util_test_constant_buffer(ctx, NULL);
test_sync_file_fences(ctx);
 
for (int i = 1; i <= 8; i = i * 2)
   test_texture_barrier(ctx, false, i);
for (int i = 1; i <= 8; i = i * 2)
   test_texture_barrier(ctx, true, i);
+   ctx->destroy(ctx);
 
+   ctx = screen->context_create(screen, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
test_compute_clear_image(ctx);
-
ctx->destroy(ctx);
 
puts("Done. Exiting..");
exit(0);
 }
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: allow stitching of non-empty block

On Tue, 2019-02-12 at 09:38 -0800, Caio Marcelo de Oliveira Filho wrote:
> Hi Juan,
> 
> On Tue, Feb 12, 2019 at 04:37:23PM +0100, Juan A. Suarez Romero wrote:
> > On Fri, 2019-02-08 at 15:39 -0600, Jason Ekstrand wrote:
> > > I had a chat with Caio about this and I'm skeptical.  In general, users 
> > > of the CF manipulation code shouldn't be stitching two blocks together 
> > > where the first contains a jump and the second is non-empty.  If the 
> > > caller knows that this case is ok, then they can check for it and empty 
> > > out the one block before stitching.  Also, I'm not really seeing how 
> > > peel_initial_if would hit this case from your example.
> > > 
> > > 
> > The problem happens when moving the continue list to the end of continue 
> > block in loop; the former ends in a jump ("break") and the latter also ends 
> > in a jump ("continue"), so stitch block complains because there will be an 
> > instruction (the "continue") after the jump (the "break").
> 
> I was investigating this yesterday and attempted to write a MR, could
> you take a look?
> 
> https://gitlab.freedesktop.org/mesa/mesa/merge_requests/238
> 
> 


I had sent a patch to fix it (https://patchwork.freedesktop.org/patch/285649/)
which is similar to your MR.

Other than that, your MR also fixes the issue.

J.A.



>   Caio
> 

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 1/2] radeonsi: always use compute rings for clover on CI and newer (v2)

From: Marek Olšák 

initialize all non-compute context functions to NULL.

v2: fix SI
---
 src/gallium/drivers/radeonsi/si_blit.c| 14 ++-
 src/gallium/drivers/radeonsi/si_clear.c   |  7 +-
 src/gallium/drivers/radeonsi/si_compute.c | 15 +--
 src/gallium/drivers/radeonsi/si_descriptors.c | 10 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c  | 29 +++---
 src/gallium/drivers/radeonsi/si_pipe.c| 95 +++
 src/gallium/drivers/radeonsi/si_pipe.h|  3 +-
 src/gallium/drivers/radeonsi/si_state.c   |  3 +-
 src/gallium/drivers/radeonsi/si_state.h   |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c  | 25 +++--
 src/gallium/drivers/radeonsi/si_texture.c |  3 +
 11 files changed, 130 insertions(+), 75 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c 
b/src/gallium/drivers/radeonsi/si_blit.c
index bb8d1cbd12d..f39cb5d143f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1345,25 +1345,31 @@ static void si_flush_resource(struct pipe_context *ctx,
 
if (separate_dcc_dirty) {
tex->separate_dcc_dirty = false;
vi_separate_dcc_process_and_reset_stats(ctx, tex);
}
}
 }
 
 void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex)
 {
-   if (!tex->dcc_offset)
+   /* If graphics is disabled, we can't decompress DCC, but it shouldn't
+* be compressed either. The caller should simply discard it.
+*/
+   if (!tex->dcc_offset || !sctx->has_graphics)
return;
 
si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level,
 0, util_max_layer(>buffer.b.b, 0),
 true);
 }
 
 void si_init_blit_functions(struct si_context *sctx)
 {
sctx->b.resource_copy_region = si_resource_copy_region;
-   sctx->b.blit = si_blit;
-   sctx->b.flush_resource = si_flush_resource;
-   sctx->b.generate_mipmap = si_generate_mipmap;
+
+   if (sctx->has_graphics) {
+   sctx->b.blit = si_blit;
+   sctx->b.flush_resource = si_flush_resource;
+   sctx->b.generate_mipmap = si_generate_mipmap;
+   }
 }
diff --git a/src/gallium/drivers/radeonsi/si_clear.c 
b/src/gallium/drivers/radeonsi/si_clear.c
index 9a00bb73b94..e1805f2a1c9 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -764,15 +764,18 @@ static void si_clear_texture(struct pipe_context *pipe,
util_clear_render_target(pipe, sf, ,
 box->x, box->y,
 box->width, box->height);
}
}
pipe_surface_reference(, NULL);
 }
 
 void si_init_clear_functions(struct si_context *sctx)
 {
-   sctx->b.clear = si_clear;
sctx->b.clear_render_target = si_clear_render_target;
-   sctx->b.clear_depth_stencil = si_clear_depth_stencil;
sctx->b.clear_texture = si_clear_texture;
+
+   if (sctx->has_graphics) {
+   sctx->b.clear = si_clear;
+   sctx->b.clear_depth_stencil = si_clear_depth_stencil;
+   }
 }
diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 1a62b3e0844..87addd53976 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -880,26 +880,28 @@ static void si_launch_grid(
info->block[0] * info->block[1] * info->block[2] > 256;
 
if (cs_regalloc_hang)
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
if (program->ir_type != PIPE_SHADER_IR_NATIVE &&
program->shader.compilation_failed)
return;
 
-   if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
-   si_update_fb_dirtiness_after_rendering(sctx);
-   sctx->last_num_draw_calls = sctx->num_draw_calls;
-   }
+   if (sctx->has_graphics) {
+   if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
+   si_update_fb_dirtiness_after_rendering(sctx);
+   sctx->last_num_draw_calls = sctx->num_draw_calls;
+   }
 
-   si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
+   si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
+   }
 
/* Add buffer sizes for memory checking in need_cs_space. */
si_context_add_resource_size(sctx, >shader.bo->b.b);
/* TODO: add the scratch buffer */
 
if (info->indirect) {
si_context_add_resource_size(sctx, info->indirect);
 
/* Indirect buffers use TC L2 on GFX9, but not older hw. */
if (sctx->chip_class <= VI &&
@@ -917,21 +919,22 @@ static void

Re: [Mesa-dev] [PATCH] anv/cmd_buffer: check for NULL framebuffer

On Tue, 2019-02-12 at 11:31 -0600, Jason Ekstrand wrote:
> On Tue, Feb 12, 2019 at 10:48 AM Juan A. Suarez Romero  
> wrote:
> > This can happen when we record a VkCmdDraw in a secondary buffer that
> > 
> > was created inheriting from the primary buffer, but with the framebuffer
> > 
> > set to NULL in the VkCommandBufferInheritanceInfo.
> > 
> > 
> > 
> > Vulkan 1.1.81 spec says that "the application must ensure (using scissor
> > 
> > if necessary) that all rendering is contained in the render area [...]
> > 
> > [which] must be contained within the framebuffer dimensions".
> > 
> > 
> > 
> > While this should be done by the application, commit 465e5a86 added the
> > 
> > clamp to the framebuffer size, in case of application does not do it.
> > 
> > But this requires to know the framebuffer dimensions.
> > 
> > 
> > 
> > If we do not have a framebuffer at that moment, the best compromise we
> > 
> > can do is to just apply the scissor as it is, and let the application to
> > 
> > ensure the rendering is contained in the render area.
> > 
> > 
> > 
> > v2: do not clamp to framebuffer if there isn't a framebuffer
> > 
> > 
> > 
> > v3 (Jason):
> > 
> > - clamp earlier in the conditional
> > 
> > - clamp to render area if command buffer is primary
> > 
> > 
> > 
> > v4: clamp also x and y to render area (Jason)
> > 
> > 
> > 
> > Fixes: 465e5a86 ("anv: Clamp scissors to the framebuffer boundary")
> > 
> > CC: Jason Ekstrand 
> > 
> > ---
> > 
> >  src/intel/vulkan/gen7_cmd_buffer.c | 32 +-
> > 
> >  1 file changed, 27 insertions(+), 5 deletions(-)
> > 
> > 
> > 
> > diff --git a/src/intel/vulkan/gen7_cmd_buffer.c 
> > b/src/intel/vulkan/gen7_cmd_buffer.c
> > 
> > index 352892aee33..2924c6031fd 100644
> > 
> > --- a/src/intel/vulkan/gen7_cmd_buffer.c
> > 
> > +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> > 
> > @@ -70,12 +70,34 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer 
> > *cmd_buffer)
> > 
> >};
> > 
> > 
> > 
> >const int max = 0x;
> > 
> > +
> > 
> > +  uint32_t y = s->offset.y;
> > 
> > +  uint32_t x = s->offset.x;
> > 
> > +  uint32_t height = s->offset.y + s->extent.height - 1;
> > 
> > +  uint32_t width = s->offset.x + s->extent.width - 1;
> 
> These should be x_max and y_max not width and height.  With that changed,

Right. I'll also rename "x" and "y" to "x_min" and "y_min".
> Reviewed-by: Jason Ekstrand 
> 
> Sorry we're going to v5...
> 
 No problem!
J.A.
> --Jason
>  
> > +
> > 
> > +  /* Do this math using int64_t so overflow gets clamped correctly. */
> > 
> > +  if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
> > 
> > + y = clamp_int64((uint64_t) y, 
> > cmd_buffer->state.render_area.offset.y, max);
> > 
> > + x = clamp_int64((uint64_t) x, 
> > cmd_buffer->state.render_area.offset.x, max);
> > 
> > + height = clamp_int64((uint64_t) height, 0,
> > 
> > +  cmd_buffer->state.render_area.offset.y +
> > 
> > +  cmd_buffer->state.render_area.extent.height 
> > - 1);
> > 
> > + width = clamp_int64((uint64_t) width, 0,
> > 
> > + cmd_buffer->state.render_area.offset.x +
> > 
> > + cmd_buffer->state.render_area.extent.width - 
> > 1);
> > 
> > +  } else if (fb) {
> > 
> > + y = clamp_int64((uint64_t) y, 0, max);
> > 
> > + x = clamp_int64((uint64_t) x, 0, max);
> > 
> > + height = clamp_int64((uint64_t) height, 0, fb->height - 1);
> > 
> > + width = clamp_int64((uint64_t) width, 0, fb->width - 1);
> > 
> > +  }
> > 
> > +
> > 
> >struct GEN7_SCISSOR_RECT scissor = {
> > 
> > - /* Do this math using int64_t so overflow gets clamped correctly. 
> > */
> > 
> > - .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
> > 
> > - .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
> > 
> > - .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + 
> > s->extent.height - 1, 0, fb->height - 1),
> > 
> > - .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + 
> > s->extent.width - 1, 0, fb->width - 1)
> > 
> > + .ScissorRectangleYMin = y,
> > 
> > + .ScissorRectangleXMin = x,
> > 
> > + .ScissorRectangleYMax = height,
> > 
> > + .ScissorRectangleXMax = width
> > 
> >};
> > 
> > 
> > 
> >if (s->extent.width <= 0 || s->extent.height <= 0) {
> > 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 34/40] intel/compiler: validate region restrictions for half-float conversions

---
 src/intel/compiler/brw_eu_validate.c|  64 -
 src/intel/compiler/test_eu_validate.cpp | 122 
 2 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index 000a05cb6ac..203641fecb9 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -531,7 +531,69 @@ general_restrictions_based_on_operand_types(const struct 
gen_device_info *devinf
exec_type_size == 8 && dst_type_size == 4)
   dst_type_size = 8;
 
-   if (exec_type_size > dst_type_size) {
+   /* From the BDW+ PRM:
+*
+*"There is no direct conversion from HF to DF or DF to HF.
+* There is no direct conversion from HF to Q/UQ or Q/UQ to HF."
+*/
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+   ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
+((dst_type == BRW_REGISTER_TYPE_HF && type_sz(src0_type) == 8) ||
+ (dst_type_size == 8 && src0_type == BRW_REGISTER_TYPE_HF)),
+"There are no direct conversion between 64-bit types and HF");
+
+   /* From the BDW+ PRM:
+*
+*   "Conversion between Integer and HF (Half Float) must be
+*DWord-aligned and strided by a DWord on the destination."
+*
+* But this seems to be expanded on CHV and SKL+ by:
+*
+*   "There is a relaxed alignment rule for word destinations. When
+*the destination type is word (UW, W, HF), destination data types
+*can be aligned to either the lowest word or the second lowest
+*word of the execution channel. This means the destination data
+*words can be either all in the even word locations or all in the
+*odd word locations."
+*
+* We do not implement the second rule as is though, since empirical testing
+* shows inconsistencies:
+*   - It suggests that packed 16-bit is not allowed, which is not true.
+*   - It suggests that conversions from Q/DF to W (which need to be 64-bit
+* aligned on the destination) are not possible, which is not true.
+*   - It suggests that conversions from 16-bit executions types to W need
+* to be 32-bit aligned, which doesn't seem to be necessary.
+*
+* So from this rule we only validate the implication that conversion from
+* F to HF needs to be DWord aligned too (in BDW this is limited to
+* conversions from integer types).
+*/
+   bool is_half_float_conversion =
+   brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
+   dst_type != src0_type &&
+   (dst_type == BRW_REGISTER_TYPE_HF || src0_type == BRW_REGISTER_TYPE_HF);
+
+   if (is_half_float_conversion) {
+  assert(devinfo->gen >= 8);
+
+  if ((dst_type == BRW_REGISTER_TYPE_HF && 
brw_reg_type_is_integer(src0_type)) ||
+  (brw_reg_type_is_integer(dst_type) && src0_type == 
BRW_REGISTER_TYPE_HF)) {
+ ERROR_IF(dst_stride * dst_type_size != 4,
+  "Conversions between integer and half-float must be strided "
+  "by a DWord on the destination");
+
+ unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+ ERROR_IF(subreg % 4 != 0,
+  "Conversions between integer and half-float must be aligned "
+  "to a DWord on the destination");
+  } else if ((devinfo->is_cherryview || devinfo->gen >= 9) &&
+ dst_type == BRW_REGISTER_TYPE_HF) {
+ ERROR_IF(dst_stride != 2,
+  "Conversions to HF must have either all words in even word "
+  "locations or all words in odd word locations");
+  }
+
+   } else if (exec_type_size > dst_type_size) {
   if (!(dst_type_is_byte && inst_is_raw_move(devinfo, inst))) {
  ERROR_IF(dst_stride * dst_type_size != exec_type_size,
   "Destination stride must be equal to the ratio of the sizes "
diff --git a/src/intel/compiler/test_eu_validate.cpp 
b/src/intel/compiler/test_eu_validate.cpp
index 73300b23122..1557b6d2452 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -848,6 +848,128 @@ TEST_P(validation_test, 
byte_destination_relaxed_alignment)
}
 }
 
+TEST_P(validation_test, half_float_conversion)
+{
+   static const struct {
+  enum brw_reg_type dst_type;
+  enum brw_reg_type src_type;
+  unsigned dst_stride;
+  unsigned dst_subnr;
+  bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src_type, dst_stride, dst_subnr, expected_result)  \
+  {   \
+ BRW_REGISTER_TYPE_##dst_type,\
+ BRW_REGISTER_TYPE_##src_type,\
+ BRW_HORIZONTAL_STRIDE_##dst_stride,  \
+ dst_subnr,

[Mesa-dev] [PATCH] anv/cmd_buffer: check for NULL framebuffer

This can happen when we record a VkCmdDraw in a secondary buffer that
was created inheriting from the primary buffer, but with the framebuffer
set to NULL in the VkCommandBufferInheritanceInfo.

Vulkan 1.1.81 spec says that "the application must ensure (using scissor
if necessary) that all rendering is contained in the render area [...]
[which] must be contained within the framebuffer dimensions".

While this should be done by the application, commit 465e5a86 added the
clamp to the framebuffer size, in case the application does not do it.
But this requires knowing the framebuffer dimensions.

If we do not have a framebuffer at that moment, the best compromise we
can make is to just apply the scissor as it is, and let the application
ensure the rendering is contained in the render area.

v2: do not clamp to framebuffer if there isn't a framebuffer

v3 (Jason):
- clamp earlier in the conditional
- clamp to render area if command buffer is primary

v4: clamp also x and y to render area (Jason)

Fixes: 465e5a86 ("anv: Clamp scissors to the framebuffer boundary")
CC: Jason Ekstrand 
---
 src/intel/vulkan/gen7_cmd_buffer.c | 32 +-
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/src/intel/vulkan/gen7_cmd_buffer.c 
b/src/intel/vulkan/gen7_cmd_buffer.c
index 352892aee33..2924c6031fd 100644
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -70,12 +70,34 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer 
*cmd_buffer)
   };
 
   const int max = 0x;
+
+  uint32_t y = s->offset.y;
+  uint32_t x = s->offset.x;
+  uint32_t height = s->offset.y + s->extent.height - 1;
+  uint32_t width = s->offset.x + s->extent.width - 1;
+
+  /* Do this math using int64_t so overflow gets clamped correctly. */
+  if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ y = clamp_int64((uint64_t) y, cmd_buffer->state.render_area.offset.y, 
max);
+ x = clamp_int64((uint64_t) x, cmd_buffer->state.render_area.offset.x, 
max);
+ height = clamp_int64((uint64_t) height, 0,
+  cmd_buffer->state.render_area.offset.y +
+  cmd_buffer->state.render_area.extent.height - 1);
+ width = clamp_int64((uint64_t) width, 0,
+ cmd_buffer->state.render_area.offset.x +
+ cmd_buffer->state.render_area.extent.width - 1);
+  } else if (fb) {
+ y = clamp_int64((uint64_t) y, 0, max);
+ x = clamp_int64((uint64_t) x, 0, max);
+ height = clamp_int64((uint64_t) height, 0, fb->height - 1);
+ width = clamp_int64((uint64_t) width, 0, fb->width - 1);
+  }
+
   struct GEN7_SCISSOR_RECT scissor = {
- /* Do this math using int64_t so overflow gets clamped correctly. */
- .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
- .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
- .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + 
s->extent.height - 1, 0, fb->height - 1),
- .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + 
s->extent.width - 1, 0, fb->width - 1)
+ .ScissorRectangleYMin = y,
+ .ScissorRectangleXMin = x,
+ .ScissorRectangleYMax = height,
+ .ScissorRectangleXMax = width
   };
 
   if (s->extent.width <= 0 || s->extent.height <= 0) {
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 03/40] intel/compiler: split float to 64-bit opcodes from int to 64-bit

Going forward having these split is a bit more convenient since these two
groups have different restrictions.

v2:
 - Rebased on top of new regioning lowering pass.

Reviewed-by: Topi Pohjolainen  (v1)
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 1041296b903..bb7591422d4 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,10 +801,17 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
+  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
+  assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
+  /* fallthrough */
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v4 5/5] gallium/auxiliary/vl: Add video compositor compute shader render

2019-02-12 Thread Zhu, James

Thank you for the advice.


James


From: Marek Olšák 
Sent: Monday, February 11, 2019 4:56 PM
To: Zhu, James
Cc: mesa-dev@lists.freedesktop.org; jzh...@gmail.com
Subject: Re: [Mesa-dev] [PATCH v4 5/5] gallium/auxiliary/vl: Add video 
compositor compute shader render

Instead of querying PIPE_CAP_COMPUTE multiple times, you can save the returned 
value at initialization once and use that.

On Fri, Feb 8, 2019 at 2:22 PM Zhu, James 
mailto:james@amd.com>> wrote:
Add compute shader initialization, assignment and cleanup in the vl_compositor API.
Set video compositor compute shader render as default when the pipe supports it.

Signed-off-by: James Zhu mailto:james@amd.com>>
Reviewed-by: Christian König 
mailto:christian.koe...@amd.com>>
---
 src/gallium/auxiliary/vl/vl_compositor.c | 105 ++-
 src/gallium/auxiliary/vl/vl_compositor.h |   3 +
 2 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_compositor.c 
b/src/gallium/auxiliary/vl/vl_compositor.c
index 4509913..2d624c8 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -28,6 +28,7 @@
 #include "util/u_sampler.h"

 #include "vl_compositor_gfx.h"
+#include "vl_compositor_cs.h"

 static bool
 init_shaders(struct vl_compositor *c)
@@ -40,18 +41,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}

-   c->fs_video_buffer = create_frag_shader_video_buffer(c);
-   if (!c->fs_video_buffer) {
-  debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
-  return false;
-   }
-
-   c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
-   if (!c->fs_weave_rgb) {
-  debug_printf("Unable to create YCbCr-to-RGB weave fragment shader.\n");
-  return false;
-   }
-
c->fs_yuv.weave.y = create_frag_shader_deint_yuv(c, true, true);
c->fs_yuv.weave.uv = create_frag_shader_deint_yuv(c, false, true);
c->fs_yuv.bob.y = create_frag_shader_deint_yuv(c, true, false);

If compute is used, I wouldn't like any graphics shaders to be created.

Marek

@@ -74,12 +63,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}

-   c->fs_rgba = create_frag_shader_rgba(c);
-   if (!c->fs_rgba) {
-  debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
-  return false;
-   }
-
c->fs_rgb_yuv.y = create_frag_shader_rgb_yuv(c, true);
c->fs_rgb_yuv.uv = create_frag_shader_rgb_yuv(c, false);
if (!c->fs_rgb_yuv.y || !c->fs_rgb_yuv.uv) {
@@ -87,6 +70,44 @@ init_shaders(struct vl_compositor *c)
   return false;
}

+   if (c->pipe->screen->get_param(c->pipe->screen, PIPE_CAP_COMPUTE)) {
+  c->cs_video_buffer = vl_compositor_cs_create_shader(c, 
compute_shader_video_buffer);
+  if (!c->cs_video_buffer) {
+ debug_printf("Unable to create video_buffer compute shader.\n");
+ return false;
+  }
+
+  c->cs_weave_rgb = vl_compositor_cs_create_shader(c, 
compute_shader_weave);
+  if (!c->cs_weave_rgb) {
+ debug_printf("Unable to create weave_rgb compute shader.\n");
+ return false;
+  }
+
+  c->cs_rgba = vl_compositor_cs_create_shader(c, compute_shader_rgba);
+  if (!c->cs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB compute shader.\n");
+ return false;
+  }
+   } else {
+  c->fs_video_buffer = create_frag_shader_video_buffer(c);
+  if (!c->fs_video_buffer) {
+ debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
+ return false;
+  }
+
+  c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
+  if (!c->fs_weave_rgb) {
+ debug_printf("Unable to create YCbCr-to-RGB weave fragment 
shader.\n");
+ return false;
+  }
+
+  c->fs_rgba = create_frag_shader_rgba(c);
+  if (!c->fs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
+ return false;
+  }
+   }
+
return true;
 }

@@ -95,17 +116,24 @@ static void cleanup_shaders(struct vl_compositor *c)
assert(c);

c->pipe->delete_vs_state(c->pipe, c->vs);
-   c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
-   c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.yuv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.rgb);
-   c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.y);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.uv);
+
+   if (c->pipe->screen->get_param(c->pipe->screen, PIPE_CAP_COMPUTE)) {
+  c->pipe->delete_compute_state(c->pipe, c->cs_video_buffer);
+  c->pipe->delete_compute_state(c->pipe, c->cs_weave_rgb);
+

[Mesa-dev] [Bug 109535] [Tracker] Mesa 19.0 release tracker

https://bugs.freedesktop.org/show_bug.cgi?id=109535
Bug 109535 depends on bug 109543, which changed state.

Bug 109543 Summary: After upgrade mesa to 19.0.0~rc1 all vulkan based 
application stop working ["vulkan-cube" received SIGSEGV in 
radv_pipeline_init_blend_state at ../src/amd/vulkan/radv_pipeline.c:699]
https://bugs.freedesktop.org/show_bug.cgi?id=109543

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |FIXED

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] mesa: INVALID_VALUE for wrong type or format in ClearBufferData

2019-02-12 Thread Andres Gomez

Instead of generating a GL_INVALID_ENUM error when the type or format
is incorrect while using glClear{Named}Buffer{Sub}Data, generate
GL_INVALID_VALUE.

From page 72 (page 94 of the PDF) of the OpenGL 4.6 spec:

  " An INVALID_VALUE error is generated if type is not one of the
types in table 8.2.

An INVALID_VALUE error is generated if format is not one of the
formats in table 8.3."

Fixes the following test:
KHR-GL45.direct_state_access.buffers_errors

Cc: Pi Tabred 
Cc: Brian Paul 
Signed-off-by: Andres Gomez 
---
 src/mesa/main/bufferobj.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 534326858bb..25b47ddab66 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -346,7 +346,7 @@ buffer_object_subdata_range_good(struct gl_context *ctx,
 
 /**
  * Test the format and type parameters and set the GL error code for
- * \c glClearBufferData and \c glClearBufferSubData.
+ * \c glClear{Named}Buffer{Sub}Data.
  *
  * \param ctx GL context.
  * \param internalformat  Format to which the data is to be converted.
@@ -356,7 +356,7 @@ buffer_object_subdata_range_good(struct gl_context *ctx,
  * \return   If internalformat, format and type are legal the mesa_format
  *   corresponding to internalformat, otherwise MESA_FORMAT_NONE.
  *
- * \sa glClearBufferData and glClearBufferSubData
+ * \sa glClear{Named}Buffer{Sub}Data
  */
 static mesa_format
 validate_clear_buffer_format(struct gl_context *ctx,
@@ -386,14 +386,14 @@ validate_clear_buffer_format(struct gl_context *ctx,
}
 
if (!_mesa_is_color_format(format)) {
-  _mesa_error(ctx, GL_INVALID_ENUM,
+  _mesa_error(ctx, GL_INVALID_VALUE,
   "%s(format is not a color format)", caller);
   return MESA_FORMAT_NONE;
}
 
errorFormatType = _mesa_error_check_format_and_type(ctx, format, type);
if (errorFormatType != GL_NO_ERROR) {
-  _mesa_error(ctx, GL_INVALID_ENUM,
+  _mesa_error(ctx, GL_INVALID_VALUE,
   "%s(invalid format or type)", caller);
   return MESA_FORMAT_NONE;
}
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 24/40] intel/compiler: implement isign for int8

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 25 +
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 3a6e4a2eb60..40c0481ac53 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -906,11 +906,28 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
*  Predicated OR sets 1 if val is positive.
*/
   uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
-  assert(bit_size == 32 || bit_size == 16);
 
-  fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
-  fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
-  fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+  fs_reg zero, one, shift;
+  switch (bit_size) {
+  case 32:
+ zero = brw_imm_d(0);
+ one = brw_imm_d(1);
+ shift = brw_imm_d(31);
+ break;
+  case 16:
+ zero = brw_imm_w(0);
+ one = brw_imm_w(1);
+ shift = brw_imm_w(15);
+ break;
+  case 8: {
+ zero = setup_imm_b(bld, 0);
+ one = setup_imm_b(bld, 1);
+ shift = setup_imm_b(bld, 7);
+ break;
+  }
+  default:
+ unreachable("unsupported bit-size");
+  };
 
   bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
   bld.ASR(result, op[0], shift);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 15/40] intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits

We are now using these bits, so don't assert that they are not set. In gen8,
if these bits are set compaction is not possible. On gen9 and CHV platforms
set_3src_control_index() checks these bits (and others) against a table to
validate if the particular bit combination is eligible for compaction or not.

v2
 - Add more detail in the commit message explaining the situation for SKL+
   and CHV (Jason)

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_eu_compact.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_compact.c 
b/src/intel/compiler/brw_eu_compact.c
index ae14ef10ec0..20fed254331 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info 
*devinfo,
   assert(!brw_inst_bits(src, 127, 126) &&
  !brw_inst_bits(src, 105, 105) &&
  !brw_inst_bits(src, 84, 84) &&
- !brw_inst_bits(src, 36, 35) &&
  !brw_inst_bits(src, 7,  7));
+
+  /* Src1Type and Src2Type, used for mixed-precision floating point */
+  if (brw_inst_bits(src, 36, 35))
+ return true;
}
 
return false;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 17/40] intel/compiler: set correct precision fields for 3-source float instructions

Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the destination for
all operands when we emit 3-source instructions, we only need to set Src1Type
and Src2Type to 1 when we are emitting a half-precision instruction.

v2:
 - Set the bit separately for each source based on its type so we can
   do mixed floating-point mode in the future (Topi).

v3:
 - Use regular citation style for the comment referencing the PRM (Matt).
 - Decided not to add asserts in the emission code to check that only
   mixed HF/F types are used since such checks would break negative tests
   for brw_eu_validate.c (Matt)

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_eu_emit.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 30037e71b00..195c26ab760 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -843,6 +843,22 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct 
brw_reg dest,
   */
  brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
  brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
+  *
+  *"Three source instructions can use operands with mixed-mode
+  * precision. When SrcType field is set to :f or :hf it defines
+  * precision for source 0 only, and fields Src1Type and Src2Type
+  * define precision for other source operands:
+  *
+  * 0b = :f. Single precision Float (32-bit).
+  * 1b = :hf. Half precision Float (16-bit)."
+  */
+ if (src1.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+
+ if (src2.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
   }
}
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/4] radv: use MAX_{VBS, VERTEX_ATTRIBS} when defining max vertex input limits

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 0fef92773e1..9778b13ce86 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -934,8 +934,8 @@ void radv_GetPhysicalDeviceProperties(
.maxDescriptorSetSampledImages= 
max_descriptor_set_size,
.maxDescriptorSetStorageImages= 
max_descriptor_set_size,
.maxDescriptorSetInputAttachments = 
max_descriptor_set_size,
-   .maxVertexInputAttributes = 32,
-   .maxVertexInputBindings   = 32,
+   .maxVertexInputAttributes = MAX_VERTEX_ATTRIBS,
+   .maxVertexInputBindings   = MAX_VBS,
.maxVertexInputAttributeOffset= 2047,
.maxVertexInputBindingStride  = 2048,
.maxVertexOutputComponents= 128,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] nir: remove jump from two merging jump-ending blocks

In the opt_peel_loop_initial_if optimization, when moving the continue list
to the end of the continue block, before the jump, it could happen that the
continue list itself also ends with a jump.

This would mean that we would have two jump instructions in a row: the
first one from the continue list and the second one from the continue
block.

As inserting an instruction after a jump is not allowed (and it does not
make sense, as it will not be executed), remove the jump from the
continue block and keep the one from continue list, as it will be
executed first.

CC: Jason Ekstrand 
---
 src/compiler/nir/nir_opt_if.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index 932af9e37ab..a011401b3b4 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -241,12 +241,29 @@ opt_peel_loop_initial_if(nir_loop *loop)
nir_cf_reinsert(,
nir_after_block_before_jump(find_continue_block(loop)));
 
+   bool continue_list_jumps =
+  nir_block_ends_in_jump(exec_node_data(nir_block,
+exec_list_get_tail(continue_list),
+cf_node.node));
+
nir_cf_extract(, nir_before_cf_list(continue_list),
 nir_after_cf_list(continue_list));
 
-   /* Get continue block again as the previous reinsert might have removed the 
block. */
+   /* Get continue block again as the previous reinsert might have removed the
+* block.  Also, if both the continue list and the continue block ends in
+* jump instructions, removes the jump from the later, as it will not be
+* executed if we insert the continue list before it */
+
+   nir_block *continue_block = find_continue_block(loop);
+
+   if (continue_list_jumps) {
+  nir_instr *last_instr = nir_block_last_instr(continue_block);
+  if (last_instr && last_instr->type == nir_instr_type_jump)
+ nir_instr_remove(last_instr);
+   }
+
nir_cf_reinsert(,
-   nir_after_block_before_jump(find_continue_block(loop)));
+   nir_after_block_before_jump(continue_block));
 
nir_cf_node_remove(>cf_node);
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: always export gl_SampleMask when the fragment shader uses it

For some reason, not exporting gl_SampleMask when MSAA is disabled breaks
tree rendering in Project Cars.

Fixes: 85010585cde ("radv: only enable gl_SampleMask if MSAA is enabled too")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109401
Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_pipeline.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index fb6c61cf3f0..a9df2b94b93 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3183,11 +3183,11 @@ radv_compute_db_shader_control(const struct radv_device 
*device,
bool disable_rbplus = device->physical_device->has_rbplus &&
  !device->physical_device->rbplus_allowed;
 
-   /* Do not enable the gl_SampleMask fragment shader output if MSAA is
-* disabled.
+   /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
+* but this appears to break Project Cars (DXVK). See
+* https://bugs.freedesktop.org/show_bug.cgi?id=109401
 */
-   bool mask_export_enable = ms->num_samples > 1 &&
- ps->info.info.ps.writes_sample_mask;
+   bool mask_export_enable = ps->info.info.ps.writes_sample_mask;
 
return  S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) |

S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) |
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109532] ir_variable has maximum access out of bounds -- but it's not out of bounds

https://bugs.freedesktop.org/show_bug.cgi?id=109532

--- Comment #26 from asimiklit  ---
(In reply to Ian Romanick from comment #17)
> (In reply to asimiklit from comment #6)
> > Created attachment 143288 [details]
> > this simple program helps me to reproduce this issue.
> > 
> > just share my simple reproducer)
> > 
> > Run it in this way:
> > 
> >simple_reproducer shader.comp
> 
> It seems like this could be made into a piglit test that could be compiled
> with glslparsertest.  If you haven't already submitted such a test, please
> do. :)

I work on it)

-- 
You are receiving this mail because:
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109575] Mesa-19.0.0-rc1 : Computer Crashes trying to run anything Vulkan

https://bugs.freedesktop.org/show_bug.cgi?id=109575

--- Comment #10 from Samuel Pitoiset  ---
Interesting, thanks for bisecting. I will investigate.

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 08/40] intel/compiler: implement 16-bit fsign

v2:
 - make 16-bit be its own separate case (Jason)

v3:
 - Drop the result_int temporary (Jason)

Reviewed-by: Topi Pohjolainen  (v1)
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 4c7a839390c..64e24f86b5a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -844,7 +844,21 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
 : bld.MOV(result, brw_imm_f(1.0f));
 
  set_predicate(BRW_PREDICATE_NORMAL, inst);
-  } else if (type_sz(op[0].type) < 8) {
+  } else if (type_sz(op[0].type) == 2) {
+ /* AND(val, 0x8000) gives the sign bit.
+  *
+  * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not 
zero.
+  */
+ fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+ op[0].type = BRW_REGISTER_TYPE_UW;
+ result.type = BRW_REGISTER_TYPE_UW;
+ bld.AND(result, op[0], brw_imm_uw(0x8000u));
+
+ inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+  } else if (type_sz(op[0].type) == 4) {
  /* AND(val, 0x8000) gives the sign bit.
   *
   * Predicated OR ORs 1.0 (0x3f80) with the sign bit if val is not
@@ -866,6 +880,7 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
   * - The sign is encoded in the high 32-bit of each DF
   * - We need to produce a DF result.
   */
+ assert(type_sz(op[0].type) == 8);
 
  fs_reg zero = vgrf(glsl_type::double_type);
  bld.MOV(zero, setup_imm_df(bld, 0.0));
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 13/40] intel/compiler: add instruction setters for Src1Type and Src2Type.

The original SrcType is a 3-bit field that takes a subset of the types
supported for the hardware for 3-source instructions. Since gen8,
when the half-float type was added, 3-source floating point operations
can use mixed precision mode, where not all the operands have the
same floating-point precision. While the precision for the first operand
is taken from the type in SrcType, the bits in Src1Type (bit 36) and
Src2Type (bit 35) define the precision for the other operands
(0: normal precision, 1: half precision).

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_inst.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index 71316f12215..1f55d45125d 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -222,6 +222,8 @@ F8(3src_src1_negate,39, 39, 40, 40)
 F8(3src_src1_abs,   38, 38, 39, 39)
 F8(3src_src0_negate,37, 37, 38, 38)
 F8(3src_src0_abs,   36, 36, 37, 37)
+F8(3src_a16_src1_type,  -1, -1, 36, 36)
+F8(3src_a16_src2_type,  -1, -1, 35, 35)
 F8(3src_a16_flag_reg_nr,34, 34, 33, 33)
 F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32)
 FF(3src_a16_dst_reg_file,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 01/40] compiler/nir: add an is_conversion field to nir_op_info

This is set to True only for numeric conversion opcodes.
---
 src/compiler/nir/nir.h|  3 ++
 src/compiler/nir/nir_opcodes.py   | 73 +--
 src/compiler/nir/nir_opcodes_c.py |  1 +
 3 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ff2c41faf27..2793662b1d9 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -926,6 +926,9 @@ typedef struct {
nir_alu_type input_types[NIR_MAX_VEC_COMPONENTS];
 
nir_op_algebraic_property algebraic_properties;
+
+   /* Whether this represents a numeric conversion opcode */
+   bool is_conversion;
 } nir_op_info;
 
 extern const nir_op_info nir_op_infos[nir_num_opcodes];
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index d32005846a6..dc4cd9ac63d 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -33,12 +33,13 @@ class Opcode(object):
NOTE: this must be kept in sync with nir_op_info
"""
def __init__(self, name, output_size, output_type, input_sizes,
-input_types, algebraic_properties, const_expr):
+input_types, is_conversion, algebraic_properties, const_expr):
   """Parameters:
 
   - name is the name of the opcode (prepend nir_op_ for the enum name)
   - all types are strings that get nir_type_ prepended to them
   - input_types is a list of types
+  - is_conversion is true if this opcode represents a type conversion
   - algebraic_properties is a space-seperated string, where nir_op_is_ is
 prepended before each entry
   - const_expr is an expression or series of statements that computes the
@@ -70,6 +71,7 @@ class Opcode(object):
   assert isinstance(input_sizes[0], int)
   assert isinstance(input_types, list)
   assert isinstance(input_types[0], str)
+  assert isinstance(is_conversion, bool)
   assert isinstance(algebraic_properties, str)
   assert isinstance(const_expr, str)
   assert len(input_sizes) == len(input_types)
@@ -84,6 +86,7 @@ class Opcode(object):
   self.output_type = output_type
   self.input_sizes = input_sizes
   self.input_types = input_types
+  self.is_conversion = is_conversion
   self.algebraic_properties = algebraic_properties
   self.const_expr = const_expr
 
@@ -138,21 +141,22 @@ associative = "associative "
 opcodes = {}
 
 def opcode(name, output_size, output_type, input_sizes, input_types,
-   algebraic_properties, const_expr):
+   is_conversion, algebraic_properties, const_expr):
assert name not in opcodes
opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
-  input_types, algebraic_properties, const_expr)
+  input_types, is_conversion, algebraic_properties,
+  const_expr)
 
 def unop_convert(name, out_type, in_type, const_expr):
-   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
+   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 
 def unop(name, ty, const_expr):
-   opcode(name, 0, ty, [0], [ty], "", const_expr)
+   opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 
 def unop_horiz(name, output_size, output_type, input_size, input_type,
const_expr):
-   opcode(name, output_size, output_type, [input_size], [input_type], "",
-  const_expr)
+   opcode(name, output_size, output_type, [input_size], [input_type],
+  False, "", const_expr)
 
 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 reduce_expr, final_expr):
@@ -173,6 +177,8 @@ def unop_reduce(name, output_size, output_type, input_type, 
prereduce_expr,
unop_horiz(name + "4", output_size, output_type, 4, input_type,
   final(reduce_(reduce_(src0, src1), reduce_(src2, src3
 
+def unop_numeric_convert(name, out_type, in_type, const_expr):
+   opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 
 # These two move instructions differ in what modifiers they support and what
 # the negate modifier means. Otherwise, they are identical.
@@ -215,13 +221,13 @@ for src_t in [tint, tuint, tfloat, tbool]:
   if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
   rnd_modes = ['_rtne', '_rtz', '']
   for rnd_mode in rnd_modes:
-  unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
-   bit_size, rnd_mode),
-   dst_t + str(bit_size), src_t, "src0")
+  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], 
dst_t[0],
+  bit_size, 
rnd_mode),
+   dst_t + str(bit_size), src_t, "src0")
   else:
   conv_expr = "src0 != 0" if dst_t == tbool else "src0"
-

[Mesa-dev] [PATCH v4 37/40] intel/compiler: validate region restrictions for mixed float mode

---
 src/intel/compiler/brw_eu_validate.c| 256 ++
 src/intel/compiler/test_eu_validate.cpp | 618 
 2 files changed, 874 insertions(+)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index ed9c8fe59dd..a61d4c46e81 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -170,6 +170,13 @@ src1_is_null(const struct gen_device_info *devinfo, const 
brw_inst *inst)
   brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }
 
+static bool
+src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src0_reg_file(devinfo, inst) == 
BRW_ARCHITECTURE_REGISTER_FILE &&
+  brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_ACCUMULATOR;
+}
+
 static bool
 src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
 {
@@ -847,6 +854,254 @@ general_restrictions_on_region_parameters(const struct 
gen_device_info *devinfo,
return error_msg;
 }
 
+static struct string
+special_restrictions_for_mixed_float_mode(const struct gen_device_info 
*devinfo,
+  const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   unsigned opcode = brw_inst_opcode(devinfo, inst);
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   if (num_sources >= 3)
+  return error_msg;
+
+   if (!is_mixed_float(devinfo, inst))
+  return error_msg;
+
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;
+
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
+   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
+
+   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
+   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, 
dst_stride);
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+* Float Operations:
+*
+*"Indirect addressing on source is not supported when source and
+* destination data types are mixed float."
+*
+* Indirect addressing is only supported on the first source, so we only
+* check that.
+*/
+   ERROR_IF(types_are_mixed_float(dst_type, src0_type) &&
+brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
+"Indirect addressing on source is not supported when source and "
+"destination data types are mixed float");
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+* Float Operations:
+*
+*"No SIMD16 in mixed mode when destination is f32. Instruction
+* execution size must be no more than 8."
+*/
+   ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F,
+"Mixed float mode with 32-bit float destination is limited "
+"to SIMD8");
+
+   if (is_align16) {
+  /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+   * Float Operations:
+   *
+   *   "In Align16 mode, when half float and float data types are mixed
+   *between source operands OR between source and destination operands,
+   *the register content are assumed to be packed."
+   *
+   * Since Align16 doesn't have a concept of horizontal stride (or width),
+   * it means that vertical stride must always be 4, since 0 and 2 would
+   * lead to replicated data, and any other value is disallowed in Align16.
+   * However, the PRM also says:
+   *
+   *   "In Align16, vertical stride can never be zero for f16"
+   *
+   * Which is oddly redundant and specific considering the more general
+   * assumption that all operands are assumed to be packed, so we
+   * understand that this might be hinting that there may be an exception
+   * for f32 operands with a vstride of 0, so we don't validate this for
+   * them while we don't have empirical evidence that it is forbidden.
+   */
+  ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+   (src0_type != BRW_REGISTER_TYPE_F ||
+brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+   "Align16 mixed float mode assumes packed data (vstride must "
+   "be 4 -or 0 for f32 operands-)");
+
+  ERROR_IF(num_sources >= 2 &&
+   brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+   (src1_type != BRW_REGISTER_TYPE_F ||
+brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+   "Align16 mixed float mode assumes packed data (vstride must "
+   "be 4 -or 0 for f32 operands-)");
+
+  /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+   * Float Operations:
+   *
+   *   "For Align16

[Mesa-dev] [PATCH 3/3] intel/fs: Drop the fs_surface_builder

All of the actual abstraction (except possibly setting size_written)
happens as part of the logical opcodes.  The only thing that the surface
builder is providing at this point is extra levels of functions to call
through.  I'm going to be adding bindless image support soon and all the
extra abstraction here is just getting in the way.
---
 src/intel/Makefile.sources|   2 -
 src/intel/compiler/brw_fs_nir.cpp | 361 ++
 src/intel/compiler/brw_fs_surface_builder.cpp | 212 --
 src/intel/compiler/brw_fs_surface_builder.h   |  89 -
 src/intel/compiler/meson.build|   2 -
 5 files changed, 195 insertions(+), 471 deletions(-)
 delete mode 100644 src/intel/compiler/brw_fs_surface_builder.cpp
 delete mode 100644 src/intel/compiler/brw_fs_surface_builder.h

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 94a28d370e8..a5c8828a6b6 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -69,8 +69,6 @@ COMPILER_FILES = \
compiler/brw_fs_register_coalesce.cpp \
compiler/brw_fs_saturate_propagation.cpp \
compiler/brw_fs_sel_peephole.cpp \
-   compiler/brw_fs_surface_builder.cpp \
-   compiler/brw_fs_surface_builder.h \
compiler/brw_fs_validate.cpp \
compiler/brw_fs_visitor.cpp \
compiler/brw_inst.h \
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index b7f71338f75..c33a2d40917 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -23,13 +23,11 @@
 
 #include "compiler/glsl/ir.h"
 #include "brw_fs.h"
-#include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 #include "util/u_math.h"
 #include "util/bitscan.h"
 
 using namespace brw;
-using namespace brw::surface_access;
 
 void
 fs_visitor::emit_nir_code()
@@ -3379,18 +3377,16 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
 
   cs_prog_data->uses_num_work_groups = true;
 
-  fs_reg surf_index = brw_imm_ud(surface);
+  fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+  srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
+  srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
+  srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
 
   /* Read the 3 GLuint components of gl_NumWorkGroups */
   for (unsigned i = 0; i < 3; i++) {
- fs_reg read_result =
-emit_untyped_read(bld, surf_index,
-  brw_imm_ud(i << 2),
-  1 /* dims */, 1 /* size */,
-  BRW_PREDICATE_NONE);
- read_result.type = dest.type;
- bld.MOV(dest, read_result);
- dest = offset(dest, bld, 1);
+ srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
+ bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+  offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
   }
   break;
}
@@ -3440,8 +3436,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   assert(stage == MESA_SHADER_COMPUTE);
 
   const unsigned bit_size = nir_dest_bit_size(instr->dest);
-  fs_reg offset_reg = retype(get_nir_src(instr->src[0]),
- BRW_REGISTER_TYPE_UD);
+  fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+  srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
+  srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
+  srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
 
   /* Make dest unsigned because that's what the temporary will be */
   dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
@@ -3449,19 +3447,19 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   /* Read the vector */
   if (nir_intrinsic_align(instr) >= 4) {
  assert(nir_dest_bit_size(instr->dest) == 32);
- fs_reg read_result = emit_untyped_read(bld, brw_imm_ud(GEN7_BTI_SLM),
-offset_reg, 1 /* dims */,
-instr->num_components,
-BRW_PREDICATE_NONE);
- for (unsigned i = 0; i < instr->num_components; i++)
-bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+ srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
+ fs_inst *inst =
+bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+ dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
+ inst->size_written = instr->num_components * dispatch_width * 4;
   } else {
  assert(nir_dest_bit_size(instr->dest) <= 32);
  assert(nir_dest_num_components(instr->dest) == 1);
- fs_reg read_result =
-emit_byte_scattered_read(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg,
- 1 /* dims */, 1, bit_size,
-

Re: [Mesa-dev] [PATCH 0/4] RadeonSI: Upload constants to VRAM via SDMA

2019-02-12 Thread Dieter Nützel

Sorry that I'm stepping in so late, but the whole family is recovering slowly
from a bad flu...

Tried your "latest" three series all together on my Polaris 20 (NIR!).
UH and UV reliably hang after a few seconds, with VM faults. I have to dig
deeper (remotely) to get some logs.

But the Polaris triangle corruptions I reported are solved now.
Will try to verify which patches fixed them.

Look here:
https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/1079319-running-the-radeonsi-nir-back-end-with-mesa-19-1-git?p=1079390#post1079390

Greetings,
Dieter

Am 07.02.2019 02:21, schrieb Marek Olšák:

Hi,

This patch series increases radeonsi performance in some cases.
glxgears performance decreases slightly.

Visible VRAM is usually congested due to CPU accesses, which cause
buffers to be evicted from that part of VRAM. This removes
the congestion for all data pushed into const_uploader.

We have had many problems with const_uploader slowing stuff down due
to visible VRAM congestion. The most recent one is this Starcraft 2
issue report on github:

https://github.com/iXit/Mesa-3D/issues/333

Since const_uploader reuses buffers from the winsys buffer cache,
the odds are that the reused buffers are already evicted, so the first
use is usually slower due to higher shader load latencies.

This series uses SDMA to get constants into VRAM, so it doesn't have
any of the above drawbacks.

SC2 numbers with various other methods (from the github issue report):
- originally: 50-55 fps
- changing const_uploader to STREAM: 75-80 fps
- use stream_uploader for constants in Nine: 90 fps
- this series: 105-110 fps

Trivial benchmarks such as glxgears can expect 20% decrease
in performance due to the added cost of the SDMA CS ioctl that wasn't
there before.

CPU-bound apps with many IBs are almost unaffected thanks to winsys
multithreading.

Feedback welcome,

Thanks,
Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] st/va:Add support for indirect manner by returning VA_STATUS_ERROR_OPERATION_FAILED

2019-02-12 Thread Guttula, Suresh

Based on the VA spec, DeriveImage() returns VA_STATUS_ERROR_OPERATION_FAILED if
the driver doesn't have support for internal surface formats. Currently
vaDeriveImage() fails for non-contiguous planes, and the operation-failed error
status is required to support the indirect path, i.e. vaCreateImage()+vaPutImage(),
in case vaDeriveImage() fails with VA_STATUS_ERROR_OPERATION_FAILED.

This patch notifies the client that the operation failed, with the proper
error string, so that the client will fall back to vaCreateImage()+vaPutImage().

v2: updated commit message based on VA spec.

Signed-off-by: suresh guttula 
---
 src/gallium/state_trackers/va/image.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/va/image.c 
b/src/gallium/state_trackers/va/image.c
index 807fc83..f7e0db0 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -212,9 +212,12 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, 
VAImage *image)
 
surf = handle_table_get(drv->htab, surface);
 
-   if (!surf || !surf->buffer || surf->buffer->interlaced)
+   if (!surf || !surf->buffer)
   return VA_STATUS_ERROR_INVALID_SURFACE;
 
+   if (surf->buffer->interlaced)
+ return VA_STATUS_ERROR_OPERATION_FAILED;
+
surfaces = surf->buffer->get_surfaces(surf->buffer);
if (!surfaces || !surfaces[0]->texture)
   return VA_STATUS_ERROR_ALLOCATION_FAILED;
@@ -261,7 +264,7 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, 
VAImage *image)
default:
   /* VaDeriveImage is designed for contiguous planes. */
   FREE(img);
-  return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+  return VA_STATUS_ERROR_OPERATION_FAILED;
}
 
img_buf = CALLOC(1, sizeof(vlVaBuffer));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: allow stitching of non-empty block

Hi Juan,

On Tue, Feb 12, 2019 at 04:37:23PM +0100, Juan A. Suarez Romero wrote:
> On Fri, 2019-02-08 at 15:39 -0600, Jason Ekstrand wrote:
> > I had a chat with Caio about this and I'm skeptical.  In general, users of 
> > the CF manipulation code shouldn't be stitching two blocks together where 
> > the first contains a jump and the second is non-empty.  If the caller knows 
> > that this case is ok, then they can check for it and empty out the one 
> > block before stitching.  Also, I'm not really seeing how peel_initial_if 
> > would hit this case from your example.
> > 
> > 
> The problem happens when moving the continous list to the end of continue 
> block in loop; the former ends in a jump ("break") and the later also ends in 
> a jump ("continue"), so stitch block complains because there will be an 
> instruction (the "continue") after the jump (the "break").

I was investigating this yesterday and attempted to write a MR, could
you take a look?

https://gitlab.freedesktop.org/mesa/mesa/merge_requests/238


Caio
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] anv/cmd_buffer: check for NULL framebuffer

On Tue, Feb 12, 2019 at 10:48 AM Juan A. Suarez Romero 
wrote:

> This can happen when we record a VkCmdDraw in a secondary buffer that
> was created inheriting from the primary buffer, but with the framebuffer
> set to NULL in the VkCommandBufferInheritanceInfo.
>
> Vulkan 1.1.81 spec says that "the application must ensure (using scissor
> if neccesary) that all rendering is contained in the render area [...]
> [which] must be contained within the framebuffer dimesions".
>
> While this should be done by the application, commit 465e5a86 added the
> clamp to the framebuffer size, in case of application does not do it.
> But this requires to know the framebuffer dimensions.
>
> If we do not have a framebuffer at that moment, the best compromise we
> can do is to just apply the scissor as it is, and let the application to
> ensure the rendering is contained in the render area.
>
> v2: do not clamp to framebuffer if there isn't a framebuffer
>
> v3 (Jason):
> - clamp earlier in the conditional
> - clamp to render area if command buffer is primary
>
> v4: clamp also x and y to render area (Jason)
>
> Fixes: 465e5a86 ("anv: Clamp scissors to the framebuffer boundary")
> CC: Jason Ekstrand 
> ---
>  src/intel/vulkan/gen7_cmd_buffer.c | 32 +-
>  1 file changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/src/intel/vulkan/gen7_cmd_buffer.c
> b/src/intel/vulkan/gen7_cmd_buffer.c
> index 352892aee33..2924c6031fd 100644
> --- a/src/intel/vulkan/gen7_cmd_buffer.c
> +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> @@ -70,12 +70,34 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer
> *cmd_buffer)
>};
>
>const int max = 0x;
> +
> +  uint32_t y = s->offset.y;
> +  uint32_t x = s->offset.x;
> +  uint32_t height = s->offset.y + s->extent.height - 1;
> +  uint32_t width = s->offset.x + s->extent.width - 1;
>

These should be x_max and y_max not width and height.  With that changed,

Reviewed-by: Jason Ekstrand 

Sorry we're going to v5...

--Jason


> +
> +  /* Do this math using int64_t so overflow gets clamped correctly. */
> +  if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
> + y = clamp_int64((uint64_t) y,
> cmd_buffer->state.render_area.offset.y, max);
> + x = clamp_int64((uint64_t) x,
> cmd_buffer->state.render_area.offset.x, max);
> + height = clamp_int64((uint64_t) height, 0,
> +  cmd_buffer->state.render_area.offset.y +
> +  cmd_buffer->state.render_area.extent.height
> - 1);
> + width = clamp_int64((uint64_t) width, 0,
> + cmd_buffer->state.render_area.offset.x +
> + cmd_buffer->state.render_area.extent.width -
> 1);
> +  } else if (fb) {
> + y = clamp_int64((uint64_t) y, 0, max);
> + x = clamp_int64((uint64_t) x, 0, max);
> + height = clamp_int64((uint64_t) height, 0, fb->height - 1);
> + width = clamp_int64((uint64_t) width, 0, fb->width - 1);
> +  }
> +
>struct GEN7_SCISSOR_RECT scissor = {
> - /* Do this math using int64_t so overflow gets clamped
> correctly. */
> - .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
> - .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
> - .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y +
> s->extent.height - 1, 0, fb->height - 1),
> - .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x +
> s->extent.width - 1, 0, fb->width - 1)
> + .ScissorRectangleYMin = y,
> + .ScissorRectangleXMin = x,
> + .ScissorRectangleYMax = height,
> + .ScissorRectangleXMax = width
>};
>
>if (s->extent.width <= 0 || s->extent.height <= 0) {
> --
> 2.20.1
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

How about splitting this series into four different parts? One for every
extension? Is this doable without too much trouble?


On 2/12/19 6:02 PM, Rhys Perry wrote:

It currently requires review (and possibly rebasing). Marek Olšák sent
some feedback on a few of the patches, but other than that it hasn't
gotten much attention.

Also patch 35 seems to vectorize 32-bit code which can help or hurt
shaders quite a bit and seems to hurt shaders overall. I'm not yet
sure how to solve this without removing it or changing the result of
LLVM's SLP vectorizer significantly.
IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.

I think I'll look into the issues with patch 35 again.

On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  wrote:

What's the status of this?

On 12/7/18 6:21 PM, Rhys Perry wrote:

This series adds support for:
- VK_KHR_shader_float16_int8
- VK_AMD_gpu_shader_half_float
- VK_AMD_gpu_shader_int16
- VK_KHR_8bit_storage
on VI+. Half floats are currently disabled on LLVM 7 because of a bug
causing large memory usage and long (or unbounded) compilation times with
some tests.

It depends on the following patch series:
- https://patchwork.freedesktop.org/series/53454/
- https://patchwork.freedesktop.org/series/53602/
- https://patchwork.freedesktop.org/series/53660/

An older version was tested on my Polaris card, but due to hardware issues
I currently can't test the latest version of the series.

deqp-vk has no regressions and none of the newly enabled tests fail.

Rhys Perry (38):
ac: add various helpers for float16/int16/int8
ac/nir: implement 8-bit push constant, ssbo and ubo loads
ac/nir: implement 8-bit ssbo stores
ac/nir: fix 16-bit ssbo stores
ac/nir: implement 8-bit nir_load_const_instr
ac/nir: implement 8-bit conversions
ac/nir: fix 64-bit nir_op_f2f16_rtz
ac/nir: make ac_build_clamp work on all bit sizes
ac/nir: make ac_build_fract work on all bit sizes
ac/nir: make ac_build_isign work on all bit sizes
ac/nir: make ac_build_fsign work on all bit sizes
ac/nir: make ac_build_fdiv support 16-bit floats
ac/nir: implement half-float nir_op_frcp
ac/nir: implement half-float nir_op_frsq
ac/nir: implement half-float nir_op_ldexp
radv: lower 16-bit flrp
ac/nir: support half floats in emit_b2f
ac/nir: make emit_b2i work on all bit sizes
ac/nir: implement 16-bit shifts
compiler/nir: add lowering option for 16-bit ffma
ac/nir: implement 16-bit ac_build_ddxy
ac/nir: implement 8 and 16 bit ac_build_readlane
nir: make bitfield_reverse and ifind_msb work with all integers
ac/nir: make ac_find_lsb work on all bit sizes
ac/nir: make ac_build_umsb work on all bit sizes
ac/nir: implement 8 and 16 bit ac_build_imsb
ac/nir: make ac_build_bit_count work on all bit sizes
ac/nir: make ac_build_bitfield_reverse work on all bit sizes
ac/nir: implement 16-bit pack/unpack opcodes
ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
ac/nir,radv: create an array of varying output types
ac/nir: store all outputs as f32
radv: store all fragment shader inputs as f32
radv: handle all fragment output types
ac,radv: run LLVM's SLP vectorizer
ac/nir: generate better code for nir_op_f2f16_rtz
ac/nir: have nir_op_f2f16 round to zero
radv: expose float16, int16 and int8 features and extensions

   src/amd/common/ac_llvm_build.c| 355 ++
   src/amd/common/ac_llvm_build.h|  22 +-
   src/amd/common/ac_llvm_util.c |   9 +-
   src/amd/common/ac_llvm_util.h |   1 +
   src/amd/common/ac_nir_to_llvm.c   | 258 +++
   src/amd/common/ac_shader_abi.h|   1 +
   src/amd/vulkan/radv_device.c  |  17 ++
   src/amd/vulkan/radv_extensions.py |   4 +
   src/amd/vulkan/radv_nir_to_llvm.c |  92 ---
   src/amd/vulkan/radv_shader.c  |   7 +
   src/broadcom/compiler/nir_to_vir.c|   1 +
   src/compiler/nir/nir.h|   1 +
   src/compiler/nir/nir_opcodes.py   |   4 +-
   src/compiler/nir/nir_opt_algebraic.py |   4 +-
   src/gallium/drivers/radeonsi/si_get.c |   1 +
   src/gallium/drivers/vc4/vc4_program.c |   1 +
   16 files changed, 516 insertions(+), 262 deletions(-)


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-12 Thread Rhys Perry

It currently requires review (and possibly rebasing). Marek Olšák sent
some feedback on a few of the patches, but other than that it hasn't
gotten much attention.

Also patch 35 seems to vectorize 32-bit code which can help or hurt
shaders quite a bit and seems to hurt shaders overall. I'm not yet
sure how to solve this without removing it or changing the result of
LLVM's SLP vectorizer significantly.
IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.

I think I'll look into the issues with patch 35 again.

On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  wrote:
>
> What's the status of this?
>
> On 12/7/18 6:21 PM, Rhys Perry wrote:
> > This series add support for:
> > - VK_KHR_shader_float16_int8
> > - VK_AMD_gpu_shader_half_float
> > - VK_AMD_gpu_shader_int16
> > - VK_KHR_8bit_storage
> > on VI+. Half floats are currently disabled on LLVM 7 because of a bug
> > causing large memory usage and long (or unbounded) compilation times with
> > some tests.
> >
> > It depends on the follow patch series:
> > - https://patchwork.freedesktop.org/series/53454/
> > - https://patchwork.freedesktop.org/series/53602/
> > - https://patchwork.freedesktop.org/series/53660/
> >
> > An older version was tested on my Polaris card, but due to hardware issues
> > I currently can't test the latest version of the series.
> >
> > deqp-vk has no regressions and none of the newly enabled tests fail.
> >
> > Rhys Perry (38):
> >ac: add various helpers for float16/int16/int8
> >ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >ac/nir: implement 8-bit ssbo stores
> >ac/nir: fix 16-bit ssbo stores
> >ac/nir: implement 8-bit nir_load_const_instr
> >ac/nir: implement 8-bit conversions
> >ac/nir: fix 64-bit nir_op_f2f16_rtz
> >ac/nir: make ac_build_clamp work on all bit sizes
> >ac/nir: make ac_build_fract work on all bit sizes
> >ac/nir: make ac_build_isign work on all bit sizes
> >ac/nir: make ac_build_fsign work on all bit sizes
> >ac/nir: make ac_build_fdiv support 16-bit floats
> >ac/nir: implement half-float nir_op_frcp
> >ac/nir: implement half-float nir_op_frsq
> >ac/nir: implement half-float nir_op_ldexp
> >radv: lower 16-bit flrp
> >ac/nir: support half floats in emit_b2f
> >ac/nir: make emit_b2i work on all bit sizes
> >ac/nir: implement 16-bit shifts
> >compiler/nir: add lowering option for 16-bit ffma
> >ac/nir: implement 16-bit ac_build_ddxy
> >ac/nir: implement 8 and 16 bit ac_build_readlane
> >nir: make bitfield_reverse and ifind_msb work with all integers
> >ac/nir: make ac_find_lsb work on all bit sizes
> >ac/nir: make ac_build_umsb work on all bit sizes
> >ac/nir: implement 8 and 16 bit ac_build_imsb
> >ac/nir: make ac_build_bit_count work on all bit sizes
> >ac/nir: make ac_build_bitfield_reverse work on all bit sizes
> >ac/nir: implement 16-bit pack/unpack opcodes
> >ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
> >ac/nir,radv: create an array of varying output types
> >ac/nir: store all outputs as f32
> >radv: store all fragment shader inputs as f32
> >radv: handle all fragment output types
> >ac,radv: run LLVM's SLP vectorizer
> >ac/nir: generate better code for nir_op_f2f16_rtz
> >ac/nir: have nir_op_f2f16 round to zero
> >radv: expose float16, int16 and int8 features and extensions
> >
> >   src/amd/common/ac_llvm_build.c| 355 ++
> >   src/amd/common/ac_llvm_build.h|  22 +-
> >   src/amd/common/ac_llvm_util.c |   9 +-
> >   src/amd/common/ac_llvm_util.h |   1 +
> >   src/amd/common/ac_nir_to_llvm.c   | 258 +++
> >   src/amd/common/ac_shader_abi.h|   1 +
> >   src/amd/vulkan/radv_device.c  |  17 ++
> >   src/amd/vulkan/radv_extensions.py |   4 +
> >   src/amd/vulkan/radv_nir_to_llvm.c |  92 ---
> >   src/amd/vulkan/radv_shader.c  |   7 +
> >   src/broadcom/compiler/nir_to_vir.c|   1 +
> >   src/compiler/nir/nir.h|   1 +
> >   src/compiler/nir/nir_opcodes.py   |   4 +-
> >   src/compiler/nir/nir_opt_algebraic.py |   4 +-
> >   src/gallium/drivers/radeonsi/si_get.c |   1 +
> >   src/gallium/drivers/vc4/vc4_program.c |   1 +
> >   16 files changed, 516 insertions(+), 262 deletions(-)
> >
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [MR] nir: move pixel_center_integer/origin_upper_left to shader_info.fs

2019-02-12 Thread apinheiro


https://gitlab.freedesktop.org/mesa/mesa/merge_requests/237

New version of the thread that I sent recently, showing two initial 
versions to solve the regression I found on MR #144:


https://lists.freedesktop.org/archives/mesa-dev/2019-February/214808.html

This MR includes a v2 of the second option. It includes Jason's feedback
plus some extra cleanups that I found while re-checking the GLSL
linker (like removing PixelCenterInteger/OriginUpperLeft from
gl_program, as that info is tracked at gl_program.info.fs.xxx). Because
of that, it touches a lot of places in Mesa.


This MR also includes a second patch that removes
pixel_center_integer/origin_upper_left from the ir variable, basically
because that is already tracked in a lot of places, so it is not really
needed. It is in a separate patch because I initially thought of it as
something optional, as it is not really required to fix the regression.
But after finishing it, the only reason to keep it as a separate patch
is to make the review easier. I think it would be better to squash
both patches, but I don't have a strong opinion, so I will let the
reviewer decide.



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109575] Mesa-19.0.0-rc1 : Computer Crashes trying to run anything Vulkan

https://bugs.freedesktop.org/show_bug.cgi?id=109575

Samuel Pitoiset  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |FIXED

--- Comment #14 from Samuel Pitoiset  ---
Fixed.

https://cgit.freedesktop.org/mesa/mesa/commit/?id=1b8983c25be19073c02fe9630e949be55f8280fa

Thanks for confirming. I guess it should also work on Ubuntu 16.04.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109107] gallium/st/va: change va max_profiles when using Radeon VCN Hardware

https://bugs.freedesktop.org/show_bug.cgi?id=109107

--- Comment #5 from leoxs...@gmail.com ---
The fix is in the Mesa master branch, which also includes a fix for playing VP9
with Chromium. Please make sure to run with "allow_rgb10_configs=false" for now.

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109575] Mesa-19.0.0-rc1 : Computer Crashes trying to run anything Vulkan

https://bugs.freedesktop.org/show_bug.cgi?id=109575

--- Comment #13 from LunarG  ---
Yes! this patch worked for me on my Ubuntu 18.04 - AMD R9 380 system! I will
check with my other AMD systems today.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage


What's the status of this?

On 12/7/18 6:21 PM, Rhys Perry wrote:

This series adds support for:
- VK_KHR_shader_float16_int8
- VK_AMD_gpu_shader_half_float
- VK_AMD_gpu_shader_int16
- VK_KHR_8bit_storage
on VI+. Half floats are currently disabled on LLVM 7 because of a bug
causing large memory usage and long (or unbounded) compilation times with
some tests.

It depends on the following patch series:
- https://patchwork.freedesktop.org/series/53454/
- https://patchwork.freedesktop.org/series/53602/
- https://patchwork.freedesktop.org/series/53660/

An older version was tested on my Polaris card, but due to hardware issues
I currently can't test the latest version of the series.

deqp-vk has no regressions and none of the newly enabled tests fail.

Rhys Perry (38):
   ac: add various helpers for float16/int16/int8
   ac/nir: implement 8-bit push constant, ssbo and ubo loads
   ac/nir: implement 8-bit ssbo stores
   ac/nir: fix 16-bit ssbo stores
   ac/nir: implement 8-bit nir_load_const_instr
   ac/nir: implement 8-bit conversions
   ac/nir: fix 64-bit nir_op_f2f16_rtz
   ac/nir: make ac_build_clamp work on all bit sizes
   ac/nir: make ac_build_fract work on all bit sizes
   ac/nir: make ac_build_isign work on all bit sizes
   ac/nir: make ac_build_fsign work on all bit sizes
   ac/nir: make ac_build_fdiv support 16-bit floats
   ac/nir: implement half-float nir_op_frcp
   ac/nir: implement half-float nir_op_frsq
   ac/nir: implement half-float nir_op_ldexp
   radv: lower 16-bit flrp
   ac/nir: support half floats in emit_b2f
   ac/nir: make emit_b2i work on all bit sizes
   ac/nir: implement 16-bit shifts
   compiler/nir: add lowering option for 16-bit ffma
   ac/nir: implement 16-bit ac_build_ddxy
   ac/nir: implement 8 and 16 bit ac_build_readlane
   nir: make bitfield_reverse and ifind_msb work with all integers
   ac/nir: make ac_find_lsb work on all bit sizes
   ac/nir: make ac_build_umsb work on all bit sizes
   ac/nir: implement 8 and 16 bit ac_build_imsb
   ac/nir: make ac_build_bit_count work on all bit sizes
   ac/nir: make ac_build_bitfield_reverse work on all bit sizes
   ac/nir: implement 16-bit pack/unpack opcodes
   ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
   ac/nir,radv: create an array of varying output types
   ac/nir: store all outputs as f32
   radv: store all fragment shader inputs as f32
   radv: handle all fragment output types
   ac,radv: run LLVM's SLP vectorizer
   ac/nir: generate better code for nir_op_f2f16_rtz
   ac/nir: have nir_op_f2f16 round to zero
   radv: expose float16, int16 and int8 features and extensions

  src/amd/common/ac_llvm_build.c| 355 ++
  src/amd/common/ac_llvm_build.h|  22 +-
  src/amd/common/ac_llvm_util.c |   9 +-
  src/amd/common/ac_llvm_util.h |   1 +
  src/amd/common/ac_nir_to_llvm.c   | 258 +++
  src/amd/common/ac_shader_abi.h|   1 +
  src/amd/vulkan/radv_device.c  |  17 ++
  src/amd/vulkan/radv_extensions.py |   4 +
  src/amd/vulkan/radv_nir_to_llvm.c |  92 ---
  src/amd/vulkan/radv_shader.c  |   7 +
  src/broadcom/compiler/nir_to_vir.c|   1 +
  src/compiler/nir/nir.h|   1 +
  src/compiler/nir/nir_opcodes.py   |   4 +-
  src/compiler/nir/nir_opt_algebraic.py |   4 +-
  src/gallium/drivers/radeonsi/si_get.c |   1 +
  src/gallium/drivers/vc4/vc4_program.c |   1 +
  16 files changed, 516 insertions(+), 262 deletions(-)


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v5 5/5] gallium/auxiliary/vl: Add video compositor compute shader render

2019-02-12 Thread Liu, Leo


On 2/12/19 10:35 AM, Zhu, James wrote:
> Add compute shader initilization, assign and cleanup in vl_compositor API.
> Set video compositor compute shader render as default when pipe support it.
>
> Signed-off-by: James Zhu 
> Reviewed-by: Christian König 
> ---
>   src/gallium/auxiliary/vl/vl_compositor.c | 108 
> +++
>   src/gallium/auxiliary/vl/vl_compositor.h |   3 +
>   2 files changed, 83 insertions(+), 28 deletions(-)
>
> diff --git a/src/gallium/auxiliary/vl/vl_compositor.c 
> b/src/gallium/auxiliary/vl/vl_compositor.c
> index 4509913..cd404b9 100644
> --- a/src/gallium/auxiliary/vl/vl_compositor.c
> +++ b/src/gallium/auxiliary/vl/vl_compositor.c
> @@ -28,6 +28,9 @@
>   #include "util/u_sampler.h"
>   
>   #include "vl_compositor_gfx.h"
> +#include "vl_compositor_cs.h"
> +
> +static int pip_compute_supported = 0;

Please rename it to "pipe_xxx" and put it in the vl_compositor structure.

Leo


>   
>   static bool
>   init_shaders(struct vl_compositor *c)
> @@ -40,18 +43,6 @@ init_shaders(struct vl_compositor *c)
> return false;
>  }
>   
> -   c->fs_video_buffer = create_frag_shader_video_buffer(c);
> -   if (!c->fs_video_buffer) {
> -  debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
> -  return false;
> -   }
> -
> -   c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
> -   if (!c->fs_weave_rgb) {
> -  debug_printf("Unable to create YCbCr-to-RGB weave fragment shader.\n");
> -  return false;
> -   }
> -
>  c->fs_yuv.weave.y = create_frag_shader_deint_yuv(c, true, true);
>  c->fs_yuv.weave.uv = create_frag_shader_deint_yuv(c, false, true);
>  c->fs_yuv.bob.y = create_frag_shader_deint_yuv(c, true, false);
> @@ -74,12 +65,6 @@ init_shaders(struct vl_compositor *c)
> return false;
>  }
>   
> -   c->fs_rgba = create_frag_shader_rgba(c);
> -   if (!c->fs_rgba) {
> -  debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
> -  return false;
> -   }
> -
>  c->fs_rgb_yuv.y = create_frag_shader_rgb_yuv(c, true);
>  c->fs_rgb_yuv.uv = create_frag_shader_rgb_yuv(c, false);
>  if (!c->fs_rgb_yuv.y || !c->fs_rgb_yuv.uv) {
> @@ -87,6 +72,44 @@ init_shaders(struct vl_compositor *c)
> return false;
>  }
>   
> +   if (pip_compute_supported) {
> +  c->cs_video_buffer = vl_compositor_cs_create_shader(c, 
> compute_shader_video_buffer);
> +  if (!c->cs_video_buffer) {
> + debug_printf("Unable to create video_buffer compute shader.\n");
> + return false;
> +  }
> +
> +  c->cs_weave_rgb = vl_compositor_cs_create_shader(c, 
> compute_shader_weave);
> +  if (!c->cs_weave_rgb) {
> + debug_printf("Unable to create weave_rgb compute shader.\n");
> + return false;
> +  }
> +
> +  c->cs_rgba = vl_compositor_cs_create_shader(c, compute_shader_rgba);
> +  if (!c->cs_rgba) {
> + debug_printf("Unable to create RGB-to-RGB compute shader.\n");
> + return false;
> +  }
> +   } else {
> +  c->fs_video_buffer = create_frag_shader_video_buffer(c);
> +  if (!c->fs_video_buffer) {
> + debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
> + return false;
> +  }
> +
> +  c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
> +  if (!c->fs_weave_rgb) {
> + debug_printf("Unable to create YCbCr-to-RGB weave fragment 
> shader.\n");
> + return false;
> +  }
> +
> +  c->fs_rgba = create_frag_shader_rgba(c);
> +  if (!c->fs_rgba) {
> + debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
> + return false;
> +  }
> +   }
> +
>  return true;
>   }
>   
> @@ -95,17 +118,24 @@ static void cleanup_shaders(struct vl_compositor *c)
>  assert(c);
>   
>  c->pipe->delete_vs_state(c->pipe, c->vs);
> -   c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
> -   c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
>  c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.y);
>  c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.uv);
>  c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.y);
>  c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.uv);
>  c->pipe->delete_fs_state(c->pipe, c->fs_palette.yuv);
>  c->pipe->delete_fs_state(c->pipe, c->fs_palette.rgb);
> -   c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
>  c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.y);
>  c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.uv);
> +
> +   if (pip_compute_supported) {
> +  c->pipe->delete_compute_state(c->pipe, c->cs_video_buffer);
> +  c->pipe->delete_compute_state(c->pipe, c->cs_weave_rgb);
> +  c->pipe->delete_compute_state(c->pipe, c->cs_rgba);
> +   } else {
> +  c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
> +  c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
> +  c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
> +   }
>   }
>   
>   static bool
> @@ -409,6 +439,7 @@

Re: [Mesa-dev] [PATCH] nir: allow stitching of non-empty block

On Fri, 2019-02-08 at 15:39 -0600, Jason Ekstrand wrote:
> I had a chat with Caio about this and I'm skeptical.  In general, users of 
> the CF manipulation code shouldn't be stitching two blocks together where the 
> first contains a jump and the second is non-empty.  If the caller knows that 
> this case is ok, then they can check for it and empty out the one block 
> before stitching.  Also, I'm not really seeing how peel_initial_if would hit 
> this case from your example.
> 
> 
The problem happens when moving the continue list to the end of the continue block
in the loop; the former ends in a jump ("break") and the latter also ends in a jump
("continue"), so stitch_blocks() complains because there would be an instruction
(the "continue") after the jump (the "break").
As you mentioned, maybe the caller can detect this situation and just get rid
of the jump instruction in the continue block before the stitching. After all,
after the merge it will never be executed.
I'm sending a new patch for this.

J.A.
> --Jason
> 
> 
> On Fri, Jan 25, 2019 at 11:37 AM Juan A. Suarez Romero  
> wrote:
> > When stitching two blocks A and B, where A's last instruction is a jump,
> > 
> > it is not required that B is empty; it can be plainly removed.
> > 
> > 
> > 
> > This can happen in a situation like this:
> > 
> > 
> > 
> > vec1 1 ssa_1 = load_const (true)
> > 
> > vec1 1 ssa_2 = load_const (false)
> > 
> > block block_1:
> > 
> > [...]
> > 
> > loop {
> > 
> >   vec1 ssa_3 = phi block_1: ssa_2, block_4: ssa_1
> > 
> >   if ssa_3 {
> > 
> > block block_2:
> > 
> > [...]
> > 
> > break
> > 
> >   } else {
> > 
> > block block_3:
> > 
> >   }
> > 
> >   vec1 ssa_4 = 
> > 
> >   if ssa_4 {
> > 
> > block block_4:
> > 
> > continue
> > 
> >   } else {
> > 
> > block block_5:
> > 
> >   }
> > 
> >   block block_6:
> > 
> >   [...]
> > 
> > }
> > 
> > 
> > 
> > And opt_peel_loop_initial_if is applied. In this case, we would be
> > 
> > ending up stitching block_2 (which finalizes with a jump) with
> > 
> > block_4, which is not empty.
> > 
> > 
> > 
> > CC: Jason Ekstrand 
> > 
> > ---
> > 
> >  src/compiler/nir/nir_control_flow.c | 1 -
> > 
> >  1 file changed, 1 deletion(-)
> > 
> > 
> > 
> > diff --git a/src/compiler/nir/nir_control_flow.c 
> > b/src/compiler/nir/nir_control_flow.c
> > 
> > index ddba2e55b45..27508f230d6 100644
> > 
> > --- a/src/compiler/nir/nir_control_flow.c
> > 
> > +++ b/src/compiler/nir/nir_control_flow.c
> > 
> > @@ -550,7 +550,6 @@ stitch_blocks(nir_block *before, nir_block *after)
> > 
> >  */
> > 
> > 
> > 
> > if (nir_block_ends_in_jump(before)) {
> > 
> > -  assert(exec_list_is_empty(>instr_list));
> > 
> >if (after->successors[0])
> > 
> >   remove_phi_src(after->successors[0], after);
> > 
> >if (after->successors[1])
> > 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v5 5/5] gallium/auxiliary/vl: Add video compositor compute shader render

2019-02-12 Thread Zhu, James

Add compute shader initialization, assignment and cleanup to the vl_compositor API.
Set video compositor compute shader render as the default when the pipe supports it.

Signed-off-by: James Zhu 
Reviewed-by: Christian König 
---
 src/gallium/auxiliary/vl/vl_compositor.c | 108 +++
 src/gallium/auxiliary/vl/vl_compositor.h |   3 +
 2 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_compositor.c 
b/src/gallium/auxiliary/vl/vl_compositor.c
index 4509913..cd404b9 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -28,6 +28,9 @@
 #include "util/u_sampler.h"
 
 #include "vl_compositor_gfx.h"
+#include "vl_compositor_cs.h"
+
+static int pip_compute_supported = 0;
 
 static bool
 init_shaders(struct vl_compositor *c)
@@ -40,18 +43,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
-   c->fs_video_buffer = create_frag_shader_video_buffer(c);
-   if (!c->fs_video_buffer) {
-  debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
-  return false;
-   }
-
-   c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
-   if (!c->fs_weave_rgb) {
-  debug_printf("Unable to create YCbCr-to-RGB weave fragment shader.\n");
-  return false;
-   }
-
c->fs_yuv.weave.y = create_frag_shader_deint_yuv(c, true, true);
c->fs_yuv.weave.uv = create_frag_shader_deint_yuv(c, false, true);
c->fs_yuv.bob.y = create_frag_shader_deint_yuv(c, true, false);
@@ -74,12 +65,6 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
-   c->fs_rgba = create_frag_shader_rgba(c);
-   if (!c->fs_rgba) {
-  debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
-  return false;
-   }
-
c->fs_rgb_yuv.y = create_frag_shader_rgb_yuv(c, true);
c->fs_rgb_yuv.uv = create_frag_shader_rgb_yuv(c, false);
if (!c->fs_rgb_yuv.y || !c->fs_rgb_yuv.uv) {
@@ -87,6 +72,44 @@ init_shaders(struct vl_compositor *c)
   return false;
}
 
+   if (pip_compute_supported) {
+  c->cs_video_buffer = vl_compositor_cs_create_shader(c, 
compute_shader_video_buffer);
+  if (!c->cs_video_buffer) {
+ debug_printf("Unable to create video_buffer compute shader.\n");
+ return false;
+  }
+
+  c->cs_weave_rgb = vl_compositor_cs_create_shader(c, 
compute_shader_weave);
+  if (!c->cs_weave_rgb) {
+ debug_printf("Unable to create weave_rgb compute shader.\n");
+ return false;
+  }
+
+  c->cs_rgba = vl_compositor_cs_create_shader(c, compute_shader_rgba);
+  if (!c->cs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB compute shader.\n");
+ return false;
+  }
+   } else {
+  c->fs_video_buffer = create_frag_shader_video_buffer(c);
+  if (!c->fs_video_buffer) {
+ debug_printf("Unable to create YCbCr-to-RGB fragment shader.\n");
+ return false;
+  }
+
+  c->fs_weave_rgb = create_frag_shader_weave_rgb(c);
+  if (!c->fs_weave_rgb) {
+ debug_printf("Unable to create YCbCr-to-RGB weave fragment 
shader.\n");
+ return false;
+  }
+
+  c->fs_rgba = create_frag_shader_rgba(c);
+  if (!c->fs_rgba) {
+ debug_printf("Unable to create RGB-to-RGB fragment shader.\n");
+ return false;
+  }
+   }
+
return true;
 }
 
@@ -95,17 +118,24 @@ static void cleanup_shaders(struct vl_compositor *c)
assert(c);
 
c->pipe->delete_vs_state(c->pipe, c->vs);
-   c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
-   c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.weave.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.y);
c->pipe->delete_fs_state(c->pipe, c->fs_yuv.bob.uv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.yuv);
c->pipe->delete_fs_state(c->pipe, c->fs_palette.rgb);
-   c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.y);
c->pipe->delete_fs_state(c->pipe, c->fs_rgb_yuv.uv);
+
+   if (pip_compute_supported) {
+  c->pipe->delete_compute_state(c->pipe, c->cs_video_buffer);
+  c->pipe->delete_compute_state(c->pipe, c->cs_weave_rgb);
+  c->pipe->delete_compute_state(c->pipe, c->cs_rgba);
+   } else {
+  c->pipe->delete_fs_state(c->pipe, c->fs_video_buffer);
+  c->pipe->delete_fs_state(c->pipe, c->fs_weave_rgb);
+  c->pipe->delete_fs_state(c->pipe, c->fs_rgba);
+   }
 }
 
 static bool
@@ -409,6 +439,7 @@ vl_compositor_clear_layers(struct vl_compositor_state *s)
   s->layers[i].clearing = i ? false : true;
   s->layers[i].blend = NULL;
   s->layers[i].fs = NULL;
+  s->layers[i].cs = NULL;
   s->layers[i].viewport.scale[2] = 1;
   s->layers[i].viewport.translate[2] = 0;
   s->layers[i].rotate = VL_COMPOSITOR_ROTATE_0;
@@ -532,26 +563,39 @@ vl_compositor_set_buffer_layer(struct vl_compositor_state

Re: [Mesa-dev] [PATCH v4 39/40] anv/pipeline: support Float16 and Int8 SPIR-V capabilities in gen8+


On February 12, 2019 05:57:09 Iago Toral Quiroga  wrote:


v2:
 - Merge Float16 and Int8 capabilities into a single patch (Jason)
 - Merged patch that enabled SPIR-V front-end checks for these caps
   (except for Int8, which was already merged)

Reviewed-by: Jason Ekstrand  (v1)
---
src/compiler/shader_info.h| 1 +
src/compiler/spirv/spirv_to_nir.c | 4 +++-
src/intel/vulkan/anv_pipeline.c   | 2 ++
3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 3d871938751..4726c185243 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -38,6 +38,7 @@ struct spirv_supported_capabilities {
   bool descriptor_array_dynamic_indexing;
   bool device_group;
   bool draw_parameters;
+   bool float16;
   bool float64;
   bool geometry_streams;
   bool gcn_shader;
diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c

index 7e07de2bfc0..309ed6c59b0 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3556,7 +3556,6 @@ vtn_handle_preamble_instruction(struct vtn_builder 
*b, SpvOp opcode,

  case SpvCapabilityLinkage:
  case SpvCapabilityVector16:
  case SpvCapabilityFloat16Buffer:
-  case SpvCapabilityFloat16:
  case SpvCapabilitySparseResidency:
 vtn_warn("Unsupported SPIR-V capability: %s",
  spirv_capability_to_string(cap));
@@ -3573,6 +3572,9 @@ vtn_handle_preamble_instruction(struct vtn_builder 
*b, SpvOp opcode,

  case SpvCapabilityFloat64:
 spv_check_supported(float64, cap);
 break;
+  case SpvCapabilityFloat16:
+ spv_check_supported(float16, cap);
+ break;
  case SpvCapabilityInt64:
 spv_check_supported(int64, cap);
 break;
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index e2024212bd9..0e8c4245df6 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -139,8 +139,10 @@ anv_shader_compile_to_nir(struct anv_device *device,
 .device_group = true,
 .draw_parameters = true,
 .float64 = pdevice->info.gen >= 8,
+ .float16 = pdevice->info.gen >= 8,


Please keep things sorted.


 .geometry_streams = true,
 .image_write_without_format = true,
+ .int8 = pdevice->info.gen >= 8,
 .int16 = pdevice->info.gen >= 8,
 .int64 = pdevice->info.gen >= 8,
 .min_lod = true,
--
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev




___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [ANNOUNCE] Mesa 18.3.3 release candidate

2019-02-12 Thread Carsten Haitzler

On Mon, 11 Feb 2019 11:41:08 -0800 Eric Anholt  said:

> Carsten Haitzler  writes:
> 
> > On Mon, 04 Feb 2019 16:31:57 -0800 Eric Anholt  said:
> >
> >> Carsten Haitzler  writes:
> >> 
> >> > On Fri, 1 Feb 2019 11:08:07 + Emil Velikov 
> >> > said:
> >> >
> >> >> Hi Carsten,
> >> >> 
> >> >> On 2019/01/31, Carsten Haitzler wrote:
> >> >> > On Wed, 30 Jan 2019 18:33:35 + Emil Velikov
> >> >> >  said:
> >> >> > 
> >> >> > You might want to hold off on this. My bugfix was actually patched
> >> >> > out by partly removing some of it. The void ptr math should never
> >> >> > have been there and wasn't in the final patch.
> >> >> > 
> >> >> > I'm talking about:
> >> >> > 
> >> >> > +void *cpu2 = cpu + 8;
> >> >> > 
> >> >> > In 300d3ae8b1445b5060f92c77c0f577f4b7b2c7d6
> >> >> > 
> >> >> > At least with gcc8 mesa is a dud on Raspberry Pi (can't
> >> >> > upload/download textures without crashing) without the fixes. I moved
> >> >> > the secondary ptr math into the ASM chunk because the C compiler
> >> >> > seemed to just mess up cpu2 ptr content/value for me on gcc8 (it also
> >> >> > kept the parameter inputs/outputs cleaner and consistent with other
> >> >> > ASM chunks). Keeping this as void ptr math alone is just wrong and
> >> >> > asking for trouble and as it unfixed a fix I already had in submitted
> >> >> > patches.
> >> >> > 
> >> >> > Being at FOSDEM I now no longer have access to my OS image with all of
> >> >> > this set up to test and won't until next week. I can't dig in and
> >> >> > verify. Without my fixes at all it's a dead man walking with gcc8,
> >> >> > and thus Arch Linux is broken entirely on Rpi without it (and has
> >> >> > been for a while now).
> >> 
> >> FWIW, my testing was done on gcc 8.30 on raspberry pi.
> >
> > I finally have time and am back with my Pi box. This was gcc 8.2.0 that I
> > was using.
> >
> >> I skipped the part of moving the C expression into the asm because it
> >> didn't make sense, and appeared in the series before the part that
> >> actually fixed the asm clobbers bug, so it (like the .fpu neon part)
> >> looked like random hacks.
> >
> > I did explicitly break just and only that change out. 0004 in the series was
> > just that. The log explained compiler bugs prevent calculating the address
> > in C (it ends up junk) so moved it to the asm block. That required changing
> > the cpu2 refs all to be a register instead and add this register to the
> > clobber list, so of course the patch was more than just a 2 liner, but it
> > was straightforward.
> 
> I'm quite skeptical that it's a compiler bug instead of a bug on our
> end.  If we have a bug in our constraints, I want to fix that bug rather
> than papering over it such that we just don't tickle it on your
> particular compiler/flags combination.  Even if it's a compiler bug, we
> should figure it out and report it.
> 
> If you don't have the time to figure out the root cause, let's see if I
> can.  What compiler flags are you seeing used in your build?  I've been
> using piglit's texsubimage to test with various compiler flags to try to
> reproduce your issue, and haven't managed to with a debug,
> debugoptimized or release build in meson on Mesa master.

I'm just using the standard aur build on a clean system with no customizations
installed. the meson command being used that includes that .h file is:

cc -Isrc/gallium/drivers/vc4/691f666@@vc4@sta -Isrc/gallium/drivers/vc4
-I../mesa/src/gallium/drivers/vc4 -Isrc -I../mesa/src -Iinclude
-I../mesa/include -I../mesa/src/gallium/include -Isrc/gallium/auxiliary
-I../mesa/src/gallium/auxiliary -Isrc/broadcom -I../mesa/src/broadcom
-Isrc/broadcom/cle -I../mesa/src/broadcom/cle -Isrc/gallium/drivers
-I../mesa/src/gallium/drivers -I../mesa/include/drm-uapi -Isrc/compiler/nir
-I../mesa/src/compiler/nir -I/usr/include/libdrm -fdiagnostics-color=always
-DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -std=c99 -D__STDC_CONSTANT_MACROS
-D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
'-DPACKAGE_VERSION=\"19.0.0-devel\"'
'-DPACKAGE_BUGREPORT=\"https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa\"'
-DGLX_USE_TLS -DHAVE_ST_VDPAU -DENABLE_ST_OMX_BELLAGIO=1
-DENABLE_ST_OMX_TIZONIA=0 -DHAVE_X11_PLATFORM -DGLX_INDIRECT_RENDERING
-DGLX_DIRECT_RENDERING -DGLX_USE_DRM -DHAVE_DRM_PLATFORM
-DHAVE_SURFACELESS_PLATFORM -DENABLE_SHADER_CACHE -DHAVE___BUILTIN_BSWAP32
-DHAVE___BUILTIN_BSWAP64 -DHAVE___BUILTIN_CLZ -DHAVE___BUILTIN_CLZLL
-DHAVE___BUILTIN_CTZ -DHAVE___BUILTIN_EXPECT -DHAVE___BUILTIN_FFS
-DHAVE___BUILTIN_FFSLL -DHAVE___BUILTIN_POPCOUNT -DHAVE___BUILTIN_POPCOUNTLL
-DHAVE___BUILTIN_UNREACHABLE -DHAVE_FUNC_ATTRIBUTE_CONST
-DHAVE_FUNC_ATTRIBUTE_FLATTEN -DHAVE_FUNC_ATTRIBUTE_MALLOC
-DHAVE_FUNC_ATTRIBUTE_PURE -DHAVE_FUNC_ATTRIBUTE_UNUSED
-DHAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT -DHAVE_FUNC_ATTRIBUTE_WEAK
-DHAVE_FUNC_ATTRIBUTE_FORMAT -DHAVE_FUNC_ATTRIBUTE_PACKED
-DHAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL -DHAVE_FUNC_ATTRIBUTE_VISIBILITY
-DHAVE_FUNC_ATTRIBUTE_ALIAS

[Mesa-dev] [PATCH 4/4] radv: reduce the number of loaded channels for vertex input fetches

It's unnecessary to load more channels than the vertex attribute
format. The remaining channels are filled with 0 for y and z,
and 1 for w.

29077 shaders in 15096 tests
Totals:
SGPRS: 1321605 -> 1318869 (-0.21 %)
VGPRS: 935236 -> 932252 (-0.32 %)
Spilled SGPRs: 24860 -> 24776 (-0.34 %)
Code Size: 49832348 -> 49819464 (-0.03 %) bytes
Max Waves: 242101 -> 242611 (0.21 %)

Totals from affected shaders:
SGPRS: 93675 -> 90939 (-2.92 %)
VGPRS: 58016 -> 55032 (-5.14 %)
Spilled SGPRs: 172 -> 88 (-48.84 %)
Code Size: 2862740 -> 2849856 (-0.45 %) bytes
Max Waves: 15474 -> 15984 (3.30 %)

This mostly helps Croteam games (Talos/Sam2017).

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 83 ++-
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 7f74678d5f1..b1e0c64e4e1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1967,6 +1967,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context 
*ctx,
return alpha;
 }
 
+static unsigned
+get_num_channels_from_data_format(unsigned data_format)
+{
+   switch (data_format) {
+   case V_008F0C_BUF_DATA_FORMAT_8:
+   case V_008F0C_BUF_DATA_FORMAT_16:
+   case V_008F0C_BUF_DATA_FORMAT_32:
+   return 1;
+   case V_008F0C_BUF_DATA_FORMAT_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32:
+   return 2;
+   case V_008F0C_BUF_DATA_FORMAT_10_11_11:
+   case V_008F0C_BUF_DATA_FORMAT_11_11_10:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+   return 3;
+   case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
+   case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+   case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+   return 4;
+   default:
+   break;
+   }
+
+   return 4;
+}
+
+static LLVMValueRef
+radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
+   LLVMValueRef value,
+   unsigned num_channels,
+   bool is_float)
+{
+   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
+   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
+   LLVMTypeRef elemtype;
+   LLVMValueRef chan[4];
+
+   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+   unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+
+   if (num_channels == 4 && vec_size == 4)
+   return value;
+
+   num_channels = MIN2(num_channels, vec_size);
+
+   for (unsigned i = 0; i < num_channels; i++)
+   chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
+
+   elemtype = LLVMGetElementType(LLVMTypeOf(value));
+   } else {
+   if (num_channels) {
+   assert(num_channels == 1);
+   chan[0] = value;
+   }
+   elemtype = LLVMTypeOf(value);
+   }
+
+   for (unsigned i = num_channels; i < 4; i++)
+   chan[i] = i == 3 ? one : zero;
+
+   return ac_build_gather_values(&ctx->ac, chan, 4);
+}
+
 static void
 handle_vs_input_decl(struct radv_shader_context *ctx,
 struct nir_variable *variable)
@@ -1979,7 +2045,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
unsigned attrib_count = glsl_count_attribute_slots(variable->type, 
true);
uint8_t input_usage_mask =

ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
-   unsigned num_channels = util_last_bit(input_usage_mask);
+   unsigned num_input_channels = util_last_bit(input_usage_mask);
 
variable->data.driver_location = variable->data.location * 4;
 
@@ -1987,6 +2053,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
for (unsigned i = 0; i < attrib_count; ++i) {
LLVMValueRef output[4];
unsigned attrib_index = variable->data.location + i - 
VERT_ATTRIB_GENERIC0;
+   unsigned attrib_format = 
ctx->options->key.vs.vertex_attribute_formats[attrib_index];
+   unsigned data_format = attrib_format & 0x0f;
+   unsigned num_format = (attrib_format >> 4) & 0x07;
+   bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;
 
if (ctx->options->key.vs.instance_rate_inputs & (1u << 
attrib_index)) {
uint32_t divisor = 
ctx->options->key.vs.instance_rate_divisors[attrib_index];
@@ -2018,12 +2088,21 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
+   /* Adjust the number of channels to load based on the vertex
+

[Mesa-dev] [PATCH 1/4] ac: make use of ac_build_expand_to_vec4() in visit_image_store()

And make ac_build_expand() a static function.

Signed-off-by: Samuel Pitoiset 
---
 src/amd/common/ac_llvm_build.c  | 9 +
 src/amd/common/ac_llvm_build.h  | 3 ---
 src/amd/common/ac_nir_to_llvm.c | 2 +-
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 9395bd1bbda..d06eb7df50c 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -548,10 +548,11 @@ ac_build_gather_values(struct ac_llvm_context *ctx,
 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
  * channels with undef. Extract at most src_channels components from the input.
  */
-LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx,
-LLVMValueRef value,
-unsigned src_channels,
-unsigned dst_channels)
+static LLVMValueRef
+ac_build_expand(struct ac_llvm_context *ctx,
+   LLVMValueRef value,
+   unsigned src_channels,
+   unsigned dst_channels)
 {
LLVMTypeRef elemtype;
LLVMValueRef chan[dst_channels];
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index f218eaf2832..7f8e2398a25 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -171,9 +171,6 @@ LLVMValueRef
 ac_build_gather_values(struct ac_llvm_context *ctx,
   LLVMValueRef *values,
   unsigned value_count);
-LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx,
-LLVMValueRef value,
-unsigned src_channels, unsigned dst_channels);
 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 LLVMValueRef value,
 unsigned num_channels);
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 54559b19f02..e06b00a34e9 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2487,7 +2487,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
unsigned src_channels = ac_get_llvm_num_components(src);
 
if (src_channels == 3)
-   src = ac_build_expand(&ctx->ac, src, 3, 4);
+   src = ac_build_expand_to_vec4(&ctx->ac, src, 3);
 
params[0] = src; /* data */
params[1] = rsrc;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 3/4] radv: store vertex attribute formats as pipeline keys

The formats will be used for reducing the number of loaded channels.

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_pipeline.c | 22 +++---
 src/amd/vulkan/radv_private.h  |  1 +
 src/amd/vulkan/radv_shader.h   |  1 +
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index fb6c61cf3f0..b5de53537d3 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1874,13 +1874,27 @@ radv_generate_graphics_pipeline_key(struct 
radv_pipeline *pipeline,
}
 
for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; 
++i) {
-   unsigned location = 
input_state->pVertexAttributeDescriptions[i].location;
-   unsigned binding = 
input_state->pVertexAttributeDescriptions[i].binding;
+   const VkVertexInputAttributeDescription *desc =
+   &input_state->pVertexAttributeDescriptions[i];
+   const struct vk_format_description *format_desc;
+   unsigned location = desc->location;
+   unsigned binding = desc->binding;
+   unsigned num_format, data_format;
+   int first_non_void;
+
if (binding_input_rate & (1u << binding)) {
key.instance_rate_inputs |= 1u << location;
key.instance_rate_divisors[location] = 
instance_rate_divisors[binding];
}
 
+   format_desc = vk_format_description(desc->format);
+   first_non_void = 
vk_format_get_first_non_void_channel(desc->format);
+
+   num_format = radv_translate_buffer_numformat(format_desc, 
first_non_void);
+   data_format = radv_translate_buffer_dataformat(format_desc, 
first_non_void);
+
+   key.vertex_attribute_formats[location] = data_format | 
(num_format << 4);
+
if (pipeline->device->physical_device->rad_info.chip_class <= 
VI &&
pipeline->device->physical_device->rad_info.family != 
CHIP_STONEY) {
VkFormat format = 
input_state->pVertexAttributeDescriptions[i].format;
@@ -1932,8 +1946,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key 
*keys,
 {
keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = 
key->instance_rate_inputs;
keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust;
-   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i)
+   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) {
keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = 
key->instance_rate_divisors[i];
+   keys[MESA_SHADER_VERTEX].vs.vertex_attribute_formats[i] = 
key->vertex_attribute_formats[i];
+   }
 
if (nir[MESA_SHADER_TESS_CTRL]) {
keys[MESA_SHADER_VERTEX].vs.as_ls = true;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index d4a9e9f0e95..0e6b7aca61c 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -362,6 +362,7 @@ struct radv_pipeline_cache {
 struct radv_pipeline_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+   uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS];
uint64_t vertex_alpha_adjust;
unsigned tess_input_vertices;
uint32_t col_format;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index b67cd2b4f15..1f440b0d54e 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -65,6 +65,7 @@ enum {
 struct radv_vs_variant_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+   uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS];
 
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega 
HW.
 * so we may need to fix it up. */
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109543] After upgrade mesa to 19.0.0~rc1 all vulkan based application stop working ["vulkan-cube" received SIGSEGV in radv_pipeline_init_blend_state at ../src/amd/vulkan/radv_pipeline.

https://bugs.freedesktop.org/show_bug.cgi?id=109543

Samuel Pitoiset  changed:

   What|Removed |Added

 Resolution|--- |FIXED
 Status|NEW |RESOLVED

--- Comment #14 from Samuel Pitoiset  ---
Fixed.

https://cgit.freedesktop.org/mesa/mesa/commit/?id=129a9f4937b8f2adb4d37999677d748d816d611c

-rc3 will have the fix.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 109615] 19.0.0_rc2 fails u_format_test on ppc64