Re: [Mesa-dev] [PATCH v2 15/20] i965/cs: Emit compute shader code and upload programs

2015-04-27 Thread Kenneth Graunke
On Friday, April 24, 2015 04:33:07 PM Jordan Justen wrote:
 v2:
  * Don't bother checking for 'gen  5' (krh)
  * Populate sampler data in key (krh)
 
 Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
 ---
  src/mesa/drivers/dri/i965/brw_context.h  |   1 +
  src/mesa/drivers/dri/i965/brw_cs.cpp | 224 
 +++
  src/mesa/drivers/dri/i965/brw_state_upload.c |   3 +
  3 files changed, 228 insertions(+)
 
 diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
 b/src/mesa/drivers/dri/i965/brw_context.h
 index 56827d8..9e13c59 100644
 --- a/src/mesa/drivers/dri/i965/brw_context.h
 +++ b/src/mesa/drivers/dri/i965/brw_context.h
 @@ -148,6 +148,7 @@ struct brw_vs_prog_key;
  struct brw_vue_prog_key;
  struct brw_wm_prog_key;
  struct brw_wm_prog_data;
 +struct brw_cs_prog_key;
  struct brw_cs_prog_data;
  
  enum brw_pipeline {
 diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp 
 b/src/mesa/drivers/dri/i965/brw_cs.cpp
 index 8021147..648f0f0 100644
 --- a/src/mesa/drivers/dri/i965/brw_cs.cpp
 +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
 @@ -22,8 +22,15 @@
   */
  
  
 +#include util/ralloc.h
  #include brw_context.h
  #include brw_cs.h
 +#include brw_fs.h
 +#include brw_eu.h
 +#include brw_wm.h
 +#include intel_mipmap_tree.h
 +#include brw_state.h
 +#include intel_batchbuffer.h
  
  extern C
  bool
 @@ -46,3 +53,220 @@ brw_cs_prog_data_compare(const void *in_a, const void 
 *in_b)
  
 return true;
  }
 +
 +
 +static const unsigned *
 +brw_cs_emit(struct brw_context *brw,
 +void *mem_ctx,
 +const struct brw_cs_prog_key *key,
 +struct brw_cs_prog_data *prog_data,
 +struct gl_compute_program *cp,
 +struct gl_shader_program *prog,
 +unsigned *final_assembly_size)
 +{
 +   bool start_busy = false;
 +   double start_time = 0;
 +
 +   if (unlikely(brw-perf_debug)) {
 +  start_busy = (brw-batch.last_bo 
 +drm_intel_bo_busy(brw-batch.last_bo));
 +  start_time = get_time();
 +   }
 +
 +   struct brw_shader *shader = NULL;
 +   if (prog)
 +  shader = (struct brw_shader *) 
 prog-_LinkedShaders[MESA_SHADER_COMPUTE];
 +
 +   if (unlikely(INTEL_DEBUG  DEBUG_CS))
 +  brw_dump_ir(compute, prog, shader-base, cp-Base);
 +
 +   /* Now the main event: Visit the shader IR and generate our CS IR for it.
 +*/
 +   fs_visitor v(brw, mem_ctx, key, prog_data, prog, cp, 8);
 +   if (!v.run_cs()) {
 +  if (prog) {
 + prog-LinkStatus = false;
 + ralloc_strcat(prog-InfoLog, v.fail_msg);
 +  }
 +
 +  _mesa_problem(NULL, Failed to compile fragment shader: %s\n,
 +v.fail_msg);
 +
 +  return NULL;
 +   }
 +
 +   cfg_t *simd16_cfg = NULL;
 +   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, cp, 16);
 +   if (likely(!(INTEL_DEBUG  DEBUG_NO16))) {

I know some programs may require SIMD16 shaders to even function (on
some hardware), and that you're planning to make SIMD16 compute shader
compilation never fail (at some point).  So we may want to rework this
logic eventually.  But it's probably OK for now.

 +  if (!v.simd16_unsupported) {
 + /* Try a SIMD16 compile */
 + v2.import_uniforms(v);
 + if (!v2.run_cs()) {
 +perf_debug(SIMD16 shader failed to compile, falling back to 
 +   SIMD8 at a 10-20%% performance cost: %s, 
 v2.fail_msg);
 + } else {
 +simd16_cfg = v2.cfg;
 + }
 +  } else {
 + perf_debug(SIMD16 shader unsupported, falling back to 
 +SIMD8 at a 10-20%% performance cost: %s, v.no16_msg);
 +  }
 +   }

Let's eliminate the "at a 10-20% performance cost" part of the messages.
That was something Eric measured about fragment shading - the
performance characteristics of compute shaders will probably be entirely
different.

 +
 +   prog_data-local_size[0] = cp-LocalSize[0];
 +   prog_data-local_size[1] = cp-LocalSize[1];
 +   prog_data-local_size[2] = cp-LocalSize[2];
 +
 +   cfg_t *simd8_cfg;
 +   int no_simd8 = (INTEL_DEBUG  DEBUG_NO8) || brw-no_simd8;

I don't really see any value in supporting INTEL_DEBUG=no8 for compute
shaders.  Let's just drop this.

Why don't we have a single cfg_t *cfg variable?  After compiling the
SIMD8 program, set cfg = v.cfg and prog_data->simd_size = 8.  If we
successfully compile a SIMD16 program, set cfg = v2.cfg and override
prog_data->simd_size = 16.  Then pass cfg to fs_generator.

I think that'll streamline this code a fair bit.

 +   if (no_simd8  simd16_cfg) {
 +  simd8_cfg = NULL;
 +  prog_data-no_8 = true;
 +   } else {
 +  simd8_cfg = v.cfg;
 +  prog_data-no_8 = false;
 +   }
 +
 +   fs_generator g(brw, mem_ctx, (void*) key, prog_data-base, cp-Base,
 +  v.promoted_constants, v.runtime_check_aads_emit, CS);
 +   if (INTEL_DEBUG  DEBUG_CS) {
 +  char *name = ralloc_asprintf(mem_ctx, %s compute shader %d,
 +   

[Mesa-dev] [PATCH v2 15/20] i965/cs: Emit compute shader code and upload programs

2015-04-24 Thread Jordan Justen
v2:
 * Don't bother checking for 'gen  5' (krh)
 * Populate sampler data in key (krh)

Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
---
 src/mesa/drivers/dri/i965/brw_context.h  |   1 +
 src/mesa/drivers/dri/i965/brw_cs.cpp | 224 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c |   3 +
 3 files changed, 228 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 56827d8..9e13c59 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -148,6 +148,7 @@ struct brw_vs_prog_key;
 struct brw_vue_prog_key;
 struct brw_wm_prog_key;
 struct brw_wm_prog_data;
+struct brw_cs_prog_key;
 struct brw_cs_prog_data;
 
 enum brw_pipeline {
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp 
b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 8021147..648f0f0 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -22,8 +22,15 @@
  */
 
 
+#include util/ralloc.h
 #include brw_context.h
 #include brw_cs.h
+#include brw_fs.h
+#include brw_eu.h
+#include brw_wm.h
+#include intel_mipmap_tree.h
+#include brw_state.h
+#include intel_batchbuffer.h
 
 extern C
 bool
@@ -46,3 +53,220 @@ brw_cs_prog_data_compare(const void *in_a, const void *in_b)
 
return true;
 }
+
+
+static const unsigned *
+brw_cs_emit(struct brw_context *brw,
+void *mem_ctx,
+const struct brw_cs_prog_key *key,
+struct brw_cs_prog_data *prog_data,
+struct gl_compute_program *cp,
+struct gl_shader_program *prog,
+unsigned *final_assembly_size)
+{
+   bool start_busy = false;
+   double start_time = 0;
+
+   if (unlikely(brw-perf_debug)) {
+  start_busy = (brw-batch.last_bo 
+drm_intel_bo_busy(brw-batch.last_bo));
+  start_time = get_time();
+   }
+
+   struct brw_shader *shader = NULL;
+   if (prog)
+  shader = (struct brw_shader *) prog-_LinkedShaders[MESA_SHADER_COMPUTE];
+
+   if (unlikely(INTEL_DEBUG  DEBUG_CS))
+  brw_dump_ir(compute, prog, shader-base, cp-Base);
+
+   /* Now the main event: Visit the shader IR and generate our CS IR for it.
+*/
+   fs_visitor v(brw, mem_ctx, key, prog_data, prog, cp, 8);
+   if (!v.run_cs()) {
+  if (prog) {
+ prog-LinkStatus = false;
+ ralloc_strcat(prog-InfoLog, v.fail_msg);
+  }
+
+  _mesa_problem(NULL, Failed to compile fragment shader: %s\n,
+v.fail_msg);
+
+  return NULL;
+   }
+
+   cfg_t *simd16_cfg = NULL;
+   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, cp, 16);
+   if (likely(!(INTEL_DEBUG  DEBUG_NO16))) {
+  if (!v.simd16_unsupported) {
+ /* Try a SIMD16 compile */
+ v2.import_uniforms(v);
+ if (!v2.run_cs()) {
+perf_debug(SIMD16 shader failed to compile, falling back to 
+   SIMD8 at a 10-20%% performance cost: %s, v2.fail_msg);
+ } else {
+simd16_cfg = v2.cfg;
+ }
+  } else {
+ perf_debug(SIMD16 shader unsupported, falling back to 
+SIMD8 at a 10-20%% performance cost: %s, v.no16_msg);
+  }
+   }
+
+   prog_data-local_size[0] = cp-LocalSize[0];
+   prog_data-local_size[1] = cp-LocalSize[1];
+   prog_data-local_size[2] = cp-LocalSize[2];
+
+   cfg_t *simd8_cfg;
+   int no_simd8 = (INTEL_DEBUG  DEBUG_NO8) || brw-no_simd8;
+   if (no_simd8  simd16_cfg) {
+  simd8_cfg = NULL;
+  prog_data-no_8 = true;
+   } else {
+  simd8_cfg = v.cfg;
+  prog_data-no_8 = false;
+   }
+
+   fs_generator g(brw, mem_ctx, (void*) key, prog_data-base, cp-Base,
+  v.promoted_constants, v.runtime_check_aads_emit, CS);
+   if (INTEL_DEBUG  DEBUG_CS) {
+  char *name = ralloc_asprintf(mem_ctx, %s compute shader %d,
+   prog-Label ? prog-Label : unnamed,
+   prog-Name);
+  g.enable_debug(name);
+   }
+   if (simd16_cfg) {
+  prog_data-simd_size = 16;
+  g.generate_code(simd16_cfg, 16);
+   } else if (simd8_cfg) {
+  prog_data-simd_size = 8;
+  g.generate_code(simd8_cfg, 8);
+   }
+
+   if (unlikely(brw-perf_debug)  shader) {
+  if (shader-compiled_once) {
+ _mesa_problem(brw-ctx, CS programs shouldn't need recompiles);
+  }
+  shader-compiled_once = true;
+
+  if (start_busy  !drm_intel_bo_busy(brw-batch.last_bo)) {
+ perf_debug(CS compile took %.03f ms and stalled the GPU\n,
+(get_time() - start_time) * 1000);
+  }
+   }
+
+   return g.get_assembly(final_assembly_size);
+}
+
+static bool
+brw_codegen_cs_prog(struct brw_context *brw,
+struct gl_shader_program *prog,
+struct brw_compute_program *cp,
+struct brw_cs_prog_key *key)
+{
+   struct gl_context *ctx = brw-ctx;
+   const GLuint *program;
+   void *mem_ctx = ralloc_context(NULL);