[Mesa-dev] [PATCH] i965: Reduce code duplication in handling of depth, stencil, and HiZ.

2013-03-27 Thread Paul Berry
This patch consolidates duplicate code in the brw_depthbuffer and
gen7_depthbuffer state atoms.  Previously, these state atoms contained
5 chunks of code for emitting the _3DSTATE_DEPTH_BUFFER packet (3 for
Gen4-6 and 2 for Gen7).  Also a lot of logic for determining the
appropriate buffer setup was duplicated between the Gen4-6 and Gen7
functions.

This refactor splits the code into three separate functions:
brw_emit_depthbuffer(), which determines the appropriate buffer setup
in a mostly generation-independent way, brw_emit_depth_stencil_hiz(),
which emits the appropriate state packets for Gen4-6, and
gen7_emit_depth_stencil_hiz(), which emits the appropriate state
packets for Gen7.

Tested using Piglit on Gen5-7 (no regressions).
---
 src/mesa/drivers/dri/i965/brw_context.h |  23 
 src/mesa/drivers/dri/i965/brw_misc_state.c  | 189 +++-
 src/mesa/drivers/dri/i965/brw_vtbl.c|   2 +
 src/mesa/drivers/dri/i965/gen7_misc_state.c |  93 +-
 src/mesa/drivers/dri/intel/intel_context.h  |  16 +++
 5 files changed, 171 insertions(+), 152 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 8ff70c9..1ea038f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1347,6 +1347,29 @@ struct opcode_desc {
 
 extern const struct opcode_desc opcode_descs[128];
 
+void
+brw_emit_depthbuffer(struct brw_context *brw);
+
+void
+brw_emit_depth_stencil_hiz(struct brw_context *brw,
+   struct intel_mipmap_tree *depth_mt,
+   uint32_t depth_offset, uint32_t depthbuffer_format,
+   uint32_t depth_surface_type,
+   struct intel_mipmap_tree *stencil_mt,
+   struct intel_mipmap_tree *hiz_mt,
+   bool separate_stencil, uint32_t width,
+   uint32_t height, uint32_t tile_x, uint32_t tile_y);
+
+void
+gen7_emit_depth_stencil_hiz(struct brw_context *brw,
+struct intel_mipmap_tree *depth_mt,
+uint32_t depth_offset, uint32_t depthbuffer_format,
+uint32_t depth_surface_type,
+struct intel_mipmap_tree *stencil_mt,
+struct intel_mipmap_tree *hiz_mt,
+bool separate_stencil, uint32_t width,
+uint32_t height, uint32_t tile_x, uint32_t tile_y);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c 
b/src/mesa/drivers/dri/i965/brw_misc_state.c
index d6bd86c..3821eda 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -561,7 +561,8 @@ brw_workaround_depthstencil_alignment(struct brw_context 
*brw,
}
 }
 
-static void emit_depthbuffer(struct brw_context *brw)
+void
+brw_emit_depthbuffer(struct brw_context *brw)
 {
struct intel_context *intel = brw-intel;
struct gl_context *ctx = intel-ctx;
@@ -574,20 +575,23 @@ static void emit_depthbuffer(struct brw_context *brw)
struct intel_mipmap_tree *hiz_mt = brw-depthstencil.hiz_mt;
uint32_t tile_x = brw-depthstencil.tile_x;
uint32_t tile_y = brw-depthstencil.tile_y;
-   unsigned int len;
bool separate_stencil = false;
+   uint32_t depth_surface_type = BRW_SURFACE_NULL;
+   uint32_t depthbuffer_format = BRW_DEPTHFORMAT_D32_FLOAT;
+   uint32_t depth_offset = 0;
+   uint32_t width = 1, height = 1;
 
-   if (stencil_mt  stencil_mt-format == MESA_FORMAT_S8)
-  separate_stencil = true;
+   if (stencil_mt) {
+  separate_stencil = stencil_mt-format == MESA_FORMAT_S8;
 
-   /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both
-* non-pipelined state that will need the PIPE_CONTROL workaround.
-*/
-   if (intel-gen == 6) {
-  intel_emit_post_sync_nonzero_flush(intel);
-  intel_emit_depth_stall_flushes(intel);
+  /* Gen7 only supports separate stencil */
+  assert(separate_stencil || intel-gen  7);
}
 
+   /* Gen7 only supports separate stencil */
+   assert(intel-gen  6 || !depth_mt ||
+  !_mesa_is_format_packed_depth_stencil(depth_mt-format));
+
/* If there's a packed depth/stencil bound to stencil only, we need to
 * emit the packed depth/stencil buffer packet.
 */
@@ -596,31 +600,21 @@ static void emit_depthbuffer(struct brw_context *brw)
   depth_mt = stencil_mt;
}
 
-   if (intel-gen = 6)
-  len = 7;
-   else if (intel-is_g4x || intel-gen == 5)
-  len = 6;
-   else
-  len = 5;
-
-   if (!depth_irb  !separate_stencil) {
-  BEGIN_BATCH(len);
-  OUT_BATCH(_3DSTATE_DEPTH_BUFFER  16 | (len - 2));
-  OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT  18) |
-   (BRW_SURFACE_NULL  29));
-  OUT_BATCH(0);
-  OUT_BATCH(0);
-  OUT_BATCH(0);
-
-  if (intel-is_g4x || intel-gen = 5)
-   

Re: [Mesa-dev] Very low framerate when recording desktop content in Weston using mesa git on Radeon 5770 (glReadPixels slow path)

2013-03-27 Thread Bengt Richter

On 03/26/2013 10:51 PM Matt Turner wrote:

On Tue, Mar 26, 2013 at 2:44 PM, Bengt Richterb...@oz.net  wrote:

uint32_t
component_delta2(uint32_t next, uint32_t prev)
{
 return next0xff00ff)-(prev0xff00ff)+0x100)0xff00ff)+
 (((next0xff00)-(prev0xff00))0xff00));
}


Does removing all the spaces make it faster? ;)

LOL .. actually I didn't put them in in the first place ;-)
But inlining might make the calling loop faster.

Hm, easy to try now ... inlining cut the time almost in half again.
I assigned to a volatile so the loop wouldn't get optimized away.
I just have a loop in the test kludge like

else if (strcmp(argv[1],"v2")==0){
for (i=0;i<256*256*256;++i){
antiO2 = component_delta2(i, i^0xff);
}
}

For the above with inlined component_delta2 I get
55ms vs 95ms not inlined, vs orig not inlined 167ms, FWIW.
My optimization reflex just got triggered, I didn't look at the
full post context to see if it might really be useful or not.

Regards,
Bengt Richter

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Haiku using the Wayland state_tracker?

2013-03-27 Thread Pekka Paalanen
On Tue, 26 Mar 2013 16:47:15 -0500
Alexander von Gluck IV kallis...@unixzen.com wrote:

 I've been hitting several brick walls working on the Haiku GL stuff. 
 (mostly due to things being too complex for the time I have available)
 
 Given all the recent publicity for Wayland, I decided to look into the 
 wl_shm stuff.  From my point of view wl_shm looks *extremely* simple.
 
 Does anyone see any reason *not* to use the wayland state_tracker for 
 Haiku's GL rendering? The only possible down side I see is using EGL vs 
 OpenGL. (I admit I don't fully understand the pros and cons of EGL)

I'm not familiar with the wayland state_tracker, so I can only
comment from the Wayland protocol perspective. I'm not sure what you
intend to do with wl_shm, either. Are you implementing Wayland
platform support?

If you are only ever going to have software rendered GL, then I guess
you might use wl_shm. If you have any reason to believe you might ever
want hardware accelerated GL, then wl_shm won't work. (Actually, you
probably want to choose between wl_shm and something else according to
your renderer. Maybe.)

wl_shm basically deals with mmappable files, i.e. directly
CPU-accessible memory. Buffers suitable for hardware rendering or
texturing are often not CPU-accessible, or extremely slow for that.
Conversely, CPU-accessible memory is often not usable for GPU, or is
slow. And you really don't want to have extra copies between CPU and
GPU memory, especially just for buffer passing.

Mesa contains another Wayland protocol interface used for hardware
accelerated graphics buffers: wl_drm.

Also, EGL vs. OpenGL is like comparing a bucket to paint. EGL is just
one form of a bucket, that can give you OpenGL as the paint. There are
other buckets, and other paints, and you cannot use a bucket as paint,
nor paint as a bucket. Probably I just didn't understand what you are
actually comparing here. (and sorry for a bad analogy :-p)

I have a blog post about Wayland, that is maybe not directly related to
your question, but might give some insight, I hope:
http://ppaalanen.blogspot.fi/2012/11/on-supporting-wayland-gl-clients-and.html


Thanks,
pq
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] radeonsi: add instance divisor support v2

2013-03-27 Thread Christian König
From: Christian König christian.koe...@amd.com

v2: reduce key size, don't copy key around too much.

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   67 +++-
 src/gallium/drivers/radeonsi/radeonsi_shader.h |   24 +
 src/gallium/drivers/radeonsi/si_state.c|   44 +---
 src/gallium/drivers/radeonsi/si_state_draw.c   |   18 +--
 4 files changed, 94 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 0512528..5fdf46e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -54,11 +54,9 @@
 struct si_shader_context
 {
struct radeon_llvm_context radeon_bld;
-   struct r600_context *rctx;
struct tgsi_parse_context parse;
struct tgsi_token * tokens;
struct si_pipe_shader *shader;
-   struct si_shader_key key;
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
LLVMValueRef const_md;
LLVMValueRef const_resource;
@@ -112,22 +110,41 @@ static LLVMValueRef build_indexed_load(
return result;
 }
 
+static LLVMValueRef get_instance_index(
+   struct radeon_llvm_context * radeon_bld,
+   unsigned divisor)
+{
+   struct gallivm_state * gallivm = radeon_bld-soa.bld_base.base.gallivm;
+
+   LLVMValueRef result = LLVMGetParam(radeon_bld-main_fn, 
SI_PARAM_INSTANCE_ID);
+   result = LLVMBuildAdd(gallivm-builder, result, LLVMGetParam(
+   radeon_bld-main_fn, SI_PARAM_START_INSTANCE), );
+
+   if (divisor > 1)
+   result = LLVMBuildUDiv(gallivm-builder, result,
+   lp_build_const_int32(gallivm, divisor), );
+
+   return result;
+}
+
 static void declare_input_vs(
struct si_shader_context * si_shader_ctx,
unsigned input_index,
const struct tgsi_full_declaration *decl)
 {
+   struct lp_build_context * base = 
si_shader_ctx-radeon_bld.soa.bld_base.base;
+   unsigned divisor = 
si_shader_ctx-shader-key.vs.instance_divisors[input_index];
+
+   unsigned chan;
+
LLVMValueRef t_list_ptr;
LLVMValueRef t_offset;
LLVMValueRef t_list;
LLVMValueRef attribute_offset;
-   LLVMValueRef buffer_index_reg;
+   LLVMValueRef buffer_index;
LLVMValueRef args[3];
LLVMTypeRef vec4_type;
LLVMValueRef input;
-   struct lp_build_context * base = 
si_shader_ctx-radeon_bld.soa.bld_base.base;
-   //struct pipe_vertex_element *velem = 
rctx-vertex_elements-elements[input_index];
-   unsigned chan;
 
/* Load the T list */
t_list_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_BUFFER);
@@ -139,14 +156,20 @@ static void declare_input_vs(
/* Build the attribute offset */
attribute_offset = lp_build_const_int32(base-gallivm, 0);
 
-   /* Load the buffer index, which is always stored in VGPR0
-* for Vertex Shaders */
-   buffer_index_reg = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_ID);
+   if (divisor) {
+   /* Build index from instance ID, start instance and divisor */
+   si_shader_ctx-shader-shader.uses_instanceid = true;
+   buffer_index = get_instance_index(si_shader_ctx-radeon_bld, 
divisor);
+   } else {
+   /* Load the buffer index, which is always stored in VGPR0
+* for Vertex Shaders */
+   buffer_index = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_ID);
+   }
 
vec4_type = LLVMVectorType(base-elem_type, 4);
args[0] = t_list;
args[1] = attribute_offset;
-   args[2] = buffer_index_reg;
+   args[2] = buffer_index;
input = build_intrinsic(base-gallivm-builder,
llvm.SI.vs.load.input, vec4_type, args, 3,
LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -239,7 +262,7 @@ static void declare_input_fs(
/* XXX: Handle all possible interpolation modes */
switch (decl-Interp.Interpolate) {
case TGSI_INTERPOLATE_COLOR:
-   if (si_shader_ctx-key.flatshade) {
+   if (si_shader_ctx-shader-key.ps.flatshade) {
interp_param = 0;
} else {
if (decl-Interp.Centroid)
@@ -272,7 +295,7 @@ static void declare_input_fs(
 
/* XXX: Could there be more than TGSI_NUM_CHANNELS (4) ? */
if (decl-Semantic.Name == TGSI_SEMANTIC_COLOR 
-   si_shader_ctx-key.color_two_side) {
+   si_shader_ctx-shader-key.ps.color_two_side) {
LLVMValueRef args[4];
LLVMValueRef face, is_face_positive;
LLVMValueRef back_attr_number =
@@ -351,15 +374,12 @@ static void declare_system_value(
unsigned 

Re: [Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support

2013-03-27 Thread Christian König

Am 26.03.2013 18:03, schrieb Michel Dänzer:

On Die, 2013-03-26 at 17:37 +0100, Christian König wrote:

Am 26.03.2013 15:56, schrieb Michel Dänzer:

On Die, 2013-03-26 at 14:51 +0100, Christian König wrote:

From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
[...]
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
b/src/gallium/drivers/radeonsi/radeonsi_shader.h
index 9dae742..e09f297 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -111,13 +111,18 @@ struct si_shader {
unsignednr_cbufs;
   };
   
-struct si_shader_key {

-   unsignedexport_16bpc:8;
-   unsignednr_cbufs:4;
-   unsignedcolor_two_side:1;
-   unsignedalpha_func:3;
-   unsignedflatshade:1;
-   float   alpha_ref;
+union si_shader_key {
+   struct {
+   unsignedexport_16bpc:8;
+   unsignednr_cbufs:4;
+   unsignedcolor_two_side:1;
+   unsignedalpha_func:3;
+   unsignedflatshade:1;
+   float   alpha_ref;
+   } ps;
+   struct {
+   unsignedinstance_divisors[PIPE_MAX_ATTRIBS];
+   } vs;
   };

This grows the shader key from 8 to 128 bytes. I don't suppose the
instance divisors could be encoded in a more compact way? E.g. loading
the divisor values from constants and only tracking which elements use a
divisor in a bitmask in the key.

Considered that also, and I have two problems with that approach:
1. While immediates are converted to shifts  muls, dividing even by a
constant in the shader isn't cheap.

Is that really significant? How much work would it be to come up with a
worst case test and measure the difference?


Well no idea how to measure that on SI, but when I implemented the same 
feature on R600 the difference between using reciprocal and mul compared 
to mulhi was quite significant.





How about storing only a byte for the instance_divisor? That limits the
divisor to a modulo of 256, but I don't think that would be so extremely bad.

I have no idea what the impact of that would be. What happens if an app
tries to use a divisor = 256?


It probably would select the wrong shader :(


That would reduce the key to 32 bytes instead.

Still seems kind of big.


Ok how about the following compromise: First we use a short for the 
instance divisor, that makes the key 32 bytes in size and should leave 
enough room for larger instance divisors, and second we don't copy the 
key around so much any more.


Regards,
Christian.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support

2013-03-27 Thread Christian König

Am 27.03.2013 12:02, schrieb Christian König:

Am 26.03.2013 18:03, schrieb Michel Dänzer:

On Die, 2013-03-26 at 17:37 +0100, Christian König wrote:

Am 26.03.2013 15:56, schrieb Michel Dänzer:

On Die, 2013-03-26 at 14:51 +0100, Christian König wrote:

From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
[...]
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
b/src/gallium/drivers/radeonsi/radeonsi_shader.h

index 9dae742..e09f297 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -111,13 +111,18 @@ struct si_shader {
   unsignednr_cbufs;
   };
   -struct si_shader_key {
-unsignedexport_16bpc:8;
-unsignednr_cbufs:4;
-unsignedcolor_two_side:1;
-unsignedalpha_func:3;
-unsignedflatshade:1;
-floatalpha_ref;
+union si_shader_key {
+struct {
+unsignedexport_16bpc:8;
+unsignednr_cbufs:4;
+unsignedcolor_two_side:1;
+unsignedalpha_func:3;
+unsignedflatshade:1;
+floatalpha_ref;
+} ps;
+struct {
+unsignedinstance_divisors[PIPE_MAX_ATTRIBS];
+} vs;
   };

This grows the shader key from 8 to 128 bytes. I don't suppose the
instance divisors could be encoded in a more compact way? E.g. loading
the divisor values from constants and only tracking which elements 
use a

divisor in a bitmask in the key.

Considered that also, and I have two problems with that approach:
1. While immediates are converted to shifts  muls, dividing even by a
constant in the shader isn't cheap.

Is that really significant? How much work would it be to come up with a
worst case test and measure the difference?


Well no idea how to measure that on SI, but when I implemented the 
same feature on R600 the difference between using reciprocal and mul 
compared to mulhi where quite significant.




How about storing only a byte for the instance_divisor? That limit's 
the
divisor to a modulo of 256, but I don't think that would be so 
extremly bad.

I have no idea what the impact of that would be. What happens if an app
tries to use a divisor = 256?


It probably would select the wrong shader :(


That would reduce the key to 32 bytes instead.

Still seems kind of big.


Ok how about the following compromise: First we use a short for the 
instance divisor, that makes the key 32 bytes in size and should leave 
enough room for larger instance divisors, and second we don't copy the 
key around so much any more.


Oops, I wanted to write 64 bytes in size, sorry.

Christian.



Regards,
Christian.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support

2013-03-27 Thread Michel Dänzer
On Mit, 2013-03-27 at 12:02 +0100, Christian König wrote: 
 Am 26.03.2013 18:03, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 17:37 +0100, Christian König wrote:
  Am 26.03.2013 15:56, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 14:51 +0100, Christian König wrote:
  From: Christian König christian.koe...@amd.com
 
  Signed-off-by: Christian König christian.koe...@amd.com
  [...]
  diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
  b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  index 9dae742..e09f297 100644
  --- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
  +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  @@ -111,13 +111,18 @@ struct si_shader {
   unsignednr_cbufs;
 };
 
  -struct si_shader_key {
  -unsignedexport_16bpc:8;
  -unsignednr_cbufs:4;
  -unsignedcolor_two_side:1;
  -unsignedalpha_func:3;
  -unsignedflatshade:1;
  -float   alpha_ref;
  +union si_shader_key {
  +struct {
  +unsignedexport_16bpc:8;
  +unsignednr_cbufs:4;
  +unsignedcolor_two_side:1;
  +unsignedalpha_func:3;
  +unsignedflatshade:1;
  +float   alpha_ref;
  +} ps;
  +struct {
  +unsignedinstance_divisors[PIPE_MAX_ATTRIBS];
  +} vs;
 };
  This grows the shader key from 8 to 128 bytes. I don't suppose the
  instance divisors could be encoded in a more compact way? E.g. loading
  the divisor values from constants and only tracking which elements use a
  divisor in a bitmask in the key.
  Considered that also, and I have two problems with that approach:
  1. While immediates are converted to shifts  muls, dividing even by a
  constant in the shader isn't cheap.
  Is that really significant? How much work would it be to come up with a
  worst case test and measure the difference?
 
 Well no idea how to measure that on SI,

I'd guess something like: With otherwise trivial vertex and pixel
shaders, draw huge numbers of triangles generating one pixel each, and
measure how long it takes.


 but when I implemented the same feature on R600 the difference between
 using reciprocal and mul compared to mulhi where quite significant.

I don't see anything about this in the r600g shader key though. What's
the difference?


  How about storing only a byte for the instance_divisor? That limit's the
  divisor to a modulo of 256, but I don't think that would be so extremly 
  bad.
  I have no idea what the impact of that would be. What happens if an app
  tries to use a divisor = 256?
 
 It probably would select the wrong shader :(
 
  That would reduce the key to 32 bytes instead.
  Still seems kind of big.
 
 Ok how about the following compromise: First we use a short for the 
 instance divisor, that makes the key 32 bytes in size and should leave 
 enough room for larger instance divisors, and second we don't copy the 
 key around so much any more.

This still won't work correctly for some legal divisor values, right? So
I'm afraid this is a flawed compromise, as it doesn't really address the
key size issues (potentially huge number of shader variants, cycles
spent in memcmp, memory usage) either. If those can't be addressed, it
should at least handle all legal values correctly.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] radeonsi: Handle arbitrary 2-byte formats in resource_copy_region

2013-03-27 Thread Michel Dänzer
From: Michel Dänzer michel.daen...@amd.com

Fixes mplayer -vo vdpau OSD.

Reported-by: Igor Vagulin igor.vagu...@gmail.com
Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 src/gallium/drivers/radeonsi/r600_blit.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/r600_blit.c 
b/src/gallium/drivers/radeonsi/r600_blit.c
index eb69cd5..f9d2568 100644
--- a/src/gallium/drivers/radeonsi/r600_blit.c
+++ b/src/gallium/drivers/radeonsi/r600_blit.c
@@ -417,6 +417,12 @@ static void r600_resource_copy_region(struct pipe_context 
*ctx,
r600_change_format(dst, dst_level, orig_info[1],
   PIPE_FORMAT_R8_UNORM);
break;
+   case 2:
+   r600_change_format(src, src_level, orig_info[0],
+  PIPE_FORMAT_R8G8_UNORM);
+   r600_change_format(dst, dst_level, orig_info[1],
+  PIPE_FORMAT_R8G8_UNORM);
+   break;
case 4:
r600_change_format(src, src_level, orig_info[0],
   PIPE_FORMAT_R8G8B8A8_UNORM);
-- 
1.8.2.rc3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: Handle arbitrary 2-byte formats in resource_copy_region

2013-03-27 Thread Christian König

Am 27.03.2013 12:44, schrieb Michel Dänzer:

From: Michel Dänzer michel.daen...@amd.com

Fixes mplayer -vo vdpau OSD.

Reported-by: Igor Vagulin igor.vagu...@gmail.com
Signed-off-by: Michel Dänzer michel.daen...@amd.com


Reviewed-by: Christian König christian.koe...@amd.com
Tested-by: Christian König christian.koe...@amd.com


---
  src/gallium/drivers/radeonsi/r600_blit.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/r600_blit.c 
b/src/gallium/drivers/radeonsi/r600_blit.c
index eb69cd5..f9d2568 100644
--- a/src/gallium/drivers/radeonsi/r600_blit.c
+++ b/src/gallium/drivers/radeonsi/r600_blit.c
@@ -417,6 +417,12 @@ static void r600_resource_copy_region(struct pipe_context 
*ctx,
r600_change_format(dst, dst_level, orig_info[1],
   PIPE_FORMAT_R8_UNORM);
break;
+   case 2:
+   r600_change_format(src, src_level, orig_info[0],
+  PIPE_FORMAT_R8G8_UNORM);
+   r600_change_format(dst, dst_level, orig_info[1],
+  PIPE_FORMAT_R8G8_UNORM);
+   break;
case 4:
r600_change_format(src, src_level, orig_info[0],
   PIPE_FORMAT_R8G8B8A8_UNORM);


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support

2013-03-27 Thread Alex Deucher
On Wed, Mar 27, 2013 at 7:25 AM, Michel Dänzer mic...@daenzer.net wrote:
 On Mit, 2013-03-27 at 12:02 +0100, Christian König wrote:
 Am 26.03.2013 18:03, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 17:37 +0100, Christian König wrote:
  Am 26.03.2013 15:56, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 14:51 +0100, Christian König wrote:
  From: Christian König christian.koe...@amd.com
 
  Signed-off-by: Christian König christian.koe...@amd.com
  [...]
  diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
  b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  index 9dae742..e09f297 100644
  --- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
  +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  @@ -111,13 +111,18 @@ struct si_shader {
   unsignednr_cbufs;
 };
 
  -struct si_shader_key {
  -unsignedexport_16bpc:8;
  -unsignednr_cbufs:4;
  -unsignedcolor_two_side:1;
  -unsignedalpha_func:3;
  -unsignedflatshade:1;
  -float   alpha_ref;
  +union si_shader_key {
  +struct {
  +unsignedexport_16bpc:8;
  +unsignednr_cbufs:4;
  +unsignedcolor_two_side:1;
  +unsignedalpha_func:3;
  +unsignedflatshade:1;
  +float   alpha_ref;
  +} ps;
  +struct {
  +unsignedinstance_divisors[PIPE_MAX_ATTRIBS];
  +} vs;
 };
  This grows the shader key from 8 to 128 bytes. I don't suppose the
  instance divisors could be encoded in a more compact way? E.g. loading
  the divisor values from constants and only tracking which elements use a
  divisor in a bitmask in the key.
  Considered that also, and I have two problems with that approach:
  1. While immediates are converted to shifts  muls, dividing even by a
  constant in the shader isn't cheap.
  Is that really significant? How much work would it be to come up with a
  worst case test and measure the difference?

 Well no idea how to measure that on SI,

 I'd guess something like: With otherwise trivial vertex and pixel
 shaders, draw huge numbers of triangles generating one pixel each, and
 measure how long it takes.


 but when I implemented the same feature on R600 the difference between
 using reciprocal and mul compared to mulhi where quite significant.

 I don't see anything about this in the r600g shader key though. What's
 the difference?


  How about storing only a byte for the instance_divisor? That limit's the
  divisor to a modulo of 256, but I don't think that would be so extremly 
  bad.
  I have no idea what the impact of that would be. What happens if an app
  tries to use a divisor = 256?

 It probably would select the wrong shader :(

  That would reduce the key to 32 bytes instead.
  Still seems kind of big.

 Ok how about the following compromise: First we use a short for the
 instance divisor, that makes the key 32 bytes in size and should leave
 enough room for larger instance divisors, and second we don't copy the
 key around so much any more.

 This still won't work correctly for some legal divisor values, right? So
 I'm afraid this is a flawed compromise, as it doesn't really address the
 key size issues (potentially huge number of shader variants, cycles
 spent in memcmp, memory usage) either. If those can't be addressed, it
 should at least handle all legal values correctly.

We use the large keys now and look into runtime shader patching for
certain values as an improvement later on.

Alex
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support

2013-03-27 Thread Marek Olšák
On Wed, Mar 27, 2013 at 1:43 PM, Alex Deucher alexdeuc...@gmail.com wrote:
 On Wed, Mar 27, 2013 at 7:25 AM, Michel Dänzer mic...@daenzer.net wrote:
 On Mit, 2013-03-27 at 12:02 +0100, Christian König wrote:
 Am 26.03.2013 18:03, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 17:37 +0100, Christian König wrote:
  Am 26.03.2013 15:56, schrieb Michel Dänzer:
  On Die, 2013-03-26 at 14:51 +0100, Christian König wrote:
  From: Christian König christian.koe...@amd.com
 
  Signed-off-by: Christian König christian.koe...@amd.com
  [...]
  diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
  b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  index 9dae742..e09f297 100644
  --- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
  +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
  @@ -111,13 +111,18 @@ struct si_shader {
   unsignednr_cbufs;
 };
 
  -struct si_shader_key {
  -unsignedexport_16bpc:8;
  -unsignednr_cbufs:4;
  -unsignedcolor_two_side:1;
  -unsignedalpha_func:3;
  -unsignedflatshade:1;
  -float   alpha_ref;
  +union si_shader_key {
  +struct {
  +unsignedexport_16bpc:8;
  +unsignednr_cbufs:4;
  +unsignedcolor_two_side:1;
  +unsignedalpha_func:3;
  +unsignedflatshade:1;
  +float   alpha_ref;
  +} ps;
  +struct {
  +unsignedinstance_divisors[PIPE_MAX_ATTRIBS];
  +} vs;
 };
  This grows the shader key from 8 to 128 bytes. I don't suppose the
  instance divisors could be encoded in a more compact way? E.g. loading
  the divisor values from constants and only tracking which elements use a
  divisor in a bitmask in the key.
  Considered that also, and I have two problems with that approach:
  1. While immediates are converted to shifts  muls, dividing even by a
  constant in the shader isn't cheap.
  Is that really significant? How much work would it be to come up with a
  worst case test and measure the difference?

 Well no idea how to measure that on SI,

 I'd guess something like: With otherwise trivial vertex and pixel
 shaders, draw huge numbers of triangles generating one pixel each, and
 measure how long it takes.


 but when I implemented the same feature on R600 the difference between
 using reciprocal and mul compared to mulhi where quite significant.

 I don't see anything about this in the r600g shader key though. What's
 the difference?


  How about storing only a byte for the instance_divisor? That limit's the
  divisor to a modulo of 256, but I don't think that would be so extremly 
  bad.
  I have no idea what the impact of that would be. What happens if an app
  tries to use a divisor = 256?

 It probably would select the wrong shader :(

  That would reduce the key to 32 bytes instead.
  Still seems kind of big.

 Ok how about the following compromise: First we use a short for the
 instance divisor, that makes the key 32 bytes in size and should leave
 enough room for larger instance divisors, and second we don't copy the
 key around so much any more.

 This still won't work correctly for some legal divisor values, right? So
 I'm afraid this is a flawed compromise, as it doesn't really address the
 key size issues (potentially huge number of shader variants, cycles
 spent in memcmp, memory usage) either. If those can't be addressed, it
 should at least handle all legal values correctly.

 We use the large keys now and look into runtime shader patching for
 certain values as an improvement later on.

Yeah something like that, or we could just put the whole key in a
constant buffer and have the shader figure out what to do = no shader
recompilations. Or we could use indirect subroutines calls (as in
GL_ARB_shader_subroutine) and decide at draw time which subroutine
should be called depending on the key.

Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: add cs tracing v2

2013-03-27 Thread Jerome Glisse
On Wed, Mar 27, 2013 at 4:45 AM, Christian König
deathsim...@vodafone.de wrote:
 Am 27.03.2013 01:43, schrieb Jerome Glisse:

 On Tue, Mar 26, 2013 at 6:45 PM, Dave Airlie airl...@gmail.com wrote:

 correctly). But Marek is quite right that this only counts for state
 objects
 and makes no sense for set_* and draw_* calls (and I'm currently
 thinking
 how to avoid that and can't come up with a proper solution). Anyway
 it's
 definitely not an urgent problem for radeonsi.

 It will be a problem once we actually start caring about performance
 and, most importantly, the CPU overhead of the driver.

 I still think that writing into the command buffers directly (e.g.
 without
 wrapper functions) is a bad idea, because that leads to mixing driver
 logic
 and

 I'm convinced the exact opposite is a bad idea, because it adds
 another layer all commands must go through. A layer which brings no
 advantage. Think about apps which issue 1k-10k draw calls per frame.
 It's obvious that every byte moved around counts and the key to high
 framerate is to do (almost) nothing in the driver. It looks like the
 idea here is to make the driver as slow as possible.

 packet building in r600g. For example just try to figure out how the
 relocation in NOPs work by reading the source (please keep in mind
 that
 one
 of the primary goals why AMD is supporting this driver is to give a
 good
 example code for customers who want to implement that stuff on their
 own
 systems).

 I'm shocked. Sacrificing performance in the name of making the code
 nicer for some customers? Seriously? I thought the plan was to make
 the best graphics driver ever.


 Well, maybe I'm repeating myself: Performance is not a priority, it's
 only
 nice to have!

 Sorry to say so, but if we sacrifice a bit of performance for more code
 readability then that is perfectly ok with me (Don't get me wrong,
 I
 would really prefer to replace the closed source driver today than
 tomorrow,
 it's unfortunately just not what I'm paid for).

 On the other hand, we are talking about perfectly optimizeable inline
 functions and/or macros. All I'm saying is that we should structure
 the
 code a bit more.

 It's okay to take steps in the right direction, but if you start taking
 steps away
 from performance in lieu of code readability then please be prepared
 to deal with
 objections.

 The thing is in a lot of cases, code readability is in the eye of the
 beholder, I'm sure
 Jerome thought r600g was perfectly readable when he wrote it, but a lot
 of us didn't
 and spent a lot of time trying to remove the CPU overheads, not least
 the amount of
 time Marek spent. The thing is performance is measurable, code
 readability isn't.

 Dave.

 Maybe once again you forgot why I did things the way I did them. I
 explained myself to you back then: I designed r600g for a new kernel
 API which was violently different from the cs one. My hope was that
 the other kernel API would be better; it was not, and I never pushed
 more on that front. So the r600g design was definitely not adapted to the
 cs ioctl and was not designed with it in mind. History often explains a
 lot of things and people seem to forget about them.

 That being said, I too find the code readability argument ironic: if
 one understands the cs ioctl then the r600g code as it is nowadays makes
 sense, but the radeonsi code is closer to what r600g used to be. So
 assuming the same ioctl I would say that radeonsi should move towards what
 r600g is nowadays.

 Anyway just wanted to set history straight.


 Well I think you hit the point here quite well, may I ask what your kernel
 interface would have looked like?

 Christian.

I used to have a branch on fdo with it. Basically what used to be
r600_hw_context was a nop in gallium and you had state in the kernel (cb,
db, sampler view, sampler, ...) and you created them and then bound
them, so everything was mostly security checks at creation time and
bind time was pretty quick; it was also transaction based. Relocation
was easier too. Anyway it was a bad API. I know that in a closed world
or a more obscure stack you can have a kernel API that doesn't do many
security checks and call it a day, which gives you a lot more freedom on
the API.

Cheers,
Jerome
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/6] draw/gs: Fetch more than one primitive per invocation

2013-03-27 Thread Brian Paul

On 03/26/2013 10:38 AM, Zack Rusin wrote:

Allows executing gs on up to 4 primitives at a time. Will also be
required by the llvm code because there we definitely don't want
to flush with just a single primitive.

Signed-off-by: Zack Rusinza...@vmware.com
---
  src/gallium/auxiliary/draw/draw_gs.c |   54 ++
  src/gallium/auxiliary/draw/draw_gs.h |1 +
  2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_gs.c 
b/src/gallium/auxiliary/draw/draw_gs.c
index 81d9140..2b50c9c 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -58,6 +58,12 @@ draw_gs_get_input_index(int semantic, int index,
 return -1;
  }



Could you move some of the commit msg info to a comment on this 
function to explain what's going on?



+static INLINE boolean
+draw_gs_should_flush(struct draw_geometry_shader *shader)
+{
+   return (shader-fetched_prim_count == 4);
+}
+
  /*#define DEBUG_OUTPUTS 1*/
  static void
  tgsi_fetch_gs_outputs(struct draw_geometry_shader *shader,


-Brian
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/6] gallium/llvm: implement geometry shaders in the llvm paths

2013-03-27 Thread Brian Paul

On 03/26/2013 10:38 AM, Zack Rusin wrote:

This commit implements code generation of the geometry shaders in
the SOA paths. All the code is there but bugs are likely present.

Signed-off-by: Zack Rusinza...@vmware.com
---
  src/gallium/auxiliary/draw/draw_context.c  |   17 +-
  src/gallium/auxiliary/draw/draw_context.h  |5 +
  src/gallium/auxiliary/draw/draw_gs.c   |  297 +-
  src/gallium/auxiliary/draw/draw_gs.h   |   26 +-
  src/gallium/auxiliary/draw/draw_llvm.c |  567 ++--
  src/gallium/auxiliary/draw/draw_llvm.h |  161 +-
  .../draw/draw_pt_fetch_shade_pipeline_llvm.c   |   75 +++
  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h|   27 +-
  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c|  173 +-
  src/gallium/drivers/llvmpipe/lp_state_fs.c |4 +-
  10 files changed, 1273 insertions(+), 79 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 6b70ac8..d64b82b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -46,7 +46,7 @@
  #include gallivm/lp_bld_limits.h
  #include draw_llvm.h

-static boolean
+boolean
  draw_get_option_use_llvm(void)
  {
 static boolean first = TRUE;
@@ -808,16 +808,15 @@ draw_set_mapped_texture(struct draw_context *draw,
  uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
  uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS])
  {
-   if (shader_stage == PIPE_SHADER_VERTEX) {
  #ifdef HAVE_LLVM
-  if (draw-llvm)
- draw_llvm_set_mapped_texture(draw,
-  sview_idx,
-  width, height, depth, first_level,
-  last_level, base_ptr,
-  row_stride, img_stride, mip_offsets);
+   if (draw-llvm)
+  draw_llvm_set_mapped_texture(draw,
+   shader_stage,
+   sview_idx,
+   width, height, depth, first_level,
+   last_level, base_ptr,
+   row_stride, img_stride, mip_offsets);
  #endif
-   }
  }

  /**
diff --git a/src/gallium/auxiliary/draw/draw_context.h 
b/src/gallium/auxiliary/draw/draw_context.h
index 18c8595..369f6c8 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -282,4 +282,9 @@ draw_get_shader_param(unsigned shader, enum pipe_shader_cap 
param);
  int
  draw_get_shader_param_no_llvm(unsigned shader, enum pipe_shader_cap param);

+#ifdef HAVE_LLVM
+boolean
+draw_get_option_use_llvm(void);
+#endif
+
  #endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_gs.c 
b/src/gallium/auxiliary/draw/draw_gs.c
index 2b50c9c..c1e1f56 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -29,6 +29,9 @@

  #include draw_private.h
  #include draw_context.h
+#ifdef HAVE_LLVM
+#include draw_llvm.h
+#endif

  #include tgsi/tgsi_parse.h
  #include tgsi/tgsi_exec.h
@@ -61,7 +64,7 @@ draw_gs_get_input_index(int semantic, int index,
  static INLINE boolean
  draw_gs_should_flush(struct draw_geometry_shader *shader)
  {
-   return (shader-fetched_prim_count == 4);
+   return (shader-fetched_prim_count == shader-vector_length);
  }

  /*#define DEBUG_OUTPUTS 1*/
@@ -176,7 +179,7 @@ static void tgsi_fetch_gs_input(struct draw_geometry_shader 
*shader,
  }

  static void tgsi_gs_prepare(struct draw_geometry_shader *shader,
-const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
  const unsigned 
constants_size[PIPE_MAX_CONSTANT_BUFFERS])
  {
 struct tgsi_exec_machine *machine = shader-machine;
@@ -199,10 +202,148 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader 
*shader,
 /* run interpreter */
 tgsi_exec_machine_run(machine);

-   return
+   return

machine-Temps[TGSI_EXEC_TEMP_PRIMITIVE_I].xyzw[TGSI_EXEC_TEMP_PRIMITIVE_C].u[0];
  }

+#ifdef HAVE_LLVM
+
+static void
+llvm_fetch_gs_input(struct draw_geometry_shader *shader,
+unsigned *indices,
+unsigned num_vertices,
+unsigned prim_idx)
+{
+   unsigned slot, vs_slot, i;
+   unsigned input_vertex_stride = shader-input_vertex_stride;
+   const float (*input_ptr)[4];
+   float (*input_data)[6][PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS][TGSI_NUM_CHANNELS] 
=shader-gs_input-data;
+
+   input_ptr = shader-input;
+
+   for (i = 0; i  num_vertices; ++i) {
+  const float (*input)[4];
+#if DEBUG_INPUTS
+  debug_printf(%d) vertex index = %d (prim idx = %d)\n,
+   i, indices[i], prim_idx);
+#endif
+  input = (const float 

Re: [Mesa-dev] [PATCH 6/6] gallivm: implement breakc and implicit primitive flushing

2013-03-27 Thread Brian Paul

On 03/26/2013 10:38 AM, Zack Rusin wrote:

we were missing implementation of the breakc instruction and our
TGSI semantics currently require an implicit endprim at the end
of GS if none is present - this implements both.


Maybe I'm dense, but off-hand I don't see the relationship between 
endprim and breakc.  Can you elaborate?





Signed-off-by: Zack Rusinza...@vmware.com
---
  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h|6 
  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c |1 +
  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c|   38 
  3 files changed, 45 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 4c6456e..4acc592 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -392,6 +392,12 @@ struct lp_build_tgsi_soa_context
 LLVMValueRef emitted_prims_vec;
 LLVMValueRef total_emitted_vertices_vec;
 LLVMValueRef emitted_vertices_vec;
+   /* if a shader doesn't have ENDPRIM instruction but it has
+* a number of EMIT instructions it means the END instruction
+* implicitly invokes ENDPRIM. handle this via a flag here
+* in the future maybe we can enforce TGSI to always have
+* an explicit ENDPRIM */
+   boolean pending_end_primitive;

 LLVMValueRef consts_ptr;
 const LLVMValueRef *pos;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 41ddd99..55bb8e3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -867,6 +867,7 @@ lp_set_default_actions(struct lp_build_tgsi_context * 
bld_base)
 bld_base-op_actions[TGSI_OPCODE_COS].fetch_args = scalar_unary_fetch_args;
 bld_base-op_actions[TGSI_OPCODE_EX2].fetch_args = scalar_unary_fetch_args;
 bld_base-op_actions[TGSI_OPCODE_IF].fetch_args = scalar_unary_fetch_args;
+   bld_base-op_actions[TGSI_OPCODE_BREAKC].fetch_args = 
scalar_unary_fetch_args;
 bld_base-op_actions[TGSI_OPCODE_KIL].fetch_args = kil_fetch_args;
 bld_base-op_actions[TGSI_OPCODE_KILP].fetch_args = kilp_fetch_args;
 bld_base-op_actions[TGSI_OPCODE_RCP].fetch_args = scalar_unary_fetch_args;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 95633ab..36e49ac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -213,6 +213,23 @@ static void lp_exec_break(struct lp_exec_mask *mask)
 lp_exec_mask_update(mask);
  }

+
+static void lp_exec_break_condition(struct lp_exec_mask *mask, LLVMValueRef 
cond)
+{
+   LLVMBuilderRef builder = mask-bld-gallivm-builder;
+   LLVMValueRef exec_mask = LLVMBuildNot(builder,
+ mask-exec_mask,
+ break);
+
+   exec_mask = LLVMBuildAnd(builder, exec_mask, cond, );
+
+   mask-break_mask = LLVMBuildAnd(builder,
+   mask-break_mask,
+   exec_mask, break_full);
+
+   lp_exec_mask_update(mask);
+}
+
  static void lp_exec_continue(struct lp_exec_mask *mask)
  {
 LLVMBuilderRef builder = mask-bld-gallivm-builder;
@@ -2190,6 +2207,7 @@ emit_vertex(
   LLVMBuildAdd(builder, bld-emitted_vertices_vec, masked_ones, );
bld-total_emitted_vertices_vec =
   LLVMBuildAdd(builder, bld-total_emitted_vertices_vec, masked_ones, 
);
+  bld-pending_end_primitive = TRUE;
 }
  }

@@ -2212,6 +2230,7 @@ end_primitive(
bld-emitted_prims_vec =
   LLVMBuildAdd(builder, bld-emitted_prims_vec, masked_ones, );
bld-emitted_vertices_vec = bld_base-uint_bld.zero;
+  bld-pending_end_primitive = FALSE;
 }
  }

@@ -2250,6 +2269,17 @@ brk_emit(
  }

  static void
+breakc_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   lp_exec_break_condition(bld-exec_mask, emit_data-args[0]);
+}
+
+static void
  if_emit(
 const struct lp_build_tgsi_action * action,
 struct lp_build_tgsi_context * bld_base,
@@ -2504,6 +2534,12 @@ static void emit_epilogue(struct lp_build_tgsi_context * 
bld_base)
 /* If we have indirect addressing in outputs we need to copy our alloca 
array
  * to the outputs slots specified by the caller */
 if (bld-gs_args) {
+  /* flush the accumulated vertices as a primitive */
+  if (bld-pending_end_primitive) {
+ end_primitive(NULL, bld_base, NULL);
+ bld-pending_end_primitive = FALSE;
+  }
+
bld-gs_args-gs_epilogue(bld-bld_base,
  bld-total_emitted_vertices_vec,
  bld-emitted_prims_vec,
@@ -2572,6 

Re: [Mesa-dev] [PATCH 3/3] llvmpipe/draw: Fix texture sampling in geometry shaders

2013-03-27 Thread Brian Paul

On 03/26/2013 06:56 PM, Zack Rusin wrote:

We weren't correctly propagating the samplers and sampler views
when they were related to geometry shaders.

Signed-off-by: Zack Rusinza...@vmware.com
---
  src/gallium/auxiliary/draw/draw_context.c   |4 +-
  src/gallium/auxiliary/draw/draw_llvm.c  |   83 ---
  src/gallium/auxiliary/draw/draw_llvm.h  |   31 +++---
  src/gallium/drivers/llvmpipe/lp_context.c   |4 +
  src/gallium/drivers/llvmpipe/lp_context.h   |1 +
  src/gallium/drivers/llvmpipe/lp_draw_arrays.c   |4 +
  src/gallium/drivers/llvmpipe/lp_state.h |8 ++
  src/gallium/drivers/llvmpipe/lp_state_sampler.c |  127 +++
  8 files changed, 205 insertions(+), 57 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index d64b82b..ceb74df 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -792,8 +792,8 @@ draw_set_samplers(struct draw_context *draw,
 draw-num_samplers[shader_stage] = num;

  #ifdef HAVE_LLVM
-   if (draw-llvm  shader_stage == PIPE_SHADER_VERTEX)
-  draw_llvm_set_sampler_state(draw);
+   if (draw-llvm)
+  draw_llvm_set_sampler_state(draw, shader_stage);
  #endif
  }

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index f857183..3e47452 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -249,17 +249,17 @@ create_gs_jit_context_type(struct gallivm_state *gallivm,
 elem_types[1] = LLVMPointerType(LLVMArrayType(LLVMArrayType(float_type, 4),
   DRAW_TOTAL_CLIP_PLANES), 0);
 elem_types[2] = LLVMPointerType(float_type, 0); /* viewport */
-
-   elem_types[3] = LLVMPointerType(LLVMPointerType(int_type, 0), 0);
-   elem_types[4] = LLVMPointerType(LLVMVectorType(int_type,
-  vector_length), 0);
-   elem_types[5] = LLVMPointerType(LLVMVectorType(int_type,
-  vector_length), 0);

-   elem_types[6] = LLVMArrayType(texture_type,
+   elem_types[3] = LLVMArrayType(texture_type,
   PIPE_MAX_SHADER_SAMPLER_VIEWS); /* textures 
*/
-   elem_types[7] = LLVMArrayType(sampler_type,
+   elem_types[4] = LLVMArrayType(sampler_type,
   PIPE_MAX_SAMPLERS); /* samplers */
+
+   elem_types[5] = LLVMPointerType(LLVMPointerType(int_type, 0), 0);
+   elem_types[6] = LLVMPointerType(LLVMVectorType(int_type,
+  vector_length), 0);
+   elem_types[7] = LLVMPointerType(LLVMVectorType(int_type,
+  vector_length), 0);

 context_type = LLVMStructTypeInContext(gallivm-context, elem_types,
Elements(elem_types), 0);
@@ -275,18 +275,18 @@ create_gs_jit_context_type(struct gallivm_state *gallivm,
target, context_type, 1);
 LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, viewport,
target, context_type, 2);
-   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, prim_lengths,
-  target, context_type, 3);
-   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, emitted_vertices,
-  target, context_type, 4);
-   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, emitted_prims,
-  target, context_type, 5);
 LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, textures,
target, context_type,
-  DRAW_GS_JIT_CTX_TEXTURES);
+  DRAW_JIT_CTX_TEXTURES);
 LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, samplers,
target, context_type,
-  DRAW_GS_JIT_CTX_SAMPLERS);
+  DRAW_JIT_CTX_SAMPLERS);
+   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, prim_lengths,
+  target, context_type, 5);
+   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, emitted_vertices,
+  target, context_type, 6);
+   LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, emitted_prims,
+  target, context_type, 7);
 LP_CHECK_STRUCT_SIZE(struct draw_gs_jit_context,
  target, context_type);

@@ -1721,33 +1721,36 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,


  void
-draw_llvm_set_sampler_state(struct draw_context *draw)
+draw_llvm_set_sampler_state(struct draw_context *draw,
+unsigned shader_type)
  {
 unsigned i;

-   for (i = 0; i  draw-num_samplers[PIPE_SHADER_VERTEX]; i++) {
-  struct draw_jit_sampler *jit_sam =draw-llvm-jit_context.samplers[i];
-
-  if (draw-samplers[i]) {
- 

[Mesa-dev] [PATCH] winsys/radeon: add command stream replay dump for faulty lockup

2013-03-27 Thread j . glisse
From: Jerome Glisse jgli...@redhat.com

Build time option, set RADEON_CS_DUMP_ON_LOCKUP to 1 in radeon_drm_cs.h to
enable it.

When enabled, after each cs submission the code will try to detect a lockup by
waiting for one of the buffers of the cs to become idle. After a timeout it
will consider that the cs triggered a lockup and will write a radeon_lockup.c
file in the current directory that has all the information for replaying the cs.

To build this file :
gcc -O0 -g radeon_lockup.c -ldrm -o radeon_lockup -I/usr/include/libdrm

Signed-off-by: Jerome Glisse jgli...@redhat.com
---
 src/gallium/winsys/radeon/drm/Makefile.sources |   1 +
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c  |  80 ++--
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h  |   2 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c  |   4 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h  |   6 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c | 135 +
 6 files changed, 191 insertions(+), 37 deletions(-)
 create mode 100644 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c

diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources 
b/src/gallium/winsys/radeon/drm/Makefile.sources
index 1d18d61..4ca5ebb 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.sources
+++ b/src/gallium/winsys/radeon/drm/Makefile.sources
@@ -1,4 +1,5 @@
 C_SOURCES := \
radeon_drm_bo.c \
radeon_drm_cs.c \
+   radeon_drm_cs_dump.c \
radeon_drm_winsys.c
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index f4ac526..5a9493a 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -391,14 +391,54 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
 FREE(bo);
 }
 
+void *radeon_bo_do_map(struct radeon_bo *bo)
+{
+struct drm_radeon_gem_mmap args = {0};
+void *ptr;
+
+/* Return the pointer if it's already mapped. */
+if (bo-ptr)
+return bo-ptr;
+
+/* Map the buffer. */
+pipe_mutex_lock(bo-map_mutex);
+/* Return the pointer if it's already mapped (in case of a race). */
+if (bo-ptr) {
+pipe_mutex_unlock(bo-map_mutex);
+return bo-ptr;
+}
+args.handle = bo-handle;
+args.offset = 0;
+args.size = (uint64_t)bo-base.size;
+if (drmCommandWriteRead(bo-rws-fd,
+DRM_RADEON_GEM_MMAP,
+args,
+sizeof(args))) {
+pipe_mutex_unlock(bo-map_mutex);
+fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
+bo, bo-handle);
+return NULL;
+}
+
+ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
+   bo-rws-fd, args.addr_ptr);
+if (ptr == MAP_FAILED) {
+pipe_mutex_unlock(bo-map_mutex);
+fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
+return NULL;
+}
+bo-ptr = ptr;
+pipe_mutex_unlock(bo-map_mutex);
+
+return bo-ptr;
+}
+
 static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
struct radeon_winsys_cs *rcs,
enum pipe_transfer_usage usage)
 {
 struct radeon_bo *bo = (struct radeon_bo*)buf;
 struct radeon_drm_cs *cs = (struct radeon_drm_cs*)rcs;
-struct drm_radeon_gem_mmap args = {0};
-void *ptr;
 
 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
 if (!(usage  PIPE_TRANSFER_UNSYNCHRONIZED)) {
@@ -461,41 +501,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle 
*buf,
 }
 }
 
-/* Return the pointer if it's already mapped. */
-if (bo-ptr)
-return bo-ptr;
-
-/* Map the buffer. */
-pipe_mutex_lock(bo-map_mutex);
-/* Return the pointer if it's already mapped (in case of a race). */
-if (bo-ptr) {
-pipe_mutex_unlock(bo-map_mutex);
-return bo-ptr;
-}
-args.handle = bo-handle;
-args.offset = 0;
-args.size = (uint64_t)bo-base.size;
-if (drmCommandWriteRead(bo-rws-fd,
-DRM_RADEON_GEM_MMAP,
-args,
-sizeof(args))) {
-pipe_mutex_unlock(bo-map_mutex);
-fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
-bo, bo-handle);
-return NULL;
-}
-
-ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
-   bo-rws-fd, args.addr_ptr);
-if (ptr == MAP_FAILED) {
-pipe_mutex_unlock(bo-map_mutex);
-fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
-return NULL;
-}
-bo-ptr = ptr;
-pipe_mutex_unlock(bo-map_mutex);
-
-return bo-ptr;
+return radeon_bo_do_map(bo);
 }
 
 static void radeon_bo_unmap(struct radeon_winsys_cs_handle *_buf)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h 
b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index 

[Mesa-dev] [PATCH 3/4] radeonsi: add start instance support

2013-03-27 Thread Christian König
From: Christian König christian.koe...@amd.com

This works differently than on R600; we need to add the start instance manually.

Signed-off-by: Christian König christian.koe...@amd.com
Reviewed-by: Michel Dänzer michel.daen...@amd.com
Tested-by: Michel Dänzer michel.daen...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   13 -
 src/gallium/drivers/radeonsi/radeonsi_shader.h |   12 +++-
 src/gallium/drivers/radeonsi/si_state_draw.c   |6 ++
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 62f478e..0512528 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -351,11 +351,15 @@ static void declare_system_value(
unsigned index,
const struct tgsi_full_declaration *decl)
 {
+   struct gallivm_state * gallivm = radeon_bld-soa.bld_base.base.gallivm;
+
LLVMValueRef value = 0;
 
switch (decl-Semantic.Name) {
case TGSI_SEMANTIC_INSTANCEID:
value = LLVMGetParam(radeon_bld-main_fn, SI_PARAM_INSTANCE_ID);
+   value = LLVMBuildAdd(gallivm-builder, value,
+   LLVMGetParam(radeon_bld-main_fn, 
SI_PARAM_START_INSTANCE), );
break;
 
case TGSI_SEMANTIC_VERTEXID:
@@ -963,11 +967,12 @@ static void create_function(struct si_shader_context 
*si_shader_ctx)
 
if (si_shader_ctx-type == TGSI_PROCESSOR_VERTEX) {
params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_SAMPLER];
+   params[SI_PARAM_START_INSTANCE] = i32;
params[SI_PARAM_VERTEX_ID] = i32;
params[SI_PARAM_DUMMY_0] = i32;
params[SI_PARAM_DUMMY_1] = i32;
params[SI_PARAM_INSTANCE_ID] = i32;
-   radeon_llvm_create_func(si_shader_ctx-radeon_bld, params, 8);
+   radeon_llvm_create_func(si_shader_ctx-radeon_bld, params, 9);
 
} else {
params[SI_PARAM_PRIM_MASK] = i32;
@@ -995,6 +1000,12 @@ static void create_function(struct si_shader_context 
*si_shader_ctx)
LLVMValueRef P = 
LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, i);
LLVMAddAttribute(P, LLVMInRegAttribute);
}
+
+   if (si_shader_ctx-type == TGSI_PROCESSOR_VERTEX) {
+   LLVMValueRef P = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn,
+ SI_PARAM_START_INSTANCE);
+   LLVMAddAttribute(P, LLVMInRegAttribute);
+   }
 }
 
 static void preload_constants(struct si_shader_context *si_shader_ctx)
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
b/src/gallium/drivers/radeonsi/radeonsi_shader.h
index 8f5efd0..9dae742 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -33,8 +33,9 @@
 #define SI_SGPR_SAMPLER2
 #define SI_SGPR_RESOURCE   4
 #define SI_SGPR_VERTEX_BUFFER  6
+#define SI_SGPR_START_INSTANCE 8
 
-#define SI_VS_NUM_USER_SGPR8
+#define SI_VS_NUM_USER_SGPR9
 #define SI_PS_NUM_USER_SGPR6
 
 /* LLVM function parameter indices */
@@ -44,10 +45,11 @@
 
 /* VS only parameters */
 #define SI_PARAM_VERTEX_BUFFER 3
-#define SI_PARAM_VERTEX_ID 4
-#define SI_PARAM_DUMMY_0   5
-#define SI_PARAM_DUMMY_1   6
-#define SI_PARAM_INSTANCE_ID   7
+#define SI_PARAM_START_INSTANCE4
+#define SI_PARAM_VERTEX_ID 5
+#define SI_PARAM_DUMMY_0   6
+#define SI_PARAM_DUMMY_1   7
+#define SI_PARAM_INSTANCE_ID   8
 
 /* PS only parameters */
 #define SI_PARAM_PRIM_MASK 3
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 91bcdf8..383d2a0 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -279,10 +279,8 @@ static bool si_update_draw_info_state(struct r600_context 
*rctx,
   info-indexed ? info-index_bias : info-start);
si_pm4_set_reg(pm4, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, 
info-restart_index);
si_pm4_set_reg(pm4, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 
info-primitive_restart);
-#if 0
-   si_pm4_set_reg(pm4, R_03CFF0_SQ_VTX_BASE_VTX_LOC, 0);
-   si_pm4_set_reg(pm4, R_03CFF4_SQ_VTX_START_INST_LOC, 
info-start_instance);
-#endif
+   si_pm4_set_reg(pm4, R_00B130_SPI_SHADER_USER_DATA_VS_0 + 
SI_SGPR_START_INSTANCE * 4,
+  info-start_instance);
 
 if (prim == V_008958_DI_PT_LINELIST)
 ls_mask = 1;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] radeon/llvm: move system value fetching to common code

2013-03-27 Thread Christian König
From: Christian König christian.koe...@amd.com

This should be used by both SI and R600.

Signed-off-by: Christian König christian.koe...@amd.com
Reviewed-by: Michel Dänzer michel.daen...@amd.com
Tested-by: Michel Dänzer michel.daen...@amd.com
---
 src/gallium/drivers/r600/r600_llvm.c   |   12 
 .../drivers/radeon/radeon_setup_tgsi_llvm.c|   12 
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c 
b/src/gallium/drivers/r600/r600_llvm.c
index 6e6edb7..81a60c2 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -72,17 +72,6 @@ static void llvm_load_system_value(
LLVMReadNoneAttribute);
 }
 
-static LLVMValueRef llvm_fetch_system_value(
-   struct lp_build_tgsi_context * bld_base,
-   const struct tgsi_full_src_register *reg,
-   enum tgsi_opcode_type type,
-   unsigned swizzle)
-{
-   struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-   LLVMValueRef cval = ctx-system_values[reg-Register.Index];
-   return bitcast(bld_base, type, cval);
-}
-
 static LLVMValueRef
 llvm_load_input_helper(
struct radeon_llvm_context * ctx,
@@ -529,7 +518,6 @@ LLVMModuleRef r600_tgsi_llvm(
bld_base-info = shader_info;
bld_base-userdata = ctx;
bld_base-emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
-   bld_base-emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = 
llvm_fetch_system_value;
bld_base-emit_prologue = llvm_emit_prologue;
bld_base-emit_epilogue = llvm_emit_epilogue;
ctx-userdata = ctx;
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index e2a6bee..314c963 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -200,6 +200,17 @@ emit_fetch(
return bitcast(bld_base, type, result);
 }
 
+static LLVMValueRef fetch_system_value(
+   struct lp_build_tgsi_context * bld_base,
+   const struct tgsi_full_src_register *reg,
+   enum tgsi_opcode_type type,
+   unsigned swizzle)
+{
+   struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+   LLVMValueRef cval = ctx-system_values[reg-Register.Index];
+   return bitcast(bld_base, type, cval);
+}
+
 static void emit_declaration(
struct lp_build_tgsi_context * bld_base,
const struct tgsi_full_declaration *decl)
@@ -1153,6 +1164,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context 
* ctx)
bld_base-emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch;
bld_base-emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch;
bld_base-emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch;
+   bld_base-emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value;
 
/* Allocate outputs */
ctx-soa.outputs = ctx-outputs;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] radeonsi: add instanceid support

2013-03-27 Thread Christian König
From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
Reviewed-by: Michel Dänzer michel.daen...@amd.com
Tested-by: Michel Dänzer michel.daen...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_pipe.c   |2 +-
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   35 ++--
 src/gallium/drivers/radeonsi/radeonsi_shader.h |6 +++-
 src/gallium/drivers/radeonsi/si_state_draw.c   |7 +++--
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c 
b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
index 672017a..63ed66b 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
@@ -330,6 +330,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_START_INSTANCE:
case PIPE_CAP_NPOT_TEXTURES:
 case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+   case PIPE_CAP_TGSI_INSTANCEID:
return 1;
case PIPE_CAP_TGSI_TEXCOORD:
return 0;
@@ -344,7 +345,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
return debug_get_bool_option(R600_GLSL130, FALSE) ? 130 : 120;
 
/* Unsupported features. */
-   case PIPE_CAP_TGSI_INSTANCEID:
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
case PIPE_CAP_SCALED_RESOLVE:
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 840537a..62f478e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -141,7 +141,7 @@ static void declare_input_vs(
 
/* Load the buffer index, which is always stored in VGPR0
 * for Vertex Shaders */
-   buffer_index_reg = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_INDEX);
+   buffer_index_reg = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_ID);
 
vec4_type = LLVMVectorType(base-elem_type, 4);
args[0] = t_list;
@@ -346,6 +346,30 @@ static void declare_input(
}
 }
 
+static void declare_system_value(
+   struct radeon_llvm_context * radeon_bld,
+   unsigned index,
+   const struct tgsi_full_declaration *decl)
+{
+   LLVMValueRef value = 0;
+
+   switch (decl-Semantic.Name) {
+   case TGSI_SEMANTIC_INSTANCEID:
+   value = LLVMGetParam(radeon_bld-main_fn, SI_PARAM_INSTANCE_ID);
+   break;
+
+   case TGSI_SEMANTIC_VERTEXID:
+   value = LLVMGetParam(radeon_bld-main_fn, SI_PARAM_VERTEX_ID);
+   break;
+
+   default:
+   assert(!unknown system value);
+   return;
+   }
+
+   radeon_bld-system_values[index] = value;
+}
+
 static LLVMValueRef fetch_constant(
struct lp_build_tgsi_context * bld_base,
const struct tgsi_full_src_register *reg,
@@ -939,8 +963,11 @@ static void create_function(struct si_shader_context 
*si_shader_ctx)
 
if (si_shader_ctx-type == TGSI_PROCESSOR_VERTEX) {
params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_SAMPLER];
-   params[SI_PARAM_VERTEX_INDEX] = i32;
-   radeon_llvm_create_func(si_shader_ctx-radeon_bld, params, 5);
+   params[SI_PARAM_VERTEX_ID] = i32;
+   params[SI_PARAM_DUMMY_0] = i32;
+   params[SI_PARAM_DUMMY_1] = i32;
+   params[SI_PARAM_INSTANCE_ID] = i32;
+   radeon_llvm_create_func(si_shader_ctx-radeon_bld, params, 8);
 
} else {
params[SI_PARAM_PRIM_MASK] = i32;
@@ -1064,6 +1091,7 @@ int si_pipe_shader_create(
 
tgsi_scan_shader(sel-tokens, shader_info);
shader-shader.uses_kill = shader_info.uses_kill;
+   shader-shader.uses_instanceid = shader_info.uses_instanceid;
bld_base-info = shader_info;
bld_base-emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
bld_base-emit_epilogue = si_llvm_emit_epilogue;
@@ -1074,6 +1102,7 @@ int si_pipe_shader_create(
bld_base-op_actions[TGSI_OPCODE_TXP] = tex_action;
 
si_shader_ctx.radeon_bld.load_input = declare_input;
+   si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
si_shader_ctx.tokens = sel-tokens;
tgsi_parse_init(si_shader_ctx.parse, si_shader_ctx.tokens);
si_shader_ctx.shader = shader;
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h 
b/src/gallium/drivers/radeonsi/radeonsi_shader.h
index fe771ce..8f5efd0 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -44,7 +44,10 @@
 
 /* VS only parameters */
 #define SI_PARAM_VERTEX_BUFFER 3
-#define SI_PARAM_VERTEX_INDEX  4
+#define SI_PARAM_VERTEX_ID 4
+#define SI_PARAM_DUMMY_0   5
+#define 

[Mesa-dev] [PATCH 4/4] radeonsi: add instance divisor support v3

2013-03-27 Thread Christian König
From: Christian König christian.koe...@amd.com

v2: reduce key size, don't copy key around too much.
v3: remove key size reduction

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   67 +++-
 src/gallium/drivers/radeonsi/radeonsi_shader.h |   24 +
 src/gallium/drivers/radeonsi/si_state.c|   44 +---
 src/gallium/drivers/radeonsi/si_state_draw.c   |   18 +--
 4 files changed, 94 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 0512528..5fdf46e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -54,11 +54,9 @@
 struct si_shader_context
 {
struct radeon_llvm_context radeon_bld;
-   struct r600_context *rctx;
struct tgsi_parse_context parse;
struct tgsi_token * tokens;
struct si_pipe_shader *shader;
-   struct si_shader_key key;
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
LLVMValueRef const_md;
LLVMValueRef const_resource;
@@ -112,22 +110,41 @@ static LLVMValueRef build_indexed_load(
return result;
 }
 
+static LLVMValueRef get_instance_index(
+   struct radeon_llvm_context * radeon_bld,
+   unsigned divisor)
+{
+   struct gallivm_state * gallivm = radeon_bld-soa.bld_base.base.gallivm;
+
+   LLVMValueRef result = LLVMGetParam(radeon_bld-main_fn, 
SI_PARAM_INSTANCE_ID);
+   result = LLVMBuildAdd(gallivm-builder, result, LLVMGetParam(
+   radeon_bld-main_fn, SI_PARAM_START_INSTANCE), );
+
+   if (divisor  1)
+   result = LLVMBuildUDiv(gallivm-builder, result,
+   lp_build_const_int32(gallivm, divisor), );
+
+   return result;
+}
+
 static void declare_input_vs(
struct si_shader_context * si_shader_ctx,
unsigned input_index,
const struct tgsi_full_declaration *decl)
 {
+   struct lp_build_context * base = 
si_shader_ctx-radeon_bld.soa.bld_base.base;
+   unsigned divisor = 
si_shader_ctx-shader-key.vs.instance_divisors[input_index];
+
+   unsigned chan;
+
LLVMValueRef t_list_ptr;
LLVMValueRef t_offset;
LLVMValueRef t_list;
LLVMValueRef attribute_offset;
-   LLVMValueRef buffer_index_reg;
+   LLVMValueRef buffer_index;
LLVMValueRef args[3];
LLVMTypeRef vec4_type;
LLVMValueRef input;
-   struct lp_build_context * base = 
si_shader_ctx-radeon_bld.soa.bld_base.base;
-   //struct pipe_vertex_element *velem = 
rctx-vertex_elements-elements[input_index];
-   unsigned chan;
 
/* Load the T list */
t_list_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_BUFFER);
@@ -139,14 +156,20 @@ static void declare_input_vs(
/* Build the attribute offset */
attribute_offset = lp_build_const_int32(base-gallivm, 0);
 
-   /* Load the buffer index, which is always stored in VGPR0
-* for Vertex Shaders */
-   buffer_index_reg = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_ID);
+   if (divisor) {
+   /* Build index from instance ID, start instance and divisor */
+   si_shader_ctx-shader-shader.uses_instanceid = true;
+   buffer_index = get_instance_index(si_shader_ctx-radeon_bld, 
divisor);
+   } else {
+   /* Load the buffer index, which is always stored in VGPR0
+* for Vertex Shaders */
+   buffer_index = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_VERTEX_ID);
+   }
 
vec4_type = LLVMVectorType(base-elem_type, 4);
args[0] = t_list;
args[1] = attribute_offset;
-   args[2] = buffer_index_reg;
+   args[2] = buffer_index;
input = build_intrinsic(base-gallivm-builder,
llvm.SI.vs.load.input, vec4_type, args, 3,
LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -239,7 +262,7 @@ static void declare_input_fs(
/* XXX: Handle all possible interpolation modes */
switch (decl-Interp.Interpolate) {
case TGSI_INTERPOLATE_COLOR:
-   if (si_shader_ctx-key.flatshade) {
+   if (si_shader_ctx-shader-key.ps.flatshade) {
interp_param = 0;
} else {
if (decl-Interp.Centroid)
@@ -272,7 +295,7 @@ static void declare_input_fs(
 
/* XXX: Could there be more than TGSI_NUM_CHANNELS (4) ? */
if (decl-Semantic.Name == TGSI_SEMANTIC_COLOR 
-   si_shader_ctx-key.color_two_side) {
+   si_shader_ctx-shader-key.ps.color_two_side) {
LLVMValueRef args[4];
LLVMValueRef face, is_face_positive;
LLVMValueRef back_attr_number =
@@ -351,15 +374,12 @@ static void 

Re: [Mesa-dev] [PATCH] winsys/radeon: add command stream replay dump for faulty lockup

2013-03-27 Thread Jerome Glisse
On Wed, Mar 27, 2013 at 11:27 AM,  j.gli...@gmail.com wrote:
 From: Jerome Glisse jgli...@redhat.com

 Build time option, set RADEON_CS_DUMP_ON_LOCKUP to 1 in radeon_drm_cs.h to
 enable it.

 When enabled after each cs submission the code will try to detect lockup by
 waiting on one of the buffer of the cs to become idle, after a timeout it
 will consider that the cs triggered a lockup and will write a radeon_lockup.c
 file in current directory that have all information for replaying the cs.

 To build this file :
 gcc -O0 -g radeon_lockup.c -ldrm -o radeon_lockup -I/usr/include/libdrm

 Signed-off-by: Jerome Glisse jgli...@redhat.com

Maybe I should add the radeon_ctx.h file to the winsys dir, as you need it
to build radeon_lockup.c; I did not want to printf the whole
helper. For example, you can check radeon_lockup.c and radeon_ctx.h
here:
http://people.freedesktop.org/~glisse/rlockup/

Note this is a radeon SI Verde capture for a 2D tiling case that locks up
(it can sometimes be a hard lockup, so be careful).

Cheers,
Jerome

 ---
  src/gallium/winsys/radeon/drm/Makefile.sources |   1 +
  src/gallium/winsys/radeon/drm/radeon_drm_bo.c  |  80 ++--
  src/gallium/winsys/radeon/drm/radeon_drm_bo.h  |   2 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs.c  |   4 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs.h  |   6 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c | 135 
 +
  6 files changed, 191 insertions(+), 37 deletions(-)
  create mode 100644 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c

 diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources 
 b/src/gallium/winsys/radeon/drm/Makefile.sources
 index 1d18d61..4ca5ebb 100644
 --- a/src/gallium/winsys/radeon/drm/Makefile.sources
 +++ b/src/gallium/winsys/radeon/drm/Makefile.sources
 @@ -1,4 +1,5 @@
  C_SOURCES := \
 radeon_drm_bo.c \
 radeon_drm_cs.c \
 +   radeon_drm_cs_dump.c \
 radeon_drm_winsys.c
 diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 
 b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 index f4ac526..5a9493a 100644
 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 @@ -391,14 +391,54 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
  FREE(bo);
  }

 +void *radeon_bo_do_map(struct radeon_bo *bo)
 +{
 +struct drm_radeon_gem_mmap args = {0};
 +void *ptr;
 +
 +/* Return the pointer if it's already mapped. */
 +if (bo-ptr)
 +return bo-ptr;
 +
 +/* Map the buffer. */
 +pipe_mutex_lock(bo-map_mutex);
 +/* Return the pointer if it's already mapped (in case of a race). */
 +if (bo-ptr) {
 +pipe_mutex_unlock(bo-map_mutex);
 +return bo-ptr;
 +}
 +args.handle = bo-handle;
 +args.offset = 0;
 +args.size = (uint64_t)bo-base.size;
 +if (drmCommandWriteRead(bo-rws-fd,
 +DRM_RADEON_GEM_MMAP,
 +args,
 +sizeof(args))) {
 +pipe_mutex_unlock(bo-map_mutex);
 +fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
 +bo, bo-handle);
 +return NULL;
 +}
 +
 +ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
 +   bo-rws-fd, args.addr_ptr);
 +if (ptr == MAP_FAILED) {
 +pipe_mutex_unlock(bo-map_mutex);
 +fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
 +return NULL;
 +}
 +bo-ptr = ptr;
 +pipe_mutex_unlock(bo-map_mutex);
 +
 +return bo-ptr;
 +}
 +
  static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
 struct radeon_winsys_cs *rcs,
 enum pipe_transfer_usage usage)
  {
  struct radeon_bo *bo = (struct radeon_bo*)buf;
  struct radeon_drm_cs *cs = (struct radeon_drm_cs*)rcs;
 -struct drm_radeon_gem_mmap args = {0};
 -void *ptr;

  /* If it's not unsynchronized bo_map, flush CS if needed and then wait. 
 */
  if (!(usage  PIPE_TRANSFER_UNSYNCHRONIZED)) {
 @@ -461,41 +501,7 @@ static void *radeon_bo_map(struct 
 radeon_winsys_cs_handle *buf,
  }
  }

 -/* Return the pointer if it's already mapped. */
 -if (bo-ptr)
 -return bo-ptr;
 -
 -/* Map the buffer. */
 -pipe_mutex_lock(bo-map_mutex);
 -/* Return the pointer if it's already mapped (in case of a race). */
 -if (bo-ptr) {
 -pipe_mutex_unlock(bo-map_mutex);
 -return bo-ptr;
 -}
 -args.handle = bo-handle;
 -args.offset = 0;
 -args.size = (uint64_t)bo-base.size;
 -if (drmCommandWriteRead(bo-rws-fd,
 -DRM_RADEON_GEM_MMAP,
 -args,
 -sizeof(args))) {
 -pipe_mutex_unlock(bo-map_mutex);
 -fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
 -bo, bo-handle);
 -return NULL;
 -}
 -
 -ptr = os_mmap(0, 

Re: [Mesa-dev] [PATCH] winsys/radeon: add command stream replay dump for faulty lockup

2013-03-27 Thread Marek Olšák
On Wed, Mar 27, 2013 at 4:38 PM, Jerome Glisse j.gli...@gmail.com wrote:
 On Wed, Mar 27, 2013 at 11:27 AM,  j.gli...@gmail.com wrote:
 From: Jerome Glisse jgli...@redhat.com

 Build time option, set RADEON_CS_DUMP_ON_LOCKUP to 1 in radeon_drm_cs.h to
 enable it.

 When enabled after each cs submission the code will try to detect lockup by
 waiting on one of the buffer of the cs to become idle, after a timeout it
 will consider that the cs triggered a lockup and will write a radeon_lockup.c
 file in current directory that have all information for replaying the cs.

 To build this file :
 gcc -O0 -g radeon_lockup.c -ldrm -o radeon_lockup -I/usr/include/libdrm

 Signed-off-by: Jerome Glisse jgli...@redhat.com

 Maybe i should add the radeon_ctx.h file to winsys dir as you need it
 to build the radeon_lockup.c i did not wanted to printf the whole
 helper. For example you can check radeon_lockup.c and radeon_ctx.h
 here :
 http://people.freedesktop.org/~glisse/rlockup/

It would be better to put the file in a separate directory (maybe a
subdirectory in the winsys) and add a note that it's supposed to be
used by out-of-tree code.

Otherwise it looks good.

Marek


 Note this is a radeon si verde capture for a 2d tiling that lockup
 (can be hard lockup sometimes so be careful).

 Cheers,
 Jerome

 ---
  src/gallium/winsys/radeon/drm/Makefile.sources |   1 +
  src/gallium/winsys/radeon/drm/radeon_drm_bo.c  |  80 ++--
  src/gallium/winsys/radeon/drm/radeon_drm_bo.h  |   2 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs.c  |   4 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs.h  |   6 +
  src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c | 135 
 +
  6 files changed, 191 insertions(+), 37 deletions(-)
  create mode 100644 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c

 diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources 
 b/src/gallium/winsys/radeon/drm/Makefile.sources
 index 1d18d61..4ca5ebb 100644
 --- a/src/gallium/winsys/radeon/drm/Makefile.sources
 +++ b/src/gallium/winsys/radeon/drm/Makefile.sources
 @@ -1,4 +1,5 @@
  C_SOURCES := \
 radeon_drm_bo.c \
 radeon_drm_cs.c \
 +   radeon_drm_cs_dump.c \
 radeon_drm_winsys.c
 diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 
 b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 index f4ac526..5a9493a 100644
 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
 @@ -391,14 +391,54 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
  FREE(bo);
  }

 +void *radeon_bo_do_map(struct radeon_bo *bo)
 +{
 +struct drm_radeon_gem_mmap args = {0};
 +void *ptr;
 +
 +/* Return the pointer if it's already mapped. */
 +if (bo-ptr)
 +return bo-ptr;
 +
 +/* Map the buffer. */
 +pipe_mutex_lock(bo-map_mutex);
 +/* Return the pointer if it's already mapped (in case of a race). */
 +if (bo-ptr) {
 +pipe_mutex_unlock(bo-map_mutex);
 +return bo-ptr;
 +}
 +args.handle = bo-handle;
 +args.offset = 0;
 +args.size = (uint64_t)bo-base.size;
 +if (drmCommandWriteRead(bo-rws-fd,
 +DRM_RADEON_GEM_MMAP,
 +args,
 +sizeof(args))) {
 +pipe_mutex_unlock(bo-map_mutex);
 +fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
 +bo, bo-handle);
 +return NULL;
 +}
 +
 +ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
 +   bo-rws-fd, args.addr_ptr);
 +if (ptr == MAP_FAILED) {
 +pipe_mutex_unlock(bo-map_mutex);
 +fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
 +return NULL;
 +}
 +bo-ptr = ptr;
 +pipe_mutex_unlock(bo-map_mutex);
 +
 +return bo-ptr;
 +}
 +
  static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
 struct radeon_winsys_cs *rcs,
 enum pipe_transfer_usage usage)
  {
  struct radeon_bo *bo = (struct radeon_bo*)buf;
  struct radeon_drm_cs *cs = (struct radeon_drm_cs*)rcs;
 -struct drm_radeon_gem_mmap args = {0};
 -void *ptr;

  /* If it's not unsynchronized bo_map, flush CS if needed and then wait. 
 */
  if (!(usage  PIPE_TRANSFER_UNSYNCHRONIZED)) {
 @@ -461,41 +501,7 @@ static void *radeon_bo_map(struct 
 radeon_winsys_cs_handle *buf,
  }
  }

 -/* Return the pointer if it's already mapped. */
 -if (bo-ptr)
 -return bo-ptr;
 -
 -/* Map the buffer. */
 -pipe_mutex_lock(bo-map_mutex);
 -/* Return the pointer if it's already mapped (in case of a race). */
 -if (bo-ptr) {
 -pipe_mutex_unlock(bo-map_mutex);
 -return bo-ptr;
 -}
 -args.handle = bo-handle;
 -args.offset = 0;
 -args.size = (uint64_t)bo-base.size;
 -if (drmCommandWriteRead(bo-rws-fd,
 -DRM_RADEON_GEM_MMAP,
 

[Mesa-dev] [PATCH] winsys/radeon: add command stream replay dump for faulty lockup v2

2013-03-27 Thread j . glisse
From: Jerome Glisse jgli...@redhat.com

Build time option, set RADEON_CS_DUMP_ON_LOCKUP to 1 in radeon_drm_cs.h to
enable it.

When enabled, after each cs submission the code will try to detect a lockup by
waiting on one of the buffers of the cs to become idle; after a timeout it
will consider that the cs triggered a lockup and will write a radeon_lockup.c
file in the current directory that has all the information for replaying the cs.

To build this file :
gcc -O0 -g radeon_lockup.c -ldrm -o radeon_lockup -I/usr/include/libdrm

v2: Add radeon_ctx.h file to mesa git tree

Signed-off-by: Jerome Glisse jgli...@redhat.com
---
 src/gallium/winsys/radeon/drm/Makefile.sources |   1 +
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c  |  80 +++
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h  |   2 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c  |   4 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h  |   6 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c | 141 
 src/gallium/winsys/radeon/tools/radeon_ctx.h   | 237 +
 7 files changed, 434 insertions(+), 37 deletions(-)
 create mode 100644 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c
 create mode 100644 src/gallium/winsys/radeon/tools/radeon_ctx.h

diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources 
b/src/gallium/winsys/radeon/drm/Makefile.sources
index 1d18d61..4ca5ebb 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.sources
+++ b/src/gallium/winsys/radeon/drm/Makefile.sources
@@ -1,4 +1,5 @@
 C_SOURCES := \
radeon_drm_bo.c \
radeon_drm_cs.c \
+   radeon_drm_cs_dump.c \
radeon_drm_winsys.c
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index f4ac526..5a9493a 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -391,14 +391,54 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
 FREE(bo);
 }
 
+void *radeon_bo_do_map(struct radeon_bo *bo)
+{
+struct drm_radeon_gem_mmap args = {0};
+void *ptr;
+
+/* Return the pointer if it's already mapped. */
+if (bo-ptr)
+return bo-ptr;
+
+/* Map the buffer. */
+pipe_mutex_lock(bo-map_mutex);
+/* Return the pointer if it's already mapped (in case of a race). */
+if (bo-ptr) {
+pipe_mutex_unlock(bo-map_mutex);
+return bo-ptr;
+}
+args.handle = bo-handle;
+args.offset = 0;
+args.size = (uint64_t)bo-base.size;
+if (drmCommandWriteRead(bo-rws-fd,
+DRM_RADEON_GEM_MMAP,
+args,
+sizeof(args))) {
+pipe_mutex_unlock(bo-map_mutex);
+fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
+bo, bo-handle);
+return NULL;
+}
+
+ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
+   bo-rws-fd, args.addr_ptr);
+if (ptr == MAP_FAILED) {
+pipe_mutex_unlock(bo-map_mutex);
+fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
+return NULL;
+}
+bo-ptr = ptr;
+pipe_mutex_unlock(bo-map_mutex);
+
+return bo-ptr;
+}
+
 static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
struct radeon_winsys_cs *rcs,
enum pipe_transfer_usage usage)
 {
 struct radeon_bo *bo = (struct radeon_bo*)buf;
 struct radeon_drm_cs *cs = (struct radeon_drm_cs*)rcs;
-struct drm_radeon_gem_mmap args = {0};
-void *ptr;
 
 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
 if (!(usage  PIPE_TRANSFER_UNSYNCHRONIZED)) {
@@ -461,41 +501,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle 
*buf,
 }
 }
 
-/* Return the pointer if it's already mapped. */
-if (bo-ptr)
-return bo-ptr;
-
-/* Map the buffer. */
-pipe_mutex_lock(bo-map_mutex);
-/* Return the pointer if it's already mapped (in case of a race). */
-if (bo-ptr) {
-pipe_mutex_unlock(bo-map_mutex);
-return bo-ptr;
-}
-args.handle = bo-handle;
-args.offset = 0;
-args.size = (uint64_t)bo-base.size;
-if (drmCommandWriteRead(bo-rws-fd,
-DRM_RADEON_GEM_MMAP,
-args,
-sizeof(args))) {
-pipe_mutex_unlock(bo-map_mutex);
-fprintf(stderr, radeon: gem_mmap failed: %p 0x%08X\n,
-bo, bo-handle);
-return NULL;
-}
-
-ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED,
-   bo-rws-fd, args.addr_ptr);
-if (ptr == MAP_FAILED) {
-pipe_mutex_unlock(bo-map_mutex);
-fprintf(stderr, radeon: mmap failed, errno: %i\n, errno);
-return NULL;
-}
-bo-ptr = ptr;
-pipe_mutex_unlock(bo-map_mutex);
-
-return bo-ptr;
+return radeon_bo_do_map(bo);
 }
 
 static void 

[Mesa-dev] [PATCH 0/4] Begin some ir_dereference_array-of-a-vector rework

2013-03-27 Thread Ian Romanick
This is the first of three patch series that I'm going to send related
to reworking ir_dereference_array of vectors.  The final series ends with
ir_dereference_array of vectors being completely removed from the IR.

The first series is a set of fixes for bugs that I discovered along the
way.  The first two are definitely candidates for the stable branches.
The second two might be, but I could be convinced either way.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] glsl: Add missing bool case in glsl_type::get_scalar_type

2013-03-27 Thread Ian Romanick
From: Ian Romanick ian.d.roman...@intel.com

Since the case was missing, bec4->get_scalar_type() would return bvec4,
but vec4->get_scalar_type() would return float.

NOTE: This is a candidate for stable branches.

Signed-off-by: Ian Romanick ian.d.roman...@intel.com
---
 src/glsl/glsl_types.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 8b0a248..419761a 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -476,6 +476,8 @@ const glsl_type *glsl_type::get_scalar_type() const
   return int_type;
case GLSL_TYPE_FLOAT:
   return float_type;
+   case GLSL_TYPE_BOOL:
+  return bool_type;
default:
   /* Handle everything else */
   return type;
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] glsl: Generated masked write instead of vector array index for UBO lowering

2013-03-27 Thread Ian Romanick
From: Ian Romanick ian.d.roman...@intel.com

When reading a column from a row-major matrix, we would slot the single
value read into the vector using an ir_dereference_array of the vector
with a constant index.  This will (eventually) get optimized to a
masked-write, so just generate the masked write in the first place.

Signed-off-by: Ian Romanick ian.d.roman...@intel.com
Cc: Eric Anholt e...@anholt.net
---
 src/glsl/lower_ubo_reference.cpp | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 026197d..9e5e951 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -357,17 +357,14 @@ 
lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
 
   for (unsigned i = 0; i  deref-type-vector_elements; i++) {
 ir_rvalue *chan = new(mem_ctx) ir_constant((int)i);
-ir_dereference *deref_chan =
-   new(mem_ctx) ir_dereference_array(deref-clone(mem_ctx, NULL),
- chan);
-
 ir_rvalue *chan_offset =
add(base_offset,
new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));
 
-base_ir-insert_before(assign(deref_chan,
+base_ir-insert_before(assign(deref-clone(mem_ctx, NULL),
   ubo_load(glsl_type::float_type,
-   chan_offset)));
+   chan_offset),
+  (1U  i)));
   }
}
 }
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] glsl: Replace open-coded dot-product with dot

2013-03-27 Thread Ian Romanick
From: Ian Romanick ian.d.roman...@intel.com

Signed-off-by: Ian Romanick ian.d.roman...@intel.com
Cc: Eric Anholt e...@anholt.net
Cc: Paul Berry stereotype...@gmail.com
---
 src/glsl/builtins/glsl/determinant.glsl | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/glsl/builtins/glsl/determinant.glsl 
b/src/glsl/builtins/glsl/determinant.glsl
index 78751a6..0800d40 100644
--- a/src/glsl/builtins/glsl/determinant.glsl
+++ b/src/glsl/builtins/glsl/determinant.glsl
@@ -22,6 +22,10 @@
  */
 
 #version 120
+
+// Forward declaration because builtins don't know about other builtins.
+float dot(vec4, vec4);
+
 float determinant(mat2 m)
 {
return m[0].x * m[1].y - m[1].x * m[0].y;
@@ -63,8 +67,5 @@ float determinant(mat4 m)
adj_0.z = + (m[1].x * SubFactor01 - m[1].y * SubFactor03 + m[1].w * 
SubFactor05);
adj_0.w = - (m[1].x * SubFactor02 - m[1].y * SubFactor04 + m[1].z * 
SubFactor05);
 
-   return (+ m[0].x * adj_0.x
-   + m[0].y * adj_0.y
-   + m[0].z * adj_0.z
-   + m[0].w * adj_0.w);
+   return dot(m[0], adj_0);
 }
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] glsl: Replace constant-index vector array accesses with swizzles

2013-03-27 Thread Ian Romanick
From: Ian Romanick ian.d.roman...@intel.com

Search and replace:

][0] -> ].x
][1] -> ].y
][2] -> ].z
][3] -> ].w

Fixes piglit tests inverse-mat[234].{vert,frag}.  These tests call the
inverse function with constant parameters and expect proper constant
folding to happen.  My suspicion is that this patch papers over some bug
in constant propagation involving array accesses.

Either way, all of these accesses eventually get lowered to swizzles.
This cuts out the middle man (saving a trivial amount of CPU).

NOTE: This is a candidate for the 9.1 branch.

Signed-off-by: Ian Romanick ian.d.roman...@intel.com
Cc: Eric Anholt e...@anholt.net
Cc: Paul Berry stereotype...@gmail.com
---
 src/glsl/builtins/glsl/determinant.glsl |  62 +-
 src/glsl/builtins/glsl/inverse.glsl | 112 
 2 files changed, 87 insertions(+), 87 deletions(-)

diff --git a/src/glsl/builtins/glsl/determinant.glsl 
b/src/glsl/builtins/glsl/determinant.glsl
index 32695a8..78751a6 100644
--- a/src/glsl/builtins/glsl/determinant.glsl
+++ b/src/glsl/builtins/glsl/determinant.glsl
@@ -24,47 +24,47 @@
 #version 120
 float determinant(mat2 m)
 {
-   return m[0][0] * m[1][1] - m[1][0] * m[0][1];
+   return m[0].x * m[1].y - m[1].x * m[0].y;
 }
 
 float determinant(mat3 m)
 {
-   return (+ m[0][0] * (m[1][1] * m[2][2] - m[1][2] * m[2][1])
-   - m[0][1] * (m[1][0] * m[2][2] - m[1][2] * m[2][0])
-   + m[0][2] * (m[1][0] * m[2][1] - m[1][1] * m[2][0]));
+   return (+ m[0].x * (m[1].y * m[2].z - m[1].z * m[2].y)
+   - m[0].y * (m[1].x * m[2].z - m[1].z * m[2].x)
+   + m[0].z * (m[1].x * m[2].y - m[1].y * m[2].x));
 }
 
 float determinant(mat4 m)
 {
-   float SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
-   float SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
-   float SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
-   float SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
-   float SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
-   float SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
-   float SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
-   float SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
-   float SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
-   float SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
-   float SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
-   float SubFactor11 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
-   float SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
-   float SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
-   float SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
-   float SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
-   float SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
-   float SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
-   float SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
+   float SubFactor00 = m[2].z * m[3].w - m[3].z * m[2].w;
+   float SubFactor01 = m[2].y * m[3].w - m[3].y * m[2].w;
+   float SubFactor02 = m[2].y * m[3].z - m[3].y * m[2].z;
+   float SubFactor03 = m[2].x * m[3].w - m[3].x * m[2].w;
+   float SubFactor04 = m[2].x * m[3].z - m[3].x * m[2].z;
+   float SubFactor05 = m[2].x * m[3].y - m[3].x * m[2].y;
+   float SubFactor06 = m[1].z * m[3].w - m[3].z * m[1].w;
+   float SubFactor07 = m[1].y * m[3].w - m[3].y * m[1].w;
+   float SubFactor08 = m[1].y * m[3].z - m[3].y * m[1].z;
+   float SubFactor09 = m[1].x * m[3].w - m[3].x * m[1].w;
+   float SubFactor10 = m[1].x * m[3].z - m[3].x * m[1].z;
+   float SubFactor11 = m[1].y * m[3].w - m[3].y * m[1].w;
+   float SubFactor12 = m[1].x * m[3].y - m[3].x * m[1].y;
+   float SubFactor13 = m[1].z * m[2].w - m[2].z * m[1].w;
+   float SubFactor14 = m[1].y * m[2].w - m[2].y * m[1].w;
+   float SubFactor15 = m[1].y * m[2].z - m[2].y * m[1].z;
+   float SubFactor16 = m[1].x * m[2].w - m[2].x * m[1].w;
+   float SubFactor17 = m[1].x * m[2].z - m[2].x * m[1].z;
+   float SubFactor18 = m[1].x * m[2].y - m[2].x * m[1].y;
 
vec4 adj_0;
 
-   adj_0[0] = + (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * 
SubFactor02);
-   adj_0[1] = - (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * 
SubFactor04);
-   adj_0[2] = + (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * 
SubFactor05);
-   adj_0[3] = - (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * 
SubFactor05);
+   adj_0.x = + (m[1].y * SubFactor00 - m[1].z * SubFactor01 + m[1].w * 
SubFactor02);
+   adj_0.y = - (m[1].x * SubFactor00 - m[1].z * SubFactor03 + m[1].w * 
SubFactor04);
+   adj_0.z = + (m[1].x * SubFactor01 - m[1].y * SubFactor03 + m[1].w * 
SubFactor05);
+   adj_0.w = - (m[1].x * SubFactor02 - m[1].y * SubFactor04 + m[1].z * 
SubFactor05);
 
-   return (+ m[0][0] * adj_0[0]
-   + m[0][1] * adj_0[1]
-   + m[0][2] * adj_0[2]
-   + m[0][3] * adj_0[3]);
+   return (+ m[0].x * adj_0.x
+   + m[0].y * adj_0.y
+   + m[0].z * adj_0.z

[Mesa-dev] [PATCH] r600g: fix range handling for tgsi input/output declarations

2013-03-27 Thread Vadim Girlin
Signed-off-by: Vadim Girlin vadimgir...@gmail.com
---
 src/gallium/drivers/r600/r600_shader.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 29facf7..d4c9c03 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -874,12 +874,12 @@ static int select_twoside_color(struct r600_shader_ctx 
*ctx, int front, int back
 static int tgsi_declaration(struct r600_shader_ctx *ctx)
 {
struct tgsi_full_declaration *d = ctx-parse.FullToken.FullDeclaration;
-   unsigned i;
-   int r;
+   int r, i, j, count = d-Range.Last - d-Range.First + 1;
 
switch (d-Declaration.File) {
case TGSI_FILE_INPUT:
-   i = ctx-shader-ninput++;
+   i = ctx-shader-ninput;
+   ctx-shader-ninput += count;
ctx-shader-input[i].name = d-Semantic.Name;
ctx-shader-input[i].sid = d-Semantic.Index;
ctx-shader-input[i].interpolate = d-Interp.Interpolate;
@@ -903,9 +903,15 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
return r;
}
}
+   for (j = 1; j  count; ++j) {
+   memcpy(ctx-shader-input[i + j], 
ctx-shader-input[i],
+  sizeof(struct r600_shader_io));
+   ctx-shader-input[i + j].gpr += j;
+   }
break;
case TGSI_FILE_OUTPUT:
-   i = ctx-shader-noutput++;
+   i = ctx-shader-noutput;
+   ctx-shader-noutput += count;
ctx-shader-output[i].name = d-Semantic.Name;
ctx-shader-output[i].sid = d-Semantic.Index;
ctx-shader-output[i].gpr = ctx-file_offset[TGSI_FILE_OUTPUT] 
+ d-Range.First;
@@ -933,6 +939,11 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
break;
}
}
+   for (j = 1; j < count; ++j) {
+   memcpy(&ctx->shader->output[i + j], 
&ctx->shader->output[i],
+  sizeof(struct r600_shader_io));
+   ctx->shader->output[i + j].gpr += j;
+   }
break;
case TGSI_FILE_CONSTANT:
case TGSI_FILE_TEMPORARY:
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] glsl: Add missing bool case in glsl_type::get_scalar_type

2013-03-27 Thread Matt Turner
On Wed, Mar 27, 2013 at 9:30 AM, Ian Romanick i...@freedesktop.org wrote:
 From: Ian Romanick ian.d.roman...@intel.com

 Since the case was missing bec4-get_scalar_type() would return bvec4,

bvec4.

Series is Reviewed-by: Matt Turner matts...@gmail.com

 but vec4-get_scalar_type() would return float.

 NOTE: This is a candidate for stable branches.

 Signed-off-by: Ian Romanick ian.d.roman...@intel.com
 ---
  src/glsl/glsl_types.cpp | 2 ++
  1 file changed, 2 insertions(+)

 diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
 index 8b0a248..419761a 100644
 --- a/src/glsl/glsl_types.cpp
 +++ b/src/glsl/glsl_types.cpp
 @@ -476,6 +476,8 @@ const glsl_type *glsl_type::get_scalar_type() const
return int_type;
 case GLSL_TYPE_FLOAT:
return float_type;
 +   case GLSL_TYPE_BOOL:
 +  return bool_type;
 default:
/* Handle everything else */
return type;
 --
 1.8.1.4

 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] glsl: Generated masked write instead of vector array index for UBO lowering

2013-03-27 Thread Kenneth Graunke

On 03/27/2013 09:30 AM, Ian Romanick wrote:

From: Ian Romanick ian.d.roman...@intel.com

When reading a column from a row-major matrix, we would slot the single
value read into the vector using an ir_dereference_array of the vector
with a constant index.  This will (eventually) get optimized to a
masked-write, so just generate the masked write in the first place.

Signed-off-by: Ian Romanick ian.d.roman...@intel.com
Cc: Eric Anholt e...@anholt.net
---
  src/glsl/lower_ubo_reference.cpp | 9 +++--
  1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 026197d..9e5e951 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -357,17 +357,14 @@ 
lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,

for (unsigned i = 0; i < deref->type->vector_elements; i++) {
 ir_rvalue *chan = new(mem_ctx) ir_constant((int)i);


lower_ubo_reference.cpp:359:14: warning: unused variable 'chan' 
[-Wunused-variable]


You should delete this too.  Otherwise,

For the series:
Reviewed-by: Kenneth Graunke kenn...@whitecape.org


-ir_dereference *deref_chan =
-   new(mem_ctx) ir_dereference_array(deref-clone(mem_ctx, NULL),
- chan);
-
 ir_rvalue *chan_offset =
add(base_offset,
new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));

-base_ir->insert_before(assign(deref_chan,
+base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
   ubo_load(glsl_type::float_type,
-   chan_offset)));
+   chan_offset),
+  (1U << i)));
}
 }
  }



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] i965/fs: Generate LOD sampler message from ir_lod.

2013-03-27 Thread Kenneth Graunke

On 03/19/2013 11:51 AM, Matt Turner wrote:

---
  src/mesa/drivers/dri/i965/brw_defines.h|2 ++
  src/mesa/drivers/dri/i965/brw_fs.cpp   |4 +++-
  src/mesa/drivers/dri/i965/brw_fs_emit.cpp  |6 ++
  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   |9 +
  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp |3 +++
  5 files changed, 23 insertions(+), 1 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index d9b7f9a..24a1f00 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -710,6 +710,7 @@ enum opcode {
 SHADER_OPCODE_TXS,
 FS_OPCODE_TXB,
 SHADER_OPCODE_TXF_MS,
+   SHADER_OPCODE_LOD,

 SHADER_OPCODE_SHADER_TIME_ADD,

@@ -894,6 +895,7 @@ enum brw_message_target {
  #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
  #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
  #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD   7
+#define GEN6_SAMPLER_MESSAGE_LOD 9


This message is actually available on Ironlake too, AFAICT, so why not 
GEN5_SAMPLER_MESSAGE_LOD?



  #define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO  10
  #define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
  #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS   29
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 927cf13..4d2e17c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -338,7 +338,8 @@ fs_inst::is_tex()
 opcode == SHADER_OPCODE_TXF ||
 opcode == SHADER_OPCODE_TXF_MS ||
 opcode == SHADER_OPCODE_TXL ||
-   opcode == SHADER_OPCODE_TXS);
+   opcode == SHADER_OPCODE_TXS ||
+   opcode == SHADER_OPCODE_LOD);
  }

  bool
@@ -744,6 +745,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
 case SHADER_OPCODE_TXF_MS:
 case SHADER_OPCODE_TXL:
 case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
return 1;
 case SHADER_OPCODE_SHADER_TIME_ADD:
return 0;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 2391ad1..039589c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -404,6 +404,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
   else
  msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   break;
+  case SHADER_OPCODE_LOD:
+ /* Gen6+. Otherwise ARB_texture_query_lod not exposed. */
+ assert(intel->gen >= 6);
+ msg_type = GEN6_SAMPLER_MESSAGE_LOD;


It seems trivial to support on Ironlake (literally change your assert), 
so why not?



+ break;
default:
 assert(!"not reached");
 break;
@@ -1245,6 +1250,7 @@ fs_generator::generate_code(exec_list *instructions)
case SHADER_OPCODE_TXF_MS:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
+  case SHADER_OPCODE_LOD:
 generate_tex(inst, dst, src[0]);
 break;
case FS_OPCODE_DDX:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 92bc621..1d744d1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -944,6 +944,9 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, 
fs_reg coordinate,
 case ir_txf:
inst = emit(SHADER_OPCODE_TXF, dst);
break;
+   case ir_lod:
+  inst = emit(SHADER_OPCODE_LOD, dst);
+  break;


If you aren't supporting the extension on Gen4, why do you have code for 
it here?



 default:
fail(unrecognized texture opcode);
 }
@@ -1084,6 +1087,9 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, 
fs_reg coordinate,
mlen += reg_width;
inst = emit(SHADER_OPCODE_TXF_MS, dst);
break;
+   case ir_lod:
+  inst = emit(SHADER_OPCODE_LOD, dst);
+  break;
 }
 inst-base_mrf = base_mrf;
 inst-mlen = mlen;
@@ -1124,6 +1130,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, 
fs_reg coordinate,
 /* Set up the LOD info */
 switch (ir-op) {
 case ir_tex:
+   case ir_lod:
break;
 case ir_txb:
emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
@@ -1237,6 +1244,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, 
fs_reg coordinate,
 case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
 case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst); break;
 case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
+   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst); break;
 }
 inst-base_mrf = base_mrf;
 inst-mlen = mlen;
@@ -1388,6 +1396,7 @@ fs_visitor::visit(ir_texture *ir)
 fs_reg lod, lod2, sample_index;
 switch (ir-op) {
 case ir_tex:
+   case ir_lod:
break;
 case ir_txb:
   

[Mesa-dev] [PATCH 1/2] R600: Emit native instructions for tex

2013-03-27 Thread Vincent Lejeune
---
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 169 +
 lib/Target/R600/R600Instructions.td| 156 +++
 2 files changed, 196 insertions(+), 129 deletions(-)

diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index d207160..00ebb44 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -66,8 +66,6 @@ private:
   void EmitSrcISA(const MCInst MI, unsigned RegOpIdx, unsigned SelOpIdx,
 raw_ostream OS) const;
   void EmitDst(const MCInst MI, raw_ostream OS) const;
-  void EmitTexInstr(const MCInst MI, SmallVectorImplMCFixup Fixups,
-raw_ostream OS) const;
   void EmitFCInstr(const MCInst MI, raw_ostream OS) const;
 
   void EmitNullBytes(unsigned int byteCount, raw_ostream OS) const;
@@ -140,9 +138,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
MCInstrInfo MCII,
 
 void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS,
SmallVectorImplMCFixup Fixups) const 
{
-  if (isTexOp(MI.getOpcode())) {
-EmitTexInstr(MI, Fixups, OS);
-  } else if (isFCOp(MI.getOpcode())){
+  if (isFCOp(MI.getOpcode())){
 EmitFCInstr(MI, OS);
   } else if (MI.getOpcode() == AMDGPU::RETURN ||
 MI.getOpcode() == AMDGPU::BUNDLE ||
@@ -175,6 +171,76 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
MI, raw_ostream OS,
   Emit(InstWord2, OS);
   break;
 }
+case AMDGPU::TEX_LD:
+case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+case AMDGPU::TEX_SAMPLE:
+case AMDGPU::TEX_SAMPLE_C:
+case AMDGPU::TEX_SAMPLE_L:
+case AMDGPU::TEX_SAMPLE_C_L:
+case AMDGPU::TEX_SAMPLE_LB:
+case AMDGPU::TEX_SAMPLE_C_LB:
+case AMDGPU::TEX_SAMPLE_G:
+case AMDGPU::TEX_SAMPLE_C_G:
+case AMDGPU::TEX_GET_GRADIENTS_H:
+case AMDGPU::TEX_GET_GRADIENTS_V:
+case AMDGPU::TEX_SET_GRADIENTS_H:
+case AMDGPU::TEX_SET_GRADIENTS_V: {
+  unsigned Opcode = MI.getOpcode();
+  bool hasOffsets = (Opcode == AMDGPU::TEX_LD);
+  unsigned OpOffset = hasOffsets ? 3 : 0;
+  int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
+  int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
+
+  uint32_t srcSelect[4] = {0, 1, 2, 3};
+  uint32_t Offsets[3] = {0 , 0, 0};
+  uint64_t coordType[4] = {1, 1, 1, 1};
+
+  if (hasOffsets)
+for (unsigned i = 0; i < 3; i++)
+  Offsets[i] = MI.getOperand(i + 2).getImm();
+
+  if (TextureType == TEXTURE_RECT
+|| TextureType == TEXTURE_SHADOWRECT) {
+coordType[ELEMENT_X] = 0;
+coordType[ELEMENT_Y] = 0;
+  }
+
+  if (TextureType == TEXTURE_1D_ARRAY
+  || TextureType == TEXTURE_SHADOW1D_ARRAY) {
+if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == 
AMDGPU::TEX_SAMPLE_C_LB) {
+  coordType[ELEMENT_Y] = 0;
+} else {
+  coordType[ELEMENT_Z] = 0;
+  srcSelect[ELEMENT_Z] = ELEMENT_Y;
+}
+  } else if (TextureType == TEXTURE_2D_ARRAY
+ || TextureType == TEXTURE_SHADOW2D_ARRAY) {
+coordType[ELEMENT_Z] = 0;
+  }
+
+
+  if ((TextureType == TEXTURE_SHADOW1D
+  || TextureType == TEXTURE_SHADOW2D
+  || TextureType == TEXTURE_SHADOWRECT
+  || TextureType == TEXTURE_SHADOW1D_ARRAY)
+   && Opcode != AMDGPU::TEX_SAMPLE_C_L
+   && Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
+srcSelect[ELEMENT_W] = ELEMENT_Z;
+  }
+
+  uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
+  coordType[ELEMENT_X] << 60 | coordType[ELEMENT_Y] << 61 |
+  coordType[ELEMENT_Z] << 62 | coordType[ELEMENT_W] << 63;
+  uint32_t Word2 = Sampler << 15 | srcSelect[ELEMENT_X] << 20 |
+  srcSelect[ELEMENT_Y] << 23 | srcSelect[ELEMENT_Z] << 26 |
+  srcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+  Offsets[2] << 10;
+
+  EmitByte(INSTR_TEX, OS);
+  Emit(Word01, OS);
+  Emit(Word2, OS);
+  break;
+}
 case AMDGPU::EG_ExportSwz:
 case AMDGPU::R600_ExportSwz:
 case AMDGPU::EG_ExportBuf:
@@ -334,99 +400,6 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst MI, 
unsigned RegOpIdx,
   Emit(InlineConstant.i, OS);
 }
 
-void R600MCCodeEmitter::EmitTexInstr(const MCInst MI,
- SmallVectorImplMCFixup Fixups,
- raw_ostream OS) const {
-
-  unsigned Opcode = MI.getOpcode();
-  bool hasOffsets = (Opcode == AMDGPU::TEX_LD);
-  unsigned OpOffset = hasOffsets ? 3 : 0;
-  int64_t Resource = MI.getOperand(OpOffset + 2).getImm();
-  int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
-  int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
-  unsigned srcSelect[4] = {0, 1, 2, 3};
-
-  // Emit instruction type
-  EmitByte(1, OS);
-
-  // Emit instruction
-  EmitByte(getBinaryCodeForInstr(MI, 

[Mesa-dev] [PATCH 2/2] R600: Emit CF_ALU and use true kcache register.

2013-03-27 Thread Vincent Lejeune
---
 lib/Target/R600/AMDGPU.h   |   1 +
 lib/Target/R600/AMDGPUTargetMachine.cpp|   1 +
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  10 +-
 lib/Target/R600/R600EmitClauseMarkers.cpp  | 243 +
 lib/Target/R600/R600Instructions.td|  83 ++-
 lib/Target/R600/R600RegisterInfo.td|  63 ++
 6 files changed, 389 insertions(+), 12 deletions(-)
 create mode 100644 lib/Target/R600/R600EmitClauseMarkers.cpp

diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index e099a9f..3cd792a 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
 // R600 Passes
 FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm);
+FunctionPass *createR600EmitClauseMarkers(TargetMachine tm);
 
 // SI Passes
 FunctionPass *createSIAnnotateControlFlowPass();
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp 
b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 0185747..45b1be0 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -151,6 +151,7 @@ bool AMDGPUPassConfig::addPreEmitPass() {
   if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
 addPass(createAMDGPUCFGPreparationPass(*TM));
 addPass(createAMDGPUCFGStructurizerPass(*TM));
+addPass(createR600EmitClauseMarkers(*TM));
 addPass(createR600ExpandSpecialInstrsPass(*TM));
 addPass(FinalizeMachineBundlesID);
   } else {
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 00ebb44..cf43f3f 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -101,7 +101,8 @@ enum InstrTypes {
   INSTR_FC,
   INSTR_NATIVE,
   INSTR_VTX,
-  INSTR_EXPORT
+  INSTR_EXPORT,
+  INSTR_CFALU
 };
 
 enum FCInstr {
@@ -250,6 +251,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
MI, raw_ostream OS,
   Emit(Inst, OS);
   break;
 }
+case AMDGPU::CF_ALU:
+case AMDGPU::CF_ALU_PUSH_BEFORE: {
+  uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+  EmitByte(INSTR_CFALU, OS);
+  Emit(Inst, OS);
+  break;
+}
 
 default:
   EmitALUInstr(MI, Fixups, OS);
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp 
b/lib/Target/R600/R600EmitClauseMarkers.cpp
new file mode 100644
index 000..b869c88
--- /dev/null
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -0,0 +1,243 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU 
---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+/// \file
+/// Add CF_ALU. R600 ALU instructions are grouped in clauses which can hold up
+/// to 128 ALU instructions; these instructions can access up to 4 prefetched
+/// lines of 16 registers from constant buffers. Such ALU clauses are
+/// initiated by CF_ALU instructions.
+//===--===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+class R600EmitClauseMarkersPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned OccupiedDwords(MachineInstr *MI) const {
+switch (MI->getOpcode()) {
+case AMDGPU::INTERP_PAIR_XY:
+case AMDGPU::INTERP_PAIR_ZW:
+case AMDGPU::INTERP_VEC_LOAD:
+case AMDGPU::DOT4_eg_pseudo:
+case AMDGPU::DOT4_r600_pseudo:
+  return 4;
+case AMDGPU::KILL:
+  return 0;
+default:
+  break;
+}
+
+if(TII->isVector(*MI) ||
+TII->isCubeOp(MI->getOpcode()) ||
+TII->isReductionOp(MI->getOpcode()))
+  return 4;
+
+unsigned NumLiteral = 0;
+for (MachineInstr::mop_iterator It = MI->operands_begin(),
+E = MI->operands_end(); It != E; ++It) {
+  MachineOperand &MO = *It;
+  if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+++NumLiteral;
+}
+return 1 + NumLiteral;
+  }
+
+  bool isALU(const MachineInstr *MI) const {
+if (TII->isALUInstr(MI->getOpcode()))
+  return true;
+if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
+  return true;
+switch (MI->getOpcode()) {
+case AMDGPU::INTERP_PAIR_XY:
+case AMDGPU::INTERP_PAIR_ZW:
+case AMDGPU::INTERP_VEC_LOAD:
+case AMDGPU::COPY:
+case AMDGPU::DOT4_eg_pseudo:
+case AMDGPU::DOT4_r600_pseudo:
+  return true;
+default:
+  

[Mesa-dev] [PATCH 1/2] r600g/llvm: use native encode for tex

2013-03-27 Thread Vincent Lejeune
---
 src/gallium/drivers/r600/r600_shader.c | 50 ++
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 29facf7..1e21559 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -489,29 +489,33 @@ static unsigned r600_tex_from_byte_stream(struct 
r600_shader_ctx *ctx,
 {
struct r600_bytecode_tex tex;
 
-   tex.op = r600_isa_fetch_by_opcode(ctx-bc-isa, bytes[bytes_read++]);
-   tex.resource_id = bytes[bytes_read++];
-   tex.src_gpr = bytes[bytes_read++];
-   tex.src_rel = bytes[bytes_read++];
-   tex.dst_gpr = bytes[bytes_read++];
-   tex.dst_rel = bytes[bytes_read++];
-   tex.dst_sel_x = bytes[bytes_read++];
-   tex.dst_sel_y = bytes[bytes_read++];
-   tex.dst_sel_z = bytes[bytes_read++];
-   tex.dst_sel_w = bytes[bytes_read++];
-   tex.lod_bias = bytes[bytes_read++];
-   tex.coord_type_x = bytes[bytes_read++];
-   tex.coord_type_y = bytes[bytes_read++];
-   tex.coord_type_z = bytes[bytes_read++];
-   tex.coord_type_w = bytes[bytes_read++];
-   tex.offset_x = bytes[bytes_read++];
-   tex.offset_y = bytes[bytes_read++];
-   tex.offset_z = bytes[bytes_read++];
-   tex.sampler_id = bytes[bytes_read++];
-   tex.src_sel_x = bytes[bytes_read++];
-   tex.src_sel_y = bytes[bytes_read++];
-   tex.src_sel_z = bytes[bytes_read++];
-   tex.src_sel_w = bytes[bytes_read++];
+   uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
+   uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
+   uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
+
+   tex.op = r600_isa_fetch_by_opcode(ctx->bc->isa, 
G_SQ_TEX_WORD0_TEX_INST(word0));
+   tex.resource_id = G_SQ_TEX_WORD0_RESOURCE_ID(word0);
+   tex.src_gpr = G_SQ_TEX_WORD0_SRC_GPR(word0);
+   tex.src_rel = G_SQ_TEX_WORD0_SRC_REL(word0);
+   tex.dst_gpr = G_SQ_TEX_WORD1_DST_GPR(word1);
+   tex.dst_rel = G_SQ_TEX_WORD1_DST_REL(word1);
+   tex.dst_sel_x = G_SQ_TEX_WORD1_DST_SEL_X(word1);
+   tex.dst_sel_y = G_SQ_TEX_WORD1_DST_SEL_Y(word1);
+   tex.dst_sel_z = G_SQ_TEX_WORD1_DST_SEL_Z(word1);
+   tex.dst_sel_w = G_SQ_TEX_WORD1_DST_SEL_W(word1);
+   tex.lod_bias = G_SQ_TEX_WORD1_LOD_BIAS(word1);
+   tex.coord_type_x = G_SQ_TEX_WORD1_COORD_TYPE_X(word1);
+   tex.coord_type_y = G_SQ_TEX_WORD1_COORD_TYPE_Y(word1);
+   tex.coord_type_z = G_SQ_TEX_WORD1_COORD_TYPE_Z(word1);
+   tex.coord_type_w = G_SQ_TEX_WORD1_COORD_TYPE_W(word1);
+   tex.offset_x = G_SQ_TEX_WORD2_OFFSET_X(word2);
+   tex.offset_y = G_SQ_TEX_WORD2_OFFSET_Y(word2);
+   tex.offset_z = G_SQ_TEX_WORD2_OFFSET_Z(word2);
+   tex.sampler_id = G_SQ_TEX_WORD2_SAMPLER_ID(word2);
+   tex.src_sel_x = G_SQ_TEX_WORD2_SRC_SEL_X(word2);
+   tex.src_sel_y = G_SQ_TEX_WORD2_SRC_SEL_Y(word2);
+   tex.src_sel_z = G_SQ_TEX_WORD2_SRC_SEL_Z(word2);
+   tex.src_sel_w = G_SQ_TEX_WORD2_SRC_SEL_W(word2);
 
tex.inst_mod = 0;
 
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] r600g/llvm: Add support for cf_alu native encode

2013-03-27 Thread Vincent Lejeune
---
 src/gallium/drivers/r600/r600_asm.c|  2 +-
 src/gallium/drivers/r600/r600_asm.h|  1 +
 src/gallium/drivers/r600/r600_shader.c | 14 ++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c
index 0d570ca..65c705d 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -106,7 +106,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
bc-msaa_texture_mode = msaa_texture_mode;
 }
 
-static int r600_bytecode_add_cf(struct r600_bytecode *bc)
+int r600_bytecode_add_cf(struct r600_bytecode *bc)
 {
struct r600_bytecode_cf *cf = r600_bytecode_cf();
 
diff --git a/src/gallium/drivers/r600/r600_asm.h 
b/src/gallium/drivers/r600/r600_asm.h
index 1465c31..c1aa3ba 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -227,6 +227,7 @@ int r600_bytecode_add_tex(struct r600_bytecode *bc,
 int r600_bytecode_add_output(struct r600_bytecode *bc,
const struct r600_bytecode_output *output);
 int r600_bytecode_build(struct r600_bytecode *bc);
+int r600_bytecode_add_cf(struct r600_bytecode *bc);
 int r600_bytecode_add_cfinst(struct r600_bytecode *bc,
unsigned op);
 int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 1e21559..6fd1f42 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -626,6 +626,20 @@ static void r600_bytecode_from_byte_stream(struct 
r600_shader_ctx *ctx,
 bytes_read = r600_export_from_byte_stream(ctx, bytes,
 bytes_read);
 break;
+   case 6: {
+   int32_t word0 = i32_from_byte_stream(bytes, 
&bytes_read);
+   int32_t word1 = i32_from_byte_stream(bytes, 
&bytes_read);
+
+   r600_bytecode_add_cf(ctx->bc);
+   ctx->bc->cf_last->op = 
r600_isa_cf_by_opcode(ctx->bc->isa, G_SQ_CF_ALU_WORD1_CF_INST(word1), 1);
+   ctx->bc->cf_last->kcache[0].bank = 
G_SQ_CF_ALU_WORD0_KCACHE_BANK0(word0);
+   ctx->bc->cf_last->kcache[0].addr = 
G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(word1);
+   ctx->bc->cf_last->kcache[0].mode = 
G_SQ_CF_ALU_WORD0_KCACHE_MODE0(word0);
+   ctx->bc->cf_last->kcache[1].bank = 
G_SQ_CF_ALU_WORD0_KCACHE_BANK1(word0);
+   ctx->bc->cf_last->kcache[1].addr = 
G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(word1);
+   ctx->bc->cf_last->kcache[1].mode = 
G_SQ_CF_ALU_WORD1_KCACHE_MODE1(word1);
+   break;
+  }
default:
/* XXX: Error here */
break;
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] intel: add and use the always_have_depth_buffer DRI config option

2013-03-27 Thread Brian Paul
When set to true, create all GLX visual configs with a depth buffer.
Used to allow apps such as Topogun to work properly on Linux.
This is just like the option recently added for gallium DRI drivers.

--

Note: I only compile-tested this.  It would be great if an Intel
developer could test.  A free demo of Topogun is available.
---
 src/mesa/drivers/dri/intel/intel_screen.c |   60 +++-
 1 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_screen.c 
b/src/mesa/drivers/dri/intel/intel_screen.c
index 3ca10c8..80b925e 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -93,6 +93,10 @@ PUBLIC const char __driConfigOptions[] =
 DRI_CONF_DESC(en, "Perform code generation at shader link time.")
   DRI_CONF_OPT_END
DRI_CONF_SECTION_END
+
+   DRI_CONF_SECTION_MISCELLANEOUS
+  DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER(false)
+   DRI_CONF_SECTION_END
 DRI_CONF_END;
 
 const GLuint __driNConfigOptions = 17;
@@ -1084,35 +1088,46 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
uint8_t depth_bits[4], stencil_bits[4];
__DRIconfig **configs = NULL;
 
+   bool always_have_depth_buffer =
+  driQueryOptionb(&screen->optionCache, "always_have_depth_buffer");
+
/* Generate singlesample configs without accumulation buffer. */
for (int i = 0; i < ARRAY_SIZE(formats); i++) {
   __DRIconfig **new_configs;
-  int num_depth_stencil_bits = 2;
+  int num_ds_modes;
 
   /* Starting with DRI2 protocol version 1.1 we can request a depth/stencil
* buffer that has a different number of bits per pixel than the color
* buffer, gen >= 6 supports this.
*/
-  depth_bits[0] = 0;
-  stencil_bits[0] = 0;
+  if (always_have_depth_buffer) {
+ num_ds_modes = 0;
+  }
+  else {
+ depth_bits[0] = 0;
+ stencil_bits[0] = 0;
+ num_ds_modes = 1;
+  }
 
   if (formats[i] == MESA_FORMAT_RGB565) {
- depth_bits[1] = 16;
- stencil_bits[1] = 0;
+ depth_bits[num_ds_modes] = 16;
+ stencil_bits[num_ds_modes] = 0;
+ num_ds_modes++;
  if (screen->gen >= 6) {
- depth_bits[2] = 24;
- stencil_bits[2] = 8;
- num_depth_stencil_bits = 3;
+ depth_bits[num_ds_modes] = 24;
+ stencil_bits[num_ds_modes] = 8;
+ num_ds_modes++;
  }
   } else {
- depth_bits[1] = 24;
- stencil_bits[1] = 8;
+ depth_bits[num_ds_modes] = 24;
+ stencil_bits[num_ds_modes] = 8;
+ num_ds_modes++;
   }
 
   new_configs = driCreateConfigs(formats[i],
  depth_bits,
  stencil_bits,
- num_depth_stencil_bits,
+ num_ds_modes,
  back_buffer_modes, 2,
  singlesample_samples, 1,
  false);
@@ -1159,19 +1174,26 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
  break;
 
   __DRIconfig **new_configs;
-  const int num_depth_stencil_bits = 2;
+  int num_ds_modes;
   int num_msaa_modes = 0;
 
-  depth_bits[0] = 0;
-  stencil_bits[0] = 0;
+  if (always_have_depth_buffer) {
+ num_ds_modes = 0;
+  }
+  else {
+ depth_bits[0] = 0;
+ stencil_bits[0] = 0;
+ num_ds_modes = 1;
+  }
 
   if (formats[i] == MESA_FORMAT_RGB565) {
- depth_bits[1] = 16;
- stencil_bits[1] = 0;
+ depth_bits[num_ds_modes] = 16;
+ stencil_bits[num_ds_modes] = 0;
   } else {
- depth_bits[1] = 24;
- stencil_bits[1] = 8;
+ depth_bits[num_ds_modes] = 24;
+ stencil_bits[num_ds_modes] = 8;
   }
+  num_ds_modes++;
 
   if (screen->gen >= 7)
  num_msaa_modes = 2;
@@ -1181,7 +1203,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
   new_configs = driCreateConfigs(formats[i],
  depth_bits,
  stencil_bits,
- num_depth_stencil_bits,
+ num_ds_modes,
  back_buffer_modes, 1,
  multisample_samples,
  num_msaa_modes,
-- 
1.7.3.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] docs: add a new page documenting known application issues

2013-03-27 Thread Brian Paul
Let's try to update this when we find other broken applications...
---
 docs/application-issues.html |   83 ++
 docs/contents.html   |1 +
 2 files changed, 84 insertions(+), 0 deletions(-)
 create mode 100644 docs/application-issues.html

diff --git a/docs/application-issues.html b/docs/application-issues.html
new file mode 100644
index 000..6db0865
--- /dev/null
+++ b/docs/application-issues.html
@@ -0,0 +1,83 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
"http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Application Issues</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="contents.html"></iframe>
+<div class="content">
+
+<h1>Application Issues</h1>
+
+<p>
+This page documents known issues with some OpenGL applications.
+</p>
+
+
+<h2>Topogun</h2>
+
+<p>
+<a href="http://www.topogun.com/">Topogun</a> for Linux (version 2, at least)
+creates a GLX visual without requesting a depth buffer.
+This causes bad rendering if the OpenGL driver happens to choose a visual
+without a depth buffer.
+</p>
+
+<p>
+Mesa 9.1.2 and later (will) support a DRI configuration option to work around
+this issue.
+Using the <a href="http://dri.freedesktop.org/wiki/DriConf">driconf</a> tool,
+set the "Create all visuals with a depth buffer" option before running Topogun.
+Then, all GLX visuals will be created with a depth buffer.
+</p>
+
+
+<h2>Old OpenGL games</h2>
+
+<p>
+Some old OpenGL games (approx. ten years or older) may crash during
+start-up because of an extension string buffer-overflow problem.
+</p>
+
+<p>
+The problem is a modern OpenGL driver will return a very long string
+for the glGetString(GL_EXTENSIONS) query and if the application
+naively copies the string into a fixed-size buffer it can overflow the
+buffer and crash the application.
+</p>
+
+<p>
+The work-around is to set the MESA_EXTENSION_MAX_YEAR environment variable
+to the approximate release year of the game.
+This will cause the glGetString(GL_EXTENSIONS) query to only report extensions
+older than the given year.
+</p>
+
+<p>
+For example, if the game was released in 2001, do
+<pre>
+export MESA_EXTENSION_MAX_YEAR=2001
+</pre>
+before running the game.
+</p>
+
+
+
+<h2>Viewperf</h2>
+
+<p>
+See the <a href="viewperf.html">Viewperf issues</a> page for a detailed list
+of Viewperf issues.
+</p>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/contents.html b/docs/contents.html
index 5752299..50c0d59 100644
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -71,6 +71,7 @@
 <li><a href="llvmpipe.html" target="_parent">Gallium llvmpipe driver</a>
 <li><a href="vmware-guest.html" target="_parent">VMware SVGA3D guest driver</a>
 <li><a href="postprocess.html" target="_parent">Gallium post-processing</a>
+<li><a href="application-issues.html" target="_parent">Application Issues</a>
 <li><a href="viewperf.html" target="_parent">Viewperf Issues</a>
 </ul>
 
-- 
1.7.3.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 0/6] Gallivm GS and related cleanups

2013-03-27 Thread Zack Rusin
 Wow, that's a lot of code to digest, but it looks good AFAICT (without
 spending hours on it).  I just found some minor nits.
 
 Nice work!
 
 Reviewed-by: Brian Paul bri...@vmware.com

Thanks Brian! I'm sorry about the large patch, it was one of those just sit
down and do it types of coding sessions and by the time I was done with it
the diff got rather large. I've split the breakc/end prim (to answer your
question - they weren't related at all, they were just both visible in the 
same application so I just fixed both in one shot), added some comments and
fixed the things you've found. Thanks a lot for the review!

z
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: cleanup the gs interface

2013-03-27 Thread Zack Rusin
Instead of void pointers use a base interface.

Signed-off-by: Zack Rusin za...@vmware.com
---
 src/gallium/auxiliary/draw/draw_llvm.c  |   77 ---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h |   25 
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |   31 -
 3 files changed, 83 insertions(+), 50 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 3ce48d8..efbcb04 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -64,6 +64,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *var,
boolean elts);
 
 
+struct draw_gs_llvm_iface {
+   struct lp_build_tgsi_gs_iface base;
+
+   struct draw_gs_llvm_variant *variant;
+   LLVMValueRef input;
+};
+
 /**
  * Create LLVM type for struct draw_jit_texture
  */
@@ -1237,14 +1244,39 @@ clipmask_booli32(struct gallivm_state *gallivm,
return ret;
 }
 
+static LLVMValueRef
+draw_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_iface,
+ struct lp_build_tgsi_context * bld_base,
+ LLVMValueRef vertex_index,
+ LLVMValueRef attrib_index,
+ LLVMValueRef swizzle_index)
+{
+   const struct draw_gs_llvm_iface *gs =
+  (const struct draw_gs_llvm_iface *)gs_iface;
+   struct gallivm_state *gallivm = bld_base-base.gallivm;
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMValueRef indices[3];
+   LLVMValueRef res;
+
+   indices[0] = vertex_index;
+   indices[1] = attrib_index;
+   indices[2] = swizzle_index;
+   
+   res = LLVMBuildGEP(builder, gs-input, indices, 3, );
+   res = LLVMBuildLoad(builder, res, );
+
+   return res;
+}
+
 static void
-draw_gs_llvm_emit_vertex(struct lp_build_tgsi_context * bld_base,
+draw_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
+ struct lp_build_tgsi_context * bld_base,
  LLVMValueRef (*outputs)[4],
- LLVMValueRef emitted_vertices_vec,
- void *user_data)
+ LLVMValueRef emitted_vertices_vec)
 {
-   struct draw_gs_llvm_variant *variant =
-  (struct draw_gs_llvm_variant *)user_data;
+   const struct draw_gs_llvm_iface *gs_iface =
+  (const struct draw_gs_llvm_iface *)gs_base;
+   struct draw_gs_llvm_variant *variant = gs_iface-variant;
struct gallivm_state *gallivm = variant-gallivm;
LLVMBuilderRef builder = gallivm-builder;
struct lp_type gs_type = bld_base-base.type;
@@ -1272,13 +1304,14 @@ draw_gs_llvm_emit_vertex(struct lp_build_tgsi_context * 
bld_base,
 }
 
 static void
-draw_gs_llvm_end_primitive(struct lp_build_tgsi_context * bld_base,
+draw_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_base,
+   struct lp_build_tgsi_context * bld_base,
LLVMValueRef verts_per_prim_vec,
-   LLVMValueRef emitted_prims_vec,
-   void *user_data)
+   LLVMValueRef emitted_prims_vec)
 {
-   struct draw_gs_llvm_variant *variant =
-  (struct draw_gs_llvm_variant *)user_data;
+   const struct draw_gs_llvm_iface *gs_iface =
+  (const struct draw_gs_llvm_iface *)gs_base;
+   struct draw_gs_llvm_variant *variant = gs_iface-variant;
struct gallivm_state *gallivm = variant-gallivm;
LLVMBuilderRef builder = gallivm-builder;
LLVMValueRef prim_lengts_ptr =
@@ -1301,13 +1334,14 @@ draw_gs_llvm_end_primitive(struct lp_build_tgsi_context 
* bld_base,
 }
 
 static void
-draw_gs_llvm_epilogue(struct lp_build_tgsi_context * bld_base,
+draw_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base,
+  struct lp_build_tgsi_context * bld_base,
   LLVMValueRef total_emitted_vertices_vec,
-  LLVMValueRef emitted_prims_vec,
-  void *user_data)
+  LLVMValueRef emitted_prims_vec)
 {
-   struct draw_gs_llvm_variant *variant =
-  (struct draw_gs_llvm_variant *)user_data;
+   const struct draw_gs_llvm_iface *gs_iface =
+  (const struct draw_gs_llvm_iface *)gs_base;
+   struct draw_gs_llvm_variant *variant = gs_iface-variant;
struct gallivm_state *gallivm = variant-gallivm;
LLVMBuilderRef builder = gallivm-builder;
LLVMValueRef emitted_verts_ptr =
@@ -1867,7 +1901,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
struct lp_bld_tgsi_system_values system_values;
struct lp_type gs_type;
unsigned i;
-   struct lp_build_tgsi_gs_iface gs_iface;
+   struct draw_gs_llvm_iface gs_iface;
const struct tgsi_token *tokens = variant-shader-base.state.tokens;
LLVMValueRef consts_ptr;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
@@ -1912,11 +1946,12 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
variant-io_ptr = 

Re: [Mesa-dev] [PATCH 1/2] R600: Emit native instructions for tex

2013-03-27 Thread Tom Stellard
On Thu, Mar 28, 2013 at 12:40:18AM +0100, Vincent Lejeune wrote:
 ---

Just a few style issues; with those changes, this patch is:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 169 
 +
  lib/Target/R600/R600Instructions.td| 156 +++
  2 files changed, 196 insertions(+), 129 deletions(-)
 
 diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
 b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 index d207160..00ebb44 100644
 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 @@ -66,8 +66,6 @@ private:
void EmitSrcISA(const MCInst MI, unsigned RegOpIdx, unsigned SelOpIdx,
  raw_ostream OS) const;
void EmitDst(const MCInst MI, raw_ostream OS) const;
 -  void EmitTexInstr(const MCInst MI, SmallVectorImplMCFixup Fixups,
 -raw_ostream OS) const;
void EmitFCInstr(const MCInst MI, raw_ostream OS) const;
  
void EmitNullBytes(unsigned int byteCount, raw_ostream OS) const;
 @@ -140,9 +138,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
 MCInstrInfo MCII,
  
  void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS,
 SmallVectorImplMCFixup Fixups) 
 const {
 -  if (isTexOp(MI.getOpcode())) {
 -EmitTexInstr(MI, Fixups, OS);
 -  } else if (isFCOp(MI.getOpcode())){
 +  if (isFCOp(MI.getOpcode())){
  EmitFCInstr(MI, OS);
} else if (MI.getOpcode() == AMDGPU::RETURN ||
  MI.getOpcode() == AMDGPU::BUNDLE ||
 @@ -175,6 +171,76 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
 MI, raw_ostream OS,
Emit(InstWord2, OS);
break;
  }
 +case AMDGPU::TEX_LD:
 +case AMDGPU::TEX_GET_TEXTURE_RESINFO:
 +case AMDGPU::TEX_SAMPLE:
 +case AMDGPU::TEX_SAMPLE_C:
 +case AMDGPU::TEX_SAMPLE_L:
 +case AMDGPU::TEX_SAMPLE_C_L:
 +case AMDGPU::TEX_SAMPLE_LB:
 +case AMDGPU::TEX_SAMPLE_C_LB:
 +case AMDGPU::TEX_SAMPLE_G:
 +case AMDGPU::TEX_SAMPLE_C_G:
 +case AMDGPU::TEX_GET_GRADIENTS_H:
 +case AMDGPU::TEX_GET_GRADIENTS_V:
 +case AMDGPU::TEX_SET_GRADIENTS_H:
 +case AMDGPU::TEX_SET_GRADIENTS_V: {
 +  unsigned Opcode = MI.getOpcode();
 +  bool hasOffsets = (Opcode == AMDGPU::TEX_LD);

Might as well fix the coding style errors while you are moving the code
around: hasOffsets => HasOffsets

 +  unsigned OpOffset = hasOffsets ? 3 : 0;
 +  int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
 +  int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
 +
 +  uint32_t srcSelect[4] = {0, 1, 2, 3};

srcSelect => SrcSelect

 +  uint32_t Offsets[3] = {0 , 0, 0};

Extra space before the first comma.

 +  uint64_t coordType[4] = {1, 1, 1, 1};
 +

coordType => CoordType

 +  if (hasOffsets)
 +for (unsigned i = 0; i  3; i++)
 +  Offsets[i] = MI.getOperand(i + 2).getImm();
 +
 +  if (TextureType == TEXTURE_RECT
 +|| TextureType == TEXTURE_SHADOWRECT) {
 +coordType[ELEMENT_X] = 0;
 +coordType[ELEMENT_Y] = 0;
 +  }
 +
 +  if (TextureType == TEXTURE_1D_ARRAY
 +  || TextureType == TEXTURE_SHADOW1D_ARRAY) {

According to LLVM style, the || needs to go on the previous line.

 +if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == 
 AMDGPU::TEX_SAMPLE_C_LB) {
 +  coordType[ELEMENT_Y] = 0;
 +} else {
 +  coordType[ELEMENT_Z] = 0;
 +  srcSelect[ELEMENT_Z] = ELEMENT_Y;
 +}
 +  } else if (TextureType == TEXTURE_2D_ARRAY
 + || TextureType == TEXTURE_SHADOW2D_ARRAY) {

Same here, || on previous line


 +coordType[ELEMENT_Z] = 0;
 +  }
 +
 +
 +  if ((TextureType == TEXTURE_SHADOW1D
 +  || TextureType == TEXTURE_SHADOW2D
 +  || TextureType == TEXTURE_SHADOWRECT
 +  || TextureType == TEXTURE_SHADOW1D_ARRAY)
 +   Opcode != AMDGPU::TEX_SAMPLE_C_L
 +   Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
 +srcSelect[ELEMENT_W] = ELEMENT_Z;

Same here too, || and && go on the previous line.

 +  }
 +
 +  uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
 +  coordType[ELEMENT_X]  60 | coordType[ELEMENT_Y]  61 |
 +  coordType[ELEMENT_Z]  62 | coordType[ELEMENT_W]  63;
 +  uint32_t Word2 = Sampler  15 | srcSelect[ELEMENT_X]  20 |
 +  srcSelect[ELEMENT_Y]  23 | srcSelect[ELEMENT_Z]  26 |
 +  srcSelect[ELEMENT_W]  29 | Offsets[0]  0 | Offsets[1]  5 |
 +  Offsets[2]  10;
 +
 +  EmitByte(INSTR_TEX, OS);
 +  Emit(Word01, OS);
 +  Emit(Word2, OS);
 +  break;
 +}
  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz:
  case AMDGPU::EG_ExportBuf:
 @@ -334,99 +400,6 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst MI, 
 unsigned RegOpIdx,
Emit(InlineConstant.i, OS);
  }
  
 -void 

Re: [Mesa-dev] [PATCH 2/2] R600: Emit CF_ALU and use true kcache register.

2013-03-27 Thread Tom Stellard
On Thu, Mar 28, 2013 at 12:40:19AM +0100, Vincent Lejeune wrote:
 ---

Thanks for working on this, it is a very nice improvement.  See my
comments inline.

  lib/Target/R600/AMDGPU.h   |   1 +
  lib/Target/R600/AMDGPUTargetMachine.cpp|   1 +
  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  10 +-
  lib/Target/R600/R600EmitClauseMarkers.cpp  | 243 
 +
  lib/Target/R600/R600Instructions.td|  83 ++-
  lib/Target/R600/R600RegisterInfo.td|  63 ++
  6 files changed, 389 insertions(+), 12 deletions(-)
  create mode 100644 lib/Target/R600/R600EmitClauseMarkers.cpp
 
 diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
 index e099a9f..3cd792a 100644
 --- a/lib/Target/R600/AMDGPU.h
 +++ b/lib/Target/R600/AMDGPU.h
 @@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
  // R600 Passes
  FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
  FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm);
 +FunctionPass *createR600EmitClauseMarkers(TargetMachine tm);
  
  // SI Passes
  FunctionPass *createSIAnnotateControlFlowPass();
 diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp 
 b/lib/Target/R600/AMDGPUTargetMachine.cpp
 index 0185747..45b1be0 100644
 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
 +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
 @@ -151,6 +151,7 @@ bool AMDGPUPassConfig::addPreEmitPass() {
if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) {
  addPass(createAMDGPUCFGPreparationPass(*TM));
  addPass(createAMDGPUCFGStructurizerPass(*TM));
 +addPass(createR600EmitClauseMarkers(*TM));
  addPass(createR600ExpandSpecialInstrsPass(*TM));
  addPass(FinalizeMachineBundlesID);
} else {
 diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
 b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 index 00ebb44..cf43f3f 100644
 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
 @@ -101,7 +101,8 @@ enum InstrTypes {
INSTR_FC,
INSTR_NATIVE,
INSTR_VTX,
 -  INSTR_EXPORT
 +  INSTR_EXPORT,
 +  INSTR_CFALU
  };
  
  enum FCInstr {
 @@ -250,6 +251,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
 MI, raw_ostream OS,
Emit(Inst, OS);
break;
  }
 +case AMDGPU::CF_ALU:
 +case AMDGPU::CF_ALU_PUSH_BEFORE: {
 +  uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
 +  EmitByte(INSTR_CFALU, OS);
 +  Emit(Inst, OS);
 +  break;
 +}
  
  default:
EmitALUInstr(MI, Fixups, OS);
 diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp 
 b/lib/Target/R600/R600EmitClauseMarkers.cpp
 new file mode 100644
 index 000..b869c88
 --- /dev/null
 +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
 @@ -0,0 +1,243 @@
 +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU 
 ---===//
 +//
 +// The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===--===//
 +//
 +/// \file
 +/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold
 +/// 128 Alu instructions ; these instructions can access up to 4 prefetched
 +/// 4 lines of 16 registers from constant buffers. Such ALU clauses are
 +/// initiated by CF_ALU instructions.
 +//===--===//
 +
 +#include AMDGPU.h
 +#include R600Defines.h
 +#include R600InstrInfo.h
 +#include R600MachineFunctionInfo.h
 +#include R600RegisterInfo.h
 +#include llvm/CodeGen/MachineFunctionPass.h
 +#include llvm/CodeGen/MachineInstrBuilder.h
 +#include llvm/CodeGen/MachineRegisterInfo.h
 +
 +namespace llvm {
 +
 +class R600EmitClauseMarkersPass : public MachineFunctionPass {
 +
 +private:
 +  static char ID;
 +  const R600InstrInfo *TII;
 +
 +  unsigned OccupiedDwords(MachineInstr *MI) const {
 +switch (MI-getOpcode()) {
 +case AMDGPU::INTERP_PAIR_XY:
 +case AMDGPU::INTERP_PAIR_ZW:
 +case AMDGPU::INTERP_VEC_LOAD:
 +case AMDGPU::DOT4_eg_pseudo:
 +case AMDGPU::DOT4_r600_pseudo:
 +  return 4;
 +case AMDGPU::KILL:
 +  return 0;
 +default:
 +  break;
 +}
 +
 +if(TII-isVector(*MI) ||
 +TII-isCubeOp(MI-getOpcode()) ||
 +TII-isReductionOp(MI-getOpcode()))
 +  return 4;
 +
 +unsigned NumLiteral = 0;
 +for (MachineInstr::mop_iterator It = MI-operands_begin(),
 +E = MI-operands_end(); It != E; ++It) {
 +  MachineOperand MO = *It;
 +  if (MO.isReg()  MO.getReg() == AMDGPU::ALU_LITERAL_X)
 +++NumLiteral;
 +}
 +return 1 + NumLiteral;
 +  }
 +
 +  bool isALU(const MachineInstr *MI) const {
 +if (TII-isALUInstr(MI-getOpcode()))
 +  return true;
 +if (TII-isVector(*MI) || 

Re: [Mesa-dev] [PATCH 1/2] r600g/llvm: use native encode for tex

2013-03-27 Thread Tom Stellard
For the series:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

When you commit these can you update drivers/radeon/LLVM_REVISION.txt
with the revision number of the llvm commits these patches depend on?

Thanks,
Tom

On Thu, Mar 28, 2013 at 12:40:52AM +0100, Vincent Lejeune wrote:
 ---
  src/gallium/drivers/r600/r600_shader.c | 50 
 ++
  1 file changed, 27 insertions(+), 23 deletions(-)
 
 diff --git a/src/gallium/drivers/r600/r600_shader.c 
 b/src/gallium/drivers/r600/r600_shader.c
 index 29facf7..1e21559 100644
 --- a/src/gallium/drivers/r600/r600_shader.c
 +++ b/src/gallium/drivers/r600/r600_shader.c
 @@ -489,29 +489,33 @@ static unsigned r600_tex_from_byte_stream(struct 
 r600_shader_ctx *ctx,
  {
   struct r600_bytecode_tex tex;
  
 - tex.op = r600_isa_fetch_by_opcode(ctx-bc-isa, bytes[bytes_read++]);
 - tex.resource_id = bytes[bytes_read++];
 - tex.src_gpr = bytes[bytes_read++];
 - tex.src_rel = bytes[bytes_read++];
 - tex.dst_gpr = bytes[bytes_read++];
 - tex.dst_rel = bytes[bytes_read++];
 - tex.dst_sel_x = bytes[bytes_read++];
 - tex.dst_sel_y = bytes[bytes_read++];
 - tex.dst_sel_z = bytes[bytes_read++];
 - tex.dst_sel_w = bytes[bytes_read++];
 - tex.lod_bias = bytes[bytes_read++];
 - tex.coord_type_x = bytes[bytes_read++];
 - tex.coord_type_y = bytes[bytes_read++];
 - tex.coord_type_z = bytes[bytes_read++];
 - tex.coord_type_w = bytes[bytes_read++];
 - tex.offset_x = bytes[bytes_read++];
 - tex.offset_y = bytes[bytes_read++];
 - tex.offset_z = bytes[bytes_read++];
 - tex.sampler_id = bytes[bytes_read++];
 - tex.src_sel_x = bytes[bytes_read++];
 - tex.src_sel_y = bytes[bytes_read++];
 - tex.src_sel_z = bytes[bytes_read++];
 - tex.src_sel_w = bytes[bytes_read++];
 + uint32_t word0 = i32_from_byte_stream(bytes, bytes_read);
 + uint32_t word1 = i32_from_byte_stream(bytes, bytes_read);
 + uint32_t word2 = i32_from_byte_stream(bytes, bytes_read);
 +
 + tex.op = r600_isa_fetch_by_opcode(ctx-bc-isa, 
 G_SQ_TEX_WORD0_TEX_INST(word0));
 + tex.resource_id = G_SQ_TEX_WORD0_RESOURCE_ID(word0);
 + tex.src_gpr = G_SQ_TEX_WORD0_SRC_GPR(word0);
 + tex.src_rel = G_SQ_TEX_WORD0_SRC_REL(word0);
 + tex.dst_gpr = G_SQ_TEX_WORD1_DST_GPR(word1);
 + tex.dst_rel = G_SQ_TEX_WORD1_DST_REL(word1);
 + tex.dst_sel_x = G_SQ_TEX_WORD1_DST_SEL_X(word1);
 + tex.dst_sel_y = G_SQ_TEX_WORD1_DST_SEL_Y(word1);
 + tex.dst_sel_z = G_SQ_TEX_WORD1_DST_SEL_Z(word1);
 + tex.dst_sel_w = G_SQ_TEX_WORD1_DST_SEL_W(word1);
 + tex.lod_bias = G_SQ_TEX_WORD1_LOD_BIAS(word1);
 + tex.coord_type_x = G_SQ_TEX_WORD1_COORD_TYPE_X(word1);
 + tex.coord_type_y = G_SQ_TEX_WORD1_COORD_TYPE_Y(word1);
 + tex.coord_type_z = G_SQ_TEX_WORD1_COORD_TYPE_Z(word1);
 + tex.coord_type_w = G_SQ_TEX_WORD1_COORD_TYPE_W(word1);
 + tex.offset_x = G_SQ_TEX_WORD2_OFFSET_X(word2);
 + tex.offset_y = G_SQ_TEX_WORD2_OFFSET_Y(word2);
 + tex.offset_z = G_SQ_TEX_WORD2_OFFSET_Z(word2);
 + tex.sampler_id = G_SQ_TEX_WORD2_SAMPLER_ID(word2);
 + tex.src_sel_x = G_SQ_TEX_WORD2_SRC_SEL_X(word2);
 + tex.src_sel_y = G_SQ_TEX_WORD2_SRC_SEL_Y(word2);
 + tex.src_sel_z = G_SQ_TEX_WORD2_SRC_SEL_Z(word2);
 + tex.src_sel_w = G_SQ_TEX_WORD2_SRC_SEL_W(word2);
  
   tex.inst_mod = 0;
  
 -- 
 1.8.1.4
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev