Re: [Mesa-dev] glxgears is faster but 3D render is so slow

2013-03-21 Thread jupiter
Hi Brian,

On 3/21/13, Brian Paul bri...@vmware.com wrote:
 On 03/20/2013 04:07 AM, jupiter wrote:
 Hi Brian,

 On 3/19/13, Brian Paulbri...@vmware.com  wrote:
 It is fair to say, if running llvm driver in my local machine (a
 32-bit CentOS 6.2 without VNC connection), it was indeed faster than
 the xlib driver.

 Seems to me that the llvm driver broken the xlib VNC connection which
 could be caused by either I haven't configure the llvm correctly, or
 mesa llvm compile process may have bugs.

 I don't understand what you mean by llvm driver broken the xlib VNC
 connection.

 I have tested llvm driver in two platforms:

 (1) A local computer running on CentOS 6.2 which does not have
 hardware acceleration, but I can directly access it. The llvm driver
 is indeed much faster than the swrast, I could run an  application
 with 3D structure rotation.

 (2) A virtual machine running CentOS 6.2, which I have to access via
 VNC. I was not able to run the 3D application: the graphics were jerky
 and unresponsive. If I switched to swrast, the 3D application
 graphics ran much more smoothly and response was normal, but the 3D
 rotation stopped because swrast was too slow to rotate the 3D
 structure.

 That is what I meant by the llvm driver breaking the xlib VNC connection.
 Have you tested the llvm driver over a VNC connection?

 No, I haven't.  I'm really not sure what's happening in this
 situation.  My only totally wild guess is there's competition between
 the VNC server and Mesa for CPU time.  The llvmpipe driver is threaded
 and creates as many threads as there are CPU cores.  You can set the
 LP_NUM_THREADS to tell llvmpipe how many threads to use (0 for no
 threading).  How many CPU cores do you have?

The virtual machine I tested has only one CPU, but we can give it more
CPUs if it helps. I'll try setting LP_NUM_THREADS tomorrow, but I
don't expect that it caused the problem. One thing I have to point out is
that xlib swrast runs very well over a VNC connection, although it is
too slow to do 3D structure rotation. Maybe you can look at the
difference between the xlib LLVM driver and the xlib swrast driver.

I'll be happy to help test or debug the llvm driver over a VNC
connection if you are going to resolve the issues seriously and if you
can tell me the procedure and the data you need collected.


 (2) Compile llvm driver

 LLVM=/usr/local/libllvm/3.2

 ${SOURCE}/${CONFIGURE} --prefix=${INSTALL} --enable-xlib-glx
 --disable-dri --enable-gallium-llvm --with-gallium-drivers=swrast
 --with-llvm-shared-libs=${LLVM}/lib --with-llvm-prefix=${LLVM}

 Manually change libGL.so and libGL.so.1 to link
 lib/gallium/libGL.so.1.5.0.

 Looks OK to me.

 One more question, how can I build llvm without manually changing the
 libGL.so link? Was I missing something in my compilation? Or is there
 any issue in mesa build and installation process?

 I think that's a deficiency in our configure/install system.  I
 haven't looked into it though.

Good to know.

Thanks Brian,

Kind regards.

Jupiter
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] radeonsi: mark all loads as constant

2013-03-21 Thread Christian König
From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   32 ++--
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index f05f41e..e78cc85 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -59,6 +59,7 @@ struct si_shader_context
struct si_pipe_shader *shader;
struct si_shader_key key;
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
+   LLVMValueRef const_md;
 /* struct list_head inputs; */
 /* unsigned * input_mappings *//* From TGSI to SI hw */
 /* struct tgsi_shader_info info;*/
@@ -95,14 +96,18 @@ static struct si_shader_context * si_shader_context(
  *
  */
 static LLVMValueRef build_indexed_load(
-   struct gallivm_state * gallivm,
+   struct si_shader_context * si_shader_ctx,
LLVMValueRef base_ptr,
LLVMValueRef offset)
 {
+   struct lp_build_context * base = 
si_shader_ctx-radeon_bld.soa.bld_base.base;
+
LLVMValueRef computed_ptr = LLVMBuildGEP(
-   gallivm-builder, base_ptr, offset, 1, );
+   base-gallivm-builder, base_ptr, offset, 1, );
 
-   return LLVMBuildLoad(gallivm-builder, computed_ptr, );
+   LLVMValueRef result = LLVMBuildLoad(base-gallivm-builder, 
computed_ptr, );
+   LLVMSetMetadata(result, 1, si_shader_ctx-const_md);
+   return result;
 }
 
 static void declare_input_vs(
@@ -127,7 +132,7 @@ static void declare_input_vs(
 
t_offset = lp_build_const_int32(base-gallivm, input_index);
 
-   t_list = build_indexed_load(base-gallivm, t_list_ptr, t_offset);
+   t_list = build_indexed_load(si_shader_ctx, t_list_ptr, t_offset);
 
/* Build the attribute offset */
attribute_offset = lp_build_const_int32(base-gallivm, 0);
@@ -362,7 +367,7 @@ static LLVMValueRef fetch_constant(
 
/* Load the resource descriptor */
ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_CONST);
-   args[0] = build_indexed_load(base-gallivm, ptr, 
bld_base-uint_bld.zero);
+   args[0] = build_indexed_load(si_shader_ctx, ptr, 
bld_base-uint_bld.zero);
 
args[1] = lp_build_const_int32(base-gallivm, (reg-Register.Index * 4 
+ swizzle) * 4);
if (reg-Register.Indirect) {
@@ -856,14 +861,14 @@ static void tex_fetch_args(
ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_RESOURCE);
offset = lp_build_const_int32(bld_base-base.gallivm,
  emit_data-inst-Src[1].Register.Index);
-   emit_data-args[2] = build_indexed_load(bld_base-base.gallivm,
+   emit_data-args[2] = build_indexed_load(si_shader_ctx,
ptr, offset);
 
/* Sampler */
ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_SAMPLER);
offset = lp_build_const_int32(bld_base-base.gallivm,
  emit_data-inst-Src[1].Register.Index);
-   emit_data-args[3] = build_indexed_load(bld_base-base.gallivm,
+   emit_data-args[3] = build_indexed_load(si_shader_ctx,
ptr, offset);
 
/* Dimensions */
@@ -910,6 +915,18 @@ static const struct lp_build_tgsi_action txl_action = {
.intr_name = llvm.SI.samplel.
 };
 
+static void create_meta_data(struct si_shader_context *si_shader_ctx)
+{
+   struct gallivm_state *gallivm = 
si_shader_ctx-radeon_bld.soa.bld_base.base.gallivm;
+   LLVMValueRef args[3];
+
+   args[0] = LLVMMDStringInContext(gallivm-context, const, 5);
+   args[1] = 0;
+   args[2] = lp_build_const_int32(gallivm, 1);
+
+   si_shader_ctx-const_md = LLVMMDNodeInContext(gallivm-context, args, 
3);
+}
+
 static void create_function(struct si_shader_context *si_shader_ctx)
 {
struct gallivm_state *gallivm = 
si_shader_ctx-radeon_bld.soa.bld_base.base.gallivm;
@@ -1005,6 +1022,7 @@ int si_pipe_shader_create(
si_shader_ctx.type = si_shader_ctx.parse.FullHeader.Processor.Processor;
si_shader_ctx.rctx = rctx;
 
+   create_meta_data(si_shader_ctx);
create_function(si_shader_ctx);
 
shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/5] radeonsi: remove wqm intrinsic

2013-03-21 Thread Christian König
From: Christian König christian.koe...@amd.com

Now the backend handles that itself.

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |9 -
 1 file changed, 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 110bfb0..f05f41e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -59,7 +59,6 @@ struct si_shader_context
struct si_pipe_shader *shader;
struct si_shader_key key;
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
-   unsigned ninput_emitted;
 /* struct list_head inputs; */
 /* unsigned * input_mappings *//* From TGSI to SI hw */
 /* struct tgsi_shader_info info;*/
@@ -261,14 +260,6 @@ static void declare_input_fs(
return;
}
 
-   if (!si_shader_ctx-ninput_emitted++) {
-   /* Enable whole quad mode */
-   lp_build_intrinsic(gallivm-builder,
-  llvm.SI.wqm,
-  LLVMVoidTypeInContext(gallivm-context),
-  NULL, 0);
-   }
-
intr_name = interp_param ? llvm.SI.fs.interp : llvm.SI.fs.constant;
 
/* XXX: Could there be more than TGSI_NUM_CHANNELS (4) ? */
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/5] radeonsi: mark most intrinsics as readnone/nounwind

2013-03-21 Thread Christian König
From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index e78cc85..062e833 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -145,8 +145,9 @@ static void declare_input_vs(
args[0] = t_list;
args[1] = attribute_offset;
args[2] = buffer_index_reg;
-   input = lp_build_intrinsic(base-gallivm-builder,
-   llvm.SI.vs.load.input, vec4_type, args, 3);
+   input = build_intrinsic(base-gallivm-builder,
+   llvm.SI.vs.load.input, vec4_type, args, 3,
+   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
/* Break up the vec4 into individual components */
for (chan = 0; chan  4; chan++) {
@@ -294,12 +295,12 @@ static void declare_input_fs(
args[1] = attr_number;
front = build_intrinsic(base-gallivm-builder, 
intr_name,
input_type, args, args[3] ? 4 : 
3,
-   LLVMReadOnlyAttribute | 
LLVMNoUnwindAttribute);
+   LLVMReadNoneAttribute | 
LLVMNoUnwindAttribute);
 
args[1] = back_attr_number;
back = build_intrinsic(base-gallivm-builder, 
intr_name,
   input_type, args, args[3] ? 4 : 
3,
-  LLVMReadOnlyAttribute | 
LLVMNoUnwindAttribute);
+  LLVMReadNoneAttribute | 
LLVMNoUnwindAttribute);
 
si_shader_ctx-radeon_bld.inputs[soa_index] =
LLVMBuildSelect(gallivm-builder,
@@ -322,7 +323,7 @@ static void declare_input_fs(
si_shader_ctx-radeon_bld.inputs[soa_index] =
build_intrinsic(base-gallivm-builder, 
intr_name,
input_type, args, args[3] ? 4 : 
3,
-   LLVMReadOnlyAttribute | 
LLVMNoUnwindAttribute);
+   LLVMReadNoneAttribute | 
LLVMNoUnwindAttribute);
}
}
 }
@@ -379,7 +380,7 @@ static LLVMValueRef fetch_constant(
}
 
result = build_intrinsic(base-gallivm-builder, llvm.SI.load.const, 
base-elem_type,
- args, 2, LLVMReadOnlyAttribute | 
LLVMNoUnwindAttribute);
+ args, 2, LLVMReadNoneAttribute | 
LLVMNoUnwindAttribute);
 
return bitcast(bld_base, type, result);
 }
@@ -892,9 +893,10 @@ static void build_tex_intrinsic(const struct 
lp_build_tgsi_action * action,
sprintf(intr_name, %sv%ui32, action-intr_name,
LLVMGetVectorSize(LLVMTypeOf(emit_data-args[1])));
 
-   emit_data-output[emit_data-chan] = lp_build_intrinsic(
+   emit_data-output[emit_data-chan] = build_intrinsic(
base-gallivm-builder, intr_name, emit_data-dst_type,
-   emit_data-args, emit_data-arg_count);
+   emit_data-args, emit_data-arg_count,
+   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 }
 
 static const struct lp_build_tgsi_action tex_action = {
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/5] radeonsi: add preloading of all constants

2013-03-21 Thread Christian König
From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   67 ++--
 1 file changed, 51 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 062e833..33f79e7 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -36,6 +36,7 @@
 #include gallivm/lp_bld_arit.h
 #include radeon_llvm.h
 #include radeon_llvm_emit.h
+#include util/u_memory.h
 #include tgsi/tgsi_info.h
 #include tgsi/tgsi_parse.h
 #include tgsi/tgsi_scan.h
@@ -60,9 +61,8 @@ struct si_shader_context
struct si_shader_key key;
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
LLVMValueRef const_md;
-/* struct list_head inputs; */
-/* unsigned * input_mappings *//* From TGSI to SI hw */
-/* struct tgsi_shader_info info;*/
+   LLVMValueRef const_resource;
+   LLVMValueRef *constants;
 };
 
 static struct si_shader_context * si_shader_context(
@@ -352,9 +352,11 @@ static LLVMValueRef fetch_constant(
 {
struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
struct lp_build_context * base = bld_base-base;
+   const struct tgsi_ind_register *ireg = reg-Indirect;
+   unsigned idx;
 
-   LLVMValueRef ptr;
LLVMValueRef args[2];
+   LLVMValueRef addr;
LLVMValueRef result;
 
if (swizzle == LP_CHAN_ALL) {
@@ -366,18 +368,16 @@ static LLVMValueRef fetch_constant(
return lp_build_gather_values(bld_base-base.gallivm, values, 
4);
}
 
-   /* Load the resource descriptor */
-   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_CONST);
-   args[0] = build_indexed_load(si_shader_ctx, ptr, 
bld_base-uint_bld.zero);
-
-   args[1] = lp_build_const_int32(base-gallivm, (reg-Register.Index * 4 
+ swizzle) * 4);
-   if (reg-Register.Indirect) {
-   const struct tgsi_ind_register *ireg = reg-Indirect;
-   LLVMValueRef addr = 
si_shader_ctx-radeon_bld.soa.addr[ireg-Index][ireg-Swizzle];
-   LLVMValueRef idx = LLVMBuildLoad(base-gallivm-builder, addr, 
load addr reg);
-   idx = lp_build_mul_imm(bld_base-uint_bld, idx, 16);
-   args[1] = lp_build_add(bld_base-uint_bld, idx, args[1]);
-   }
+   idx = reg-Register.Index * 4 + swizzle;
+   if (!reg-Register.Indirect)
+   return bitcast(bld_base, type, si_shader_ctx-constants[idx]);
+
+   args[0] = si_shader_ctx-const_resource;
+   args[1] = lp_build_const_int32(base-gallivm, idx * 4);
+   addr = si_shader_ctx-radeon_bld.soa.addr[ireg-Index][ireg-Swizzle];
+   addr = LLVMBuildLoad(base-gallivm-builder, addr, load addr reg);
+   addr = lp_build_mul_imm(bld_base-uint_bld, addr, 16);
+   args[1] = lp_build_add(bld_base-uint_bld, addr, args[1]);
 
result = build_intrinsic(base-gallivm-builder, llvm.SI.load.const, 
base-elem_type,
  args, 2, LLVMReadNoneAttribute | 
LLVMNoUnwindAttribute);
@@ -978,6 +978,37 @@ static void create_function(struct si_shader_context 
*si_shader_ctx)
}
 }
 
+static void preload_constants(struct si_shader_context *si_shader_ctx)
+{
+   struct lp_build_tgsi_context * bld_base = 
si_shader_ctx-radeon_bld.soa.bld_base;
+   struct gallivm_state * gallivm = bld_base-base.gallivm;
+   const struct tgsi_shader_info * info = bld_base-info;
+
+   unsigned i, num_const = info-file_max[TGSI_FILE_CONSTANT] + 1;
+
+   LLVMValueRef ptr;
+
+   if (num_const == 0)
+   return;
+
+   /* Allocate space for the constant values */
+   si_shader_ctx-constants = CALLOC(num_const * 4, sizeof(LLVMValueRef));
+
+   /* Load the resource descriptor */
+   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_CONST);
+   si_shader_ctx-const_resource = build_indexed_load(si_shader_ctx, ptr, 
bld_base-uint_bld.zero);
+
+   /* Load the constants, we rely on the code sinking to do the rest */
+   for (i = 0; i  num_const * 4; ++i) {
+   LLVMValueRef args[2] = {
+   si_shader_ctx-const_resource,
+   lp_build_const_int32(gallivm, i * 4)
+   };
+   si_shader_ctx-constants[i] = build_intrinsic(gallivm-builder, 
llvm.SI.load.const,
+   bld_base-base.elem_type, args, 2, 
LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+   }
+}
+
 int si_pipe_shader_create(
struct pipe_context *ctx,
struct si_pipe_shader *shader,
@@ -1026,6 +1057,7 @@ int si_pipe_shader_create(
 
create_meta_data(si_shader_ctx);
create_function(si_shader_ctx);
+   preload_constants(si_shader_ctx);
 
shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;
 
@@ 

[Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Christian König
From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
---
 src/gallium/drivers/radeonsi/radeonsi_shader.c |   57 +++-
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 33f79e7..840537a 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -63,6 +63,8 @@ struct si_shader_context
LLVMValueRef const_md;
LLVMValueRef const_resource;
LLVMValueRef *constants;
+   LLVMValueRef *resources;
+   LLVMValueRef *samplers;
 };
 
 static struct si_shader_context * si_shader_context(
@@ -740,8 +742,6 @@ static void tex_fetch_args(
const struct tgsi_full_instruction * inst = emit_data-inst;
unsigned opcode = inst-Instruction.Opcode;
unsigned target = inst-Texture.Texture;
-   LLVMValueRef ptr;
-   LLVMValueRef offset;
LLVMValueRef coords[4];
LLVMValueRef address[16];
unsigned count = 0;
@@ -859,18 +859,10 @@ static void tex_fetch_args(
emit_data-args[1] = lp_build_gather_values(gallivm, address, count);
 
/* Resource */
-   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_RESOURCE);
-   offset = lp_build_const_int32(bld_base-base.gallivm,
- emit_data-inst-Src[1].Register.Index);
-   emit_data-args[2] = build_indexed_load(si_shader_ctx,
-   ptr, offset);
+   emit_data-args[2] = 
si_shader_ctx-resources[emit_data-inst-Src[1].Register.Index];
 
/* Sampler */
-   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_SAMPLER);
-   offset = lp_build_const_int32(bld_base-base.gallivm,
- emit_data-inst-Src[1].Register.Index);
-   emit_data-args[3] = build_indexed_load(si_shader_ctx,
-   ptr, offset);
+   emit_data-args[3] = 
si_shader_ctx-samplers[emit_data-inst-Src[1].Register.Index];
 
/* Dimensions */
emit_data-args[4] = lp_build_const_int32(bld_base-base.gallivm, 
target);
@@ -1009,6 +1001,40 @@ static void preload_constants(struct si_shader_context 
*si_shader_ctx)
}
 }
 
+static void preload_samplers(struct si_shader_context *si_shader_ctx)
+{
+   struct lp_build_tgsi_context * bld_base = 
si_shader_ctx-radeon_bld.soa.bld_base;
+   struct gallivm_state * gallivm = bld_base-base.gallivm;
+   const struct tgsi_shader_info * info = bld_base-info;
+
+   unsigned i, num_samplers = info-file_max[TGSI_FILE_SAMPLER] + 1;
+
+   LLVMValueRef res_ptr, samp_ptr;
+   LLVMValueRef offset;
+
+   if (num_samplers == 0)
+   return;
+
+   /* Allocate space for the values */
+   si_shader_ctx-resources = CALLOC(num_samplers, sizeof(LLVMValueRef));
+   si_shader_ctx-samplers = CALLOC(num_samplers, sizeof(LLVMValueRef));
+
+   res_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_RESOURCE);
+   samp_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_SAMPLER);
+
+   /* Load the resources and samplers, we rely on the code sinking to do 
the rest */
+   for (i = 0; i  num_samplers; ++i) {
+
+   /* Resource */
+   offset = lp_build_const_int32(gallivm, i);
+   si_shader_ctx-resources[i] = build_indexed_load(si_shader_ctx, 
res_ptr, offset);
+
+   /* Sampler */
+   offset = lp_build_const_int32(gallivm, i);
+   si_shader_ctx-samplers[i] = build_indexed_load(si_shader_ctx, 
samp_ptr, offset);
+   }
+}
+
 int si_pipe_shader_create(
struct pipe_context *ctx,
struct si_pipe_shader *shader,
@@ -1058,6 +1084,7 @@ int si_pipe_shader_create(
create_meta_data(si_shader_ctx);
create_function(si_shader_ctx);
preload_constants(si_shader_ctx);
+   preload_samplers(si_shader_ctx);
 
shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;
 
@@ -1070,6 +1097,8 @@ int si_pipe_shader_create(
if (!lp_build_tgsi_llvm(bld_base, sel-tokens)) {
fprintf(stderr, Failed to translate shader from TGSI to 
LLVM\n);
FREE(si_shader_ctx.constants);
+   FREE(si_shader_ctx.resources);
+   FREE(si_shader_ctx.samplers);
return -EINVAL;
}
 
@@ -1102,6 +1131,8 @@ int si_pipe_shader_create(
   inst_byte_count - 12);
if (shader-bo == NULL) {
FREE(si_shader_ctx.constants);
+   FREE(si_shader_ctx.resources);
+   FREE(si_shader_ctx.samplers);
return -ENOMEM;
}
 
@@ -1116,6 +1147,8 @@ int si_pipe_shader_create(
rctx-ws-buffer_unmap(shader-bo-cs_buf);
 
FREE(si_shader_ctx.constants);

Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Michel Dänzer

When I tried this earlier, something broke. Unfortunately, I can't seem
to remember or dig up if it was a piglit test or e.g. one of the sampler
demos in mesa/demos/src/glsl/. Did you test the latter with this change?

Similar concern for patch 4, the rest of the series looks good to me.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Alex Deucher
On Thu, Mar 21, 2013 at 7:38 AM, Christian König
deathsim...@vodafone.de wrote:
 From: Christian König christian.koe...@amd.com

 Signed-off-by: Christian König christian.koe...@amd.com
 ---
  src/gallium/drivers/radeonsi/radeonsi_shader.c |   57 
 +++-
  1 file changed, 45 insertions(+), 12 deletions(-)

 diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
 b/src/gallium/drivers/radeonsi/radeonsi_shader.c
 index 33f79e7..840537a 100644
 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
 +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
 @@ -63,6 +63,8 @@ struct si_shader_context
 LLVMValueRef const_md;
 LLVMValueRef const_resource;
 LLVMValueRef *constants;
 +   LLVMValueRef *resources;
 +   LLVMValueRef *samplers;
  };

  static struct si_shader_context * si_shader_context(
 @@ -740,8 +742,6 @@ static void tex_fetch_args(
 const struct tgsi_full_instruction * inst = emit_data-inst;
 unsigned opcode = inst-Instruction.Opcode;
 unsigned target = inst-Texture.Texture;
 -   LLVMValueRef ptr;
 -   LLVMValueRef offset;
 LLVMValueRef coords[4];
 LLVMValueRef address[16];
 unsigned count = 0;
 @@ -859,18 +859,10 @@ static void tex_fetch_args(
 emit_data-args[1] = lp_build_gather_values(gallivm, address, count);

 /* Resource */
 -   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
 SI_PARAM_RESOURCE);
 -   offset = lp_build_const_int32(bld_base-base.gallivm,
 - emit_data-inst-Src[1].Register.Index);
 -   emit_data-args[2] = build_indexed_load(si_shader_ctx,
 -   ptr, offset);
 +   emit_data-args[2] = 
 si_shader_ctx-resources[emit_data-inst-Src[1].Register.Index];

 /* Sampler */
 -   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
 SI_PARAM_SAMPLER);
 -   offset = lp_build_const_int32(bld_base-base.gallivm,
 - emit_data-inst-Src[1].Register.Index);
 -   emit_data-args[3] = build_indexed_load(si_shader_ctx,
 -   ptr, offset);
 +   emit_data-args[3] = 
 si_shader_ctx-samplers[emit_data-inst-Src[1].Register.Index];

 /* Dimensions */
 emit_data-args[4] = lp_build_const_int32(bld_base-base.gallivm, 
 target);
 @@ -1009,6 +1001,40 @@ static void preload_constants(struct si_shader_context 
 *si_shader_ctx)
 }
  }

 +static void preload_samplers(struct si_shader_context *si_shader_ctx)
 +{
 +   struct lp_build_tgsi_context * bld_base = 
 si_shader_ctx-radeon_bld.soa.bld_base;
 +   struct gallivm_state * gallivm = bld_base-base.gallivm;
 +   const struct tgsi_shader_info * info = bld_base-info;
 +
 +   unsigned i, num_samplers = info-file_max[TGSI_FILE_SAMPLER] + 1;
 +
 +   LLVMValueRef res_ptr, samp_ptr;
 +   LLVMValueRef offset;
 +
 +   if (num_samplers == 0)
 +   return;
 +
 +   /* Allocate space for the values */
 +   si_shader_ctx-resources = CALLOC(num_samplers, sizeof(LLVMValueRef));
 +   si_shader_ctx-samplers = CALLOC(num_samplers, sizeof(LLVMValueRef));
 +
 +   res_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
 SI_PARAM_RESOURCE);
 +   samp_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
 SI_PARAM_SAMPLER);
 +
 +   /* Load the resources and samplers, we rely on the code sinking to do 
 the rest */
 +   for (i = 0; i  num_samplers; ++i) {
 +
 +   /* Resource */
 +   offset = lp_build_const_int32(gallivm, i);
 +   si_shader_ctx-resources[i] = 
 build_indexed_load(si_shader_ctx, res_ptr, offset);
 +
 +   /* Sampler */
 +   offset = lp_build_const_int32(gallivm, i);
 +   si_shader_ctx-samplers[i] = 
 build_indexed_load(si_shader_ctx, samp_ptr, offset);
 +   }
 +}
 +


Is there ever a case when num_samplers != num_resources?

Alex

  int si_pipe_shader_create(
 struct pipe_context *ctx,
 struct si_pipe_shader *shader,
 @@ -1058,6 +1084,7 @@ int si_pipe_shader_create(
 create_meta_data(si_shader_ctx);
 create_function(si_shader_ctx);
 preload_constants(si_shader_ctx);
 +   preload_samplers(si_shader_ctx);

 shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;

 @@ -1070,6 +1097,8 @@ int si_pipe_shader_create(
 if (!lp_build_tgsi_llvm(bld_base, sel-tokens)) {
 fprintf(stderr, Failed to translate shader from TGSI to 
 LLVM\n);
 FREE(si_shader_ctx.constants);
 +   FREE(si_shader_ctx.resources);
 +   FREE(si_shader_ctx.samplers);
 return -EINVAL;
 }

 @@ -1102,6 +1131,8 @@ int si_pipe_shader_create(
inst_byte_count - 12);
 if (shader-bo == NULL) {
 FREE(si_shader_ctx.constants);
 +   

Re: [Mesa-dev] glxgears is faster but 3D render is so slow

2013-03-21 Thread Brian Paul

On 03/21/2013 03:51 AM, jupiter wrote:

Hi Brian,

On 3/21/13, Brian Paulbri...@vmware.com  wrote:

On 03/20/2013 04:07 AM, jupiter wrote:

Hi Brian,

On 3/19/13, Brian Paulbri...@vmware.com   wrote:

It is fair to say, if running llvm driver in my local machine (a
32-bit CentOS 6.2 without VNC connection), it was indeed faster than
the xlib driver.

Seems to me that the llvm driver broken the xlib VNC connection which
could be caused by either I haven't configure the llvm correctly, or
mesa llvm compile process may have bugs.


I don't understand what you mean by llvm driver broken the xlib VNC
connection.


I have tested llvm driver in two platforms:

(1) A local computer running on CentOS 6.2 which does not have
hardware acceleration, but I can directly access it. The llvm driver
is indeed much faster than the swrast, I could run an  application
with 3D structure rotation.

(2) A virtual machine running CentOS 6.2, which I have to access via
VNC. I was not able to run the 3D application: the graphics were jerky
and unresponsive. If I switched to swrast, the 3D application
graphics ran much more smoothly and response was normal, but the 3D
rotation stopped because swrast was too slow to rotate the 3D
structure.

That is what I meant by the llvm driver breaking the xlib VNC connection.
Have you tested the llvm driver over a VNC connection?


No, I haven't.  I'm really not sure what's happening in this
situation.  My only totally wild guess is there's competition between
the VNC server and Mesa for CPU time.  The llvmpipe driver is threaded
and creates as many threads as there are CPU cores.  You can set the
LP_NUM_THREADS to tell llvmpipe how many threads to use (0 for no
threading).  How many CPU cores do you have?


The virtual machine I tested has only one CPU, but we can give it more
CPUs if it helps. I'll try setting LP_NUM_THREADS tomorrow, but I
don't expect that it caused the problem. One thing I have to point out is
that xlib swrast runs very well over a VNC connection, although it is
too slow to do 3D structure rotation. Maybe you can look at the
difference between the xlib LLVM driver and the xlib swrast driver.


The drivers are totally different, but underneath both they render 
into shared X images which are then copied to the on-screen window 
during glXSwapBuffers.  That code is pretty much the same.


I don't know what else would account for the difference you're seeing.



I'll be happy to help test or debug the llvm driver over a VNC
connection if you are going to resolve the issues seriously and if you
can tell me the procedure and the data you need collected.


I'm just way too busy right now to dig into this.  Hopefully you can 
make some progress playing with virtual CPUs and LP_NUM_THREADS.


-Brian
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/4] i965: Create a pointer in brw_context to the geometry output VUE map.

2013-03-21 Thread Eric Anholt
Paul Berry stereotype...@gmail.com writes:

 Currently, the GPU pipeline has one active VUE map in effect at any
 given time--the one representing the layout of vertex data coming from
 the vertex shader.  However, when geometry shaders are added, they
 will have their own independent VUE map.  Later pipeline stages (clip,
 sf, fs) will need to consult the geometry shader VUE map if a geometry
 shader is in use, and the vertex shader VUE map otherwise.

 This patch adds a new field to brw_context, vue_map_geom_out, which
 points to whichever VUE map should be used by later pipeline stages.
 It also adds a new state flag, BRW_NEW_VUE_MAP_GEOM_OUT, which is
 signalled whenever this pointer changes.

 Since we don't support geometry shaders yet, vue_map_geom_out is
 currently set only by the brw_vs_prog state atom.
 ---

 diff --git a/src/mesa/drivers/dri/i965/brw_vs.c 
 b/src/mesa/drivers/dri/i965/brw_vs.c
 index d875703..214730d 100644
 --- a/src/mesa/drivers/dri/i965/brw_vs.c
 +++ b/src/mesa/drivers/dri/i965/brw_vs.c
 @@ -314,6 +314,8 @@ do_vs_prog(struct brw_context *brw,
   program, program_size,
   c.prog_data, sizeof(c.prog_data),
   brw-vs.prog_offset, brw-vs.prog_data);
 +   brw-vue_map_geom_out = brw-vs.prog_data-vue_map;
 +   brw-state.dirty.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
 ralloc_free(mem_ctx);

I think the one below in upload_vs_prog should be sufficient, since it
always happens immediately after this.

  
 return true;
 @@ -488,6 +490,8 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  
assert(success);
 }
 +   brw-vue_map_geom_out = brw-vs.prog_data-vue_map;
 +   brw-state.dirty.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
  }


pgprwDtLNfoPq.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/4] i965: Use brw.vue_map_geom_out instead of VS output VUE map where appropriate.

2013-03-21 Thread Eric Anholt
Paul Berry stereotype...@gmail.com writes:

 This patch modifies post-GS pipeline stages (transform feedback, clip,
 sf, fs) to refer to the VUE map through brw-vue_map_geom_out rather
 than brw-vs.prog_data-vue_map.  This ensures that when geometry
 shader support is added, these pipeline stages will consult the
 geometry shader output VUE map when appropriate, rather than the
 vertex shader output VUE map.
 ---
  src/mesa/drivers/dri/i965/brw_clip.c   |  7 +++
  src/mesa/drivers/dri/i965/brw_sf.c |  7 +++
  src/mesa/drivers/dri/i965/brw_state.h  |  2 +-
  src/mesa/drivers/dri/i965/brw_wm.c |  6 +++---
  src/mesa/drivers/dri/i965/gen6_sf_state.c  | 10 +-
  src/mesa/drivers/dri/i965/gen7_sf_state.c  |  8 
  src/mesa/drivers/dri/i965/gen7_sol_state.c | 14 +++---
  7 files changed, 26 insertions(+), 28 deletions(-)

 diff --git a/src/mesa/drivers/dri/i965/brw_clip.c 
 b/src/mesa/drivers/dri/i965/brw_clip.c
 index e20f7c2..bc0ebb5 100644
 --- a/src/mesa/drivers/dri/i965/brw_clip.c
 +++ b/src/mesa/drivers/dri/i965/brw_clip.c
 @@ -69,7 +69,7 @@ static void compile_clip_prog( struct brw_context *brw,
 c.func.single_program_flow = 1;
  
 c.key = *key;
 -   c.vue_map = brw-vs.prog_data-vue_map;
 +   c.vue_map = *brw-vue_map_geom_out;
  
 /* nr_regs is the number of registers filled by reading data from the VUE.
  * This program accesses the entire VUE, so nr_regs needs to be the size 
 of
 @@ -146,7 +146,7 @@ brw_upload_clip_prog(struct brw_context *brw)
 /* BRW_NEW_REDUCED_PRIMITIVE */
 key.primitive = brw-intel.reduced_primitive;
 /* CACHE_NEW_VS_PROG (also part of VUE map) */
 -   key.attrs = brw-vs.prog_data-vue_map.slots_valid;
 +   key.attrs = brw-vue_map_geom_out-slots_valid;
 /* _NEW_LIGHT */
 key.do_flat_shading = (ctx-Light.ShadeModel == GL_FLAT);
 key.pv_first = (ctx-Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
 @@ -258,8 +258,7 @@ const struct brw_tracked_state brw_clip_prog = {
   _NEW_TRANSFORM |
   _NEW_POLYGON | 
   _NEW_BUFFERS),
 -  .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
 -  .cache = CACHE_NEW_VS_PROG
 +  .brw   = (BRW_NEW_REDUCED_PRIMITIVE | BRW_NEW_VUE_MAP_GEOM_OUT)
 },

Hmm, this is an increase in how much we recalculate SF -- before, we
wouldn't have anything flagged when doing a no-op VS update, but
BRW_NEW_VUE_MAP_GEOM_OUT happens regardless.  Could you add no-op change
detection in the previous commit?


pgpXvqj4Xh2Kv.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/4] i965/fs: Rename vp_outputs_written to input_slots_valid.

2013-03-21 Thread Eric Anholt
Paul Berry stereotype...@gmail.com writes:

 With the introduction of geometry shaders, fragment inputs will no
 longer come exclusively from the vertex shader; sometimes they come
 from the geometry shader.  So the name vp_outputs_written will
 become a misnomer.  This patch renames vp_outputs_written to
 input_slots_valid, to reflect the true meaning of the bitfield from
 the fragment shader's point of view: it indicates which of the
 possible input slots contain valid data that was written by the
 previous shader stage.

Patches 1 and 4 are:

Reviewed-by: Eric Anholt e...@anholt.net


pgpPljOG_ssx4.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Christian König

Am 21.03.2013 15:10, schrieb Alex Deucher:

On Thu, Mar 21, 2013 at 7:38 AM, Christian König
deathsim...@vodafone.de wrote:

From: Christian König christian.koe...@amd.com

Signed-off-by: Christian König christian.koe...@amd.com
---
  src/gallium/drivers/radeonsi/radeonsi_shader.c |   57 +++-
  1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c 
b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 33f79e7..840537a 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -63,6 +63,8 @@ struct si_shader_context
 LLVMValueRef const_md;
 LLVMValueRef const_resource;
 LLVMValueRef *constants;
+   LLVMValueRef *resources;
+   LLVMValueRef *samplers;
  };

  static struct si_shader_context * si_shader_context(
@@ -740,8 +742,6 @@ static void tex_fetch_args(
 const struct tgsi_full_instruction * inst = emit_data-inst;
 unsigned opcode = inst-Instruction.Opcode;
 unsigned target = inst-Texture.Texture;
-   LLVMValueRef ptr;
-   LLVMValueRef offset;
 LLVMValueRef coords[4];
 LLVMValueRef address[16];
 unsigned count = 0;
@@ -859,18 +859,10 @@ static void tex_fetch_args(
 emit_data-args[1] = lp_build_gather_values(gallivm, address, count);

 /* Resource */
-   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_RESOURCE);
-   offset = lp_build_const_int32(bld_base-base.gallivm,
- emit_data-inst-Src[1].Register.Index);
-   emit_data-args[2] = build_indexed_load(si_shader_ctx,
-   ptr, offset);
+   emit_data-args[2] = 
si_shader_ctx-resources[emit_data-inst-Src[1].Register.Index];

 /* Sampler */
-   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, SI_PARAM_SAMPLER);
-   offset = lp_build_const_int32(bld_base-base.gallivm,
- emit_data-inst-Src[1].Register.Index);
-   emit_data-args[3] = build_indexed_load(si_shader_ctx,
-   ptr, offset);
+   emit_data-args[3] = 
si_shader_ctx-samplers[emit_data-inst-Src[1].Register.Index];

 /* Dimensions */
 emit_data-args[4] = lp_build_const_int32(bld_base-base.gallivm, 
target);
@@ -1009,6 +1001,40 @@ static void preload_constants(struct si_shader_context 
*si_shader_ctx)
 }
  }

+static void preload_samplers(struct si_shader_context *si_shader_ctx)
+{
+   struct lp_build_tgsi_context * bld_base = 
si_shader_ctx-radeon_bld.soa.bld_base;
+   struct gallivm_state * gallivm = bld_base-base.gallivm;
+   const struct tgsi_shader_info * info = bld_base-info;
+
+   unsigned i, num_samplers = info-file_max[TGSI_FILE_SAMPLER] + 1;
+
+   LLVMValueRef res_ptr, samp_ptr;
+   LLVMValueRef offset;
+
+   if (num_samplers == 0)
+   return;
+
+   /* Allocate space for the values */
+   si_shader_ctx-resources = CALLOC(num_samplers, sizeof(LLVMValueRef));
+   si_shader_ctx-samplers = CALLOC(num_samplers, sizeof(LLVMValueRef));
+
+   res_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_RESOURCE);
+   samp_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, 
SI_PARAM_SAMPLER);
+
+   /* Load the resources and samplers, we rely on the code sinking to do 
the rest */
+   for (i = 0; i  num_samplers; ++i) {
+
+   /* Resource */
+   offset = lp_build_const_int32(gallivm, i);
+   si_shader_ctx-resources[i] = build_indexed_load(si_shader_ctx, 
res_ptr, offset);
+
+   /* Sampler */
+   offset = lp_build_const_int32(gallivm, i);
+   si_shader_ctx-samplers[i] = build_indexed_load(si_shader_ctx, 
samp_ptr, offset);
+   }
+}
+


Is there ever a case when num_samplers != num_resources?


Currently not, so it would be possible to put it into one structure, but 
I like to keep that separate, just in case we get separate 
sampler/resources.


Christian.



Alex


  int si_pipe_shader_create(
 struct pipe_context *ctx,
 struct si_pipe_shader *shader,
@@ -1058,6 +1084,7 @@ int si_pipe_shader_create(
 create_meta_data(si_shader_ctx);
 create_function(si_shader_ctx);
 preload_constants(si_shader_ctx);
+   preload_samplers(si_shader_ctx);

 shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;

@@ -1070,6 +1097,8 @@ int si_pipe_shader_create(
 if (!lp_build_tgsi_llvm(bld_base, sel-tokens)) {
 fprintf(stderr, Failed to translate shader from TGSI to 
LLVM\n);
 FREE(si_shader_ctx.constants);
+   FREE(si_shader_ctx.resources);
+   FREE(si_shader_ctx.samplers);
 return -EINVAL;
 }

@@ -1102,6 +1131,8 @@ int si_pipe_shader_create(
   

Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Christian König

Am 21.03.2013 15:06, schrieb Michel Dänzer:

When I tried this earlier, something broke. Unfortunately, I can't seem
to remember or dig up if it was a piglit test or e.g. one of the sampler
demos in mesa/demos/src/glsl/. Did you test the latter with this change?

Similar concern for patch 4, the rest of the series looks good to me.


It breaks texrectmany or something like that, and that was actually my 
testcase that it now works properly.


I haven't tested the sampler demos with it, but lightsmark stresses that 
quite a bit and I'm pretty sure that it works now.


Christian.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Michel Dänzer
On Don, 2013-03-21 at 17:12 +0100, Christian König wrote: 
 Am 21.03.2013 15:06, schrieb Michel Dänzer:
  When I tried this earlier, something broke. Unfortunately, I can't seem
  to remember or dig up if it was a piglit test or e.g. one of the sampler
  demos in mesa/demos/src/glsl/. Did you test the latter with this change?
 
  Similar concern for patch 4, the rest of the series looks good to me.
 
 It breaks texrectmany or something like that, and that was actually my 
 testcase that it now works properly.

You mean this change used to break that test but no longer does?


 I haven't tested the sampler demos with it, but lightsmark stresses that 
 quite a bit and I'm pretty sure that it works now.

Please just test them and be 100% sure. :)


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] radeonsi: add preloading for all samplers

2013-03-21 Thread Alex Deucher
On Thu, Mar 21, 2013 at 12:09 PM, Christian König
deathsim...@vodafone.de wrote:
 Am 21.03.2013 15:10, schrieb Alex Deucher:

 On Thu, Mar 21, 2013 at 7:38 AM, Christian König
 deathsim...@vodafone.de wrote:

 From: Christian König christian.koe...@amd.com

 Signed-off-by: Christian König christian.koe...@amd.com
 ---
   src/gallium/drivers/radeonsi/radeonsi_shader.c |   57
 +++-
   1 file changed, 45 insertions(+), 12 deletions(-)

 diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c
 b/src/gallium/drivers/radeonsi/radeonsi_shader.c
 index 33f79e7..840537a 100644
 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
 +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
 @@ -63,6 +63,8 @@ struct si_shader_context
  LLVMValueRef const_md;
  LLVMValueRef const_resource;
  LLVMValueRef *constants;
 +   LLVMValueRef *resources;
 +   LLVMValueRef *samplers;
   };

   static struct si_shader_context * si_shader_context(
 @@ -740,8 +742,6 @@ static void tex_fetch_args(
  const struct tgsi_full_instruction * inst = emit_data-inst;
  unsigned opcode = inst-Instruction.Opcode;
  unsigned target = inst-Texture.Texture;
 -   LLVMValueRef ptr;
 -   LLVMValueRef offset;
  LLVMValueRef coords[4];
  LLVMValueRef address[16];
  unsigned count = 0;
 @@ -859,18 +859,10 @@ static void tex_fetch_args(
  emit_data-args[1] = lp_build_gather_values(gallivm, address,
 count);

  /* Resource */
 -   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn,
 SI_PARAM_RESOURCE);
 -   offset = lp_build_const_int32(bld_base-base.gallivm,
 -
 emit_data-inst-Src[1].Register.Index);
 -   emit_data-args[2] = build_indexed_load(si_shader_ctx,
 -   ptr, offset);
 +   emit_data-args[2] =
 si_shader_ctx-resources[emit_data-inst-Src[1].Register.Index];

  /* Sampler */
 -   ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn,
 SI_PARAM_SAMPLER);
 -   offset = lp_build_const_int32(bld_base-base.gallivm,
 -
 emit_data-inst-Src[1].Register.Index);
 -   emit_data-args[3] = build_indexed_load(si_shader_ctx,
 -   ptr, offset);
 +   emit_data-args[3] =
 si_shader_ctx-samplers[emit_data-inst-Src[1].Register.Index];

  /* Dimensions */
  emit_data-args[4] =
 lp_build_const_int32(bld_base-base.gallivm, target);
 @@ -1009,6 +1001,40 @@ static void preload_constants(struct
 si_shader_context *si_shader_ctx)
  }
   }

 +static void preload_samplers(struct si_shader_context *si_shader_ctx)
 +{
 +   struct lp_build_tgsi_context * bld_base =
 si_shader_ctx-radeon_bld.soa.bld_base;
 +   struct gallivm_state * gallivm = bld_base-base.gallivm;
 +   const struct tgsi_shader_info * info = bld_base-info;
 +
 +   unsigned i, num_samplers = info-file_max[TGSI_FILE_SAMPLER] + 1;
 +
 +   LLVMValueRef res_ptr, samp_ptr;
 +   LLVMValueRef offset;
 +
 +   if (num_samplers == 0)
 +   return;
 +
 +   /* Allocate space for the values */
 +   si_shader_ctx-resources = CALLOC(num_samplers,
 sizeof(LLVMValueRef));
 +   si_shader_ctx-samplers = CALLOC(num_samplers,
 sizeof(LLVMValueRef));
 +
 +   res_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn,
 SI_PARAM_RESOURCE);
 +   samp_ptr = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn,
 SI_PARAM_SAMPLER);
 +
 +   /* Load the resources and samplers, we rely on the code sinking
 to do the rest */
 +   for (i = 0; i  num_samplers; ++i) {
 +
 +   /* Resource */
 +   offset = lp_build_const_int32(gallivm, i);
 +   si_shader_ctx-resources[i] =
 build_indexed_load(si_shader_ctx, res_ptr, offset);
 +
 +   /* Sampler */
 +   offset = lp_build_const_int32(gallivm, i);
 +   si_shader_ctx-samplers[i] =
 build_indexed_load(si_shader_ctx, samp_ptr, offset);
 +   }
 +}
 +


 Is there ever a case when num_samplers != num_resources?


 Currently not, so it would be possible to put it into one structure, but I
 like to keep that separate, just in case we get separate sampler/resources.


I was thinking there may be cases with more samplers than resources.

Alex

 Christian.



 Alex

   int si_pipe_shader_create(
  struct pipe_context *ctx,
  struct si_pipe_shader *shader,
 @@ -1058,6 +1084,7 @@ int si_pipe_shader_create(
  create_meta_data(si_shader_ctx);
  create_function(si_shader_ctx);
  preload_constants(si_shader_ctx);
 +   preload_samplers(si_shader_ctx);

  shader-shader.nr_cbufs = rctx-framebuffer.nr_cbufs;

 @@ -1070,6 +1097,8 @@ int si_pipe_shader_create(
  if (!lp_build_tgsi_llvm(bld_base, sel-tokens)) {
  fprintf(stderr, Failed to translate shader from TGSI to
 LLVM\n);
  FREE(si_shader_ctx.constants);
 +   

[Mesa-dev] [PATCH] R600: Use legacy (0 * anything = 0) MUL instructions for pow intrinsics

2013-03-21 Thread Michel Dänzer
From: Michel Dänzer michel.daen...@amd.com

Fixes wrong lighting in some corner cases with r600g and radeonsi, e.g.
manifested by failure of two piglit/glean tests and intermittent black
patches in many apps.

Tested on SI and RS880.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=62012 [radeonsi]
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58150 [r600g]

NOTE: This is a candidate for the Mesa stable branch.

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/R600ISelLowering.cpp | 11 ---
 lib/Target/R600/R600ISelLowering.h   |  1 -
 lib/Target/R600/R600Instructions.td  |  5 -
 lib/Target/R600/SIInstructions.td|  3 +--
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/R600ISelLowering.cpp 
b/lib/Target/R600/R600ISelLowering.cpp
index a73691d..7bdba83 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -58,7 +58,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) :
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
-  setOperationAction(ISD::FPOW, MVT::f32, Custom);
 
   setOperationAction(ISD::ROTL, MVT::i32, Custom);
 
@@ -316,7 +315,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
-  case ISD::FPOW: return LowerFPOW(Op, DAG);
   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_VOID: {
 SDValue Chain = Op.getOperand(0);
@@ -918,15 +916,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, 
SelectionDAG DAG) const
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
-SDValue R600TargetLowering::LowerFPOW(SDValue Op,
-SelectionDAG DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
-  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), 
LogBase);
-  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
-}
-
 /// XXX Only kernel functions are supported, so we can assume for now that
 /// every function is a kernel function, but in the future we should use
 /// separate calling conventions for kernel and non-kernel functions.
diff --git a/lib/Target/R600/R600ISelLowering.h 
b/lib/Target/R600/R600ISelLowering.h
index 5cb4b91..2c09acb 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -59,7 +59,6 @@ private:
   SDValue LowerSELECT(SDValue Op, SelectionDAG DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG DAG) const;
-  SDValue LowerFPOW(SDValue Op, SelectionDAG DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG DAG) const;
   SDValue LowerFrameIndex(SDValue Op, SelectionDAG DAG) const;
 
diff --git a/lib/Target/R600/R600Instructions.td 
b/lib/Target/R600/R600Instructions.td
index 8c50d54..a6daadf 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1141,6 +1141,7 @@ let Predicates = [isR600] in {
   def RECIP_UINT_r600 : RECIP_UINT_Common 0x78;
 
   defm DIV_r600 : DIV_CommonRECIP_IEEE_r600;
+  def : POW_Common LOG_IEEE_r600, EXP_IEEE_r600, MUL, R600_Reg32;
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_CommonMUL_LIT_r600, LOG_CLAMPED_r600, 
EXP_IEEE_r600;
 
   def : Pat(fsqrt R600_Reg32:$src),
@@ -1212,6 +1213,7 @@ def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common0x89;
 def SIN_eg : SIN_Common0x8D;
 def COS_eg : COS_Common0x8E;
 
+def : POW_Common LOG_IEEE_eg, EXP_IEEE_eg, MUL, R600_Reg32;
 def : SIN_PAT SIN_eg;
 def : COS_PAT COS_eg;
 def : Pat(fsqrt R600_Reg32:$src),
@@ -1540,13 +1542,14 @@ def MULLO_UINT_cm : MULLO_UINT_Common0x91;
 def MULHI_UINT_cm : MULHI_UINT_Common0x92;
 def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common0x87;
 def EXP_IEEE_cm : EXP_IEEE_Common0x81;
-def LOG_IEEE_ : LOG_IEEE_Common0x83;
+def LOG_IEEE_cm : LOG_IEEE_Common0x83;
 def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common0x84;
 def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common0x89;
 def SIN_cm : SIN_Common0x8D;
 def COS_cm : COS_Common0x8E;
 } // End isVector = 1
 
+def : POW_Common LOG_IEEE_cm, EXP_IEEE_cm, MUL, R600_Reg32;
 def : SIN_PAT SIN_cm;
 def : COS_PAT COS_cm;
 
diff --git a/lib/Target/R600/SIInstructions.td 
b/lib/Target/R600/SIInstructions.td
index 05b04a9..dc75f01 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1442,8 +1442,7 @@ def : Pat 
 /** == **/
 
 /* llvm.AMDGPU.pow */
-/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? 
*/
-def : POW_Common V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32;
+def : POW_Common V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32, VReg_32;
 
 

Re: [Mesa-dev] [PATCH] R600: Use legacy (0 * anything = 0) MUL instructions for pow intrinsics

2013-03-21 Thread Christian König

Am 21.03.2013 17:32, schrieb Michel Dänzer:

From: Michel Dänzer michel.daen...@amd.com

Fixes wrong lighting in some corner cases with r600g and radeonsi, e.g.
manifested by failure of two piglit/glean tests and intermittent black
patches in many apps.

Tested on SI and RS880.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=62012 [radeonsi]
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58150 [r600g]

NOTE: This is a candidate for the Mesa stable branch.

Signed-off-by: Michel Dänzer michel.daen...@amd.com


Reviewed-by: Christian König christian.koe...@amd.com


---
  lib/Target/R600/R600ISelLowering.cpp | 11 ---
  lib/Target/R600/R600ISelLowering.h   |  1 -
  lib/Target/R600/R600Instructions.td  |  5 -
  lib/Target/R600/SIInstructions.td|  3 +--
  4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/R600ISelLowering.cpp 
b/lib/Target/R600/R600ISelLowering.cpp
index a73691d..7bdba83 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -58,7 +58,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
-  setOperationAction(ISD::FPOW, MVT::f32, Custom);
  
setOperationAction(ISD::ROTL, MVT::i32, Custom);
  
@@ -316,7 +315,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const

case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
-  case ISD::FPOW: return LowerFPOW(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_VOID: {
  SDValue Chain = Op.getOperand(0);
@@ -918,15 +916,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG 
DAG) const
return DAG.getMergeValues(Ops, 2, DL);
  }
  
-SDValue R600TargetLowering::LowerFPOW(SDValue Op,

-SelectionDAG DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
-  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), 
LogBase);
-  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
-}
-
  /// XXX Only kernel functions are supported, so we can assume for now that
  /// every function is a kernel function, but in the future we should use
  /// separate calling conventions for kernel and non-kernel functions.
diff --git a/lib/Target/R600/R600ISelLowering.h 
b/lib/Target/R600/R600ISelLowering.h
index 5cb4b91..2c09acb 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -59,7 +59,6 @@ private:
SDValue LowerSELECT(SDValue Op, SelectionDAG DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG DAG) const;
-  SDValue LowerFPOW(SDValue Op, SelectionDAG DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG DAG) const;
  
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td

index 8c50d54..a6daadf 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1141,6 +1141,7 @@ let Predicates = [isR600] in {
def RECIP_UINT_r600 : RECIP_UINT_Common 0x78;
  
defm DIV_r600 : DIV_CommonRECIP_IEEE_r600;

+  def : POW_Common LOG_IEEE_r600, EXP_IEEE_r600, MUL, R600_Reg32;
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_CommonMUL_LIT_r600, LOG_CLAMPED_r600, 
EXP_IEEE_r600;
  
def : Pat(fsqrt R600_Reg32:$src),

@@ -1212,6 +1213,7 @@ def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common0x89;
  def SIN_eg : SIN_Common0x8D;
  def COS_eg : COS_Common0x8E;
  
+def : POW_Common LOG_IEEE_eg, EXP_IEEE_eg, MUL, R600_Reg32;

  def : SIN_PAT SIN_eg;
  def : COS_PAT COS_eg;
  def : Pat(fsqrt R600_Reg32:$src),
@@ -1540,13 +1542,14 @@ def MULLO_UINT_cm : MULLO_UINT_Common0x91;
  def MULHI_UINT_cm : MULHI_UINT_Common0x92;
  def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common0x87;
  def EXP_IEEE_cm : EXP_IEEE_Common0x81;
-def LOG_IEEE_ : LOG_IEEE_Common0x83;
+def LOG_IEEE_cm : LOG_IEEE_Common0x83;
  def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common0x84;
  def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common0x89;
  def SIN_cm : SIN_Common0x8D;
  def COS_cm : COS_Common0x8E;
  } // End isVector = 1
  
+def : POW_Common LOG_IEEE_cm, EXP_IEEE_cm, MUL, R600_Reg32;

  def : SIN_PAT SIN_cm;
  def : COS_PAT COS_cm;
  
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td

index 05b04a9..dc75f01 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1442,8 +1442,7 @@ def : Pat 
  /** == **/
  
  /* llvm.AMDGPU.pow */

-/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this 

[Mesa-dev] [PATCH] r600g: Honour legacy debugging environment variables

2013-03-21 Thread Michel Dänzer
From: Michel Dänzer michel.daen...@amd.com

This helps minimize confusion / effort when moving between branches or
helping others.

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 src/gallium/drivers/r600/r600_pipe.c |   10 ++
 1 file changed, 10 insertions(+)

diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index 9ed8814..7a84f3d 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -1088,6 +1088,16 @@ struct pipe_screen *r600_screen_create(struct 
radeon_winsys *ws)
ws-query_info(ws, rscreen-info);
 
rscreen-debug_flags = debug_get_flags_option(R600_DEBUG, 
debug_options, 0);
+   if (debug_get_bool_option(R600_DEBUG_COMPUTE, FALSE))
+   rscreen-debug_flags |= DBG_COMPUTE;
+   if (debug_get_bool_option(R600_DUMP_SHADERS, FALSE))
+   rscreen-debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | 
DBG_CS;
+   if (!debug_get_bool_option(R600_HYPERZ, TRUE))
+   rscreen-debug_flags |= DBG_NO_HYPERZ;
+   if (!debug_get_bool_option(R600_LLVM, TRUE))
+   rscreen-debug_flags |= DBG_NO_LLVM;
+   if (debug_get_bool_option(R600_PRINT_TEXDEPTH, FALSE))
+   rscreen-debug_flags |= DBG_TEX_DEPTH;
rscreen-family = rscreen-info.family;
rscreen-chip_class = rscreen-info.chip_class;
 
-- 
1.7.10.4


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] r600g: Honour legacy debugging environment variables

2013-03-21 Thread Alex Deucher
On Thu, Mar 21, 2013 at 12:59 PM, Michel Dänzer mic...@daenzer.net wrote:
 From: Michel Dänzer michel.daen...@amd.com

 This helps minimize confusion / effort when moving between branches or
 helping others.

Reviewed-by: Alex Deucher alexander.deuc...@amd.com


 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
  src/gallium/drivers/r600/r600_pipe.c |   10 ++
  1 file changed, 10 insertions(+)

 diff --git a/src/gallium/drivers/r600/r600_pipe.c 
 b/src/gallium/drivers/r600/r600_pipe.c
 index 9ed8814..7a84f3d 100644
 --- a/src/gallium/drivers/r600/r600_pipe.c
 +++ b/src/gallium/drivers/r600/r600_pipe.c
 @@ -1088,6 +1088,16 @@ struct pipe_screen *r600_screen_create(struct 
 radeon_winsys *ws)
 ws-query_info(ws, rscreen-info);

 rscreen-debug_flags = debug_get_flags_option(R600_DEBUG, 
 debug_options, 0);
 +   if (debug_get_bool_option(R600_DEBUG_COMPUTE, FALSE))
 +   rscreen-debug_flags |= DBG_COMPUTE;
 +   if (debug_get_bool_option(R600_DUMP_SHADERS, FALSE))
 +   rscreen-debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | 
 DBG_CS;
 +   if (!debug_get_bool_option(R600_HYPERZ, TRUE))
 +   rscreen-debug_flags |= DBG_NO_HYPERZ;
 +   if (!debug_get_bool_option(R600_LLVM, TRUE))
 +   rscreen-debug_flags |= DBG_NO_LLVM;
 +   if (debug_get_bool_option(R600_PRINT_TEXDEPTH, FALSE))
 +   rscreen-debug_flags |= DBG_TEX_DEPTH;
 rscreen-family = rscreen-info.family;
 rscreen-chip_class = rscreen-info.chip_class;

 --
 1.7.10.4


 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: Delete VERT_ATTRIB_GENERIC_NV and VERT_BIT_GENERIC_NV macros.

2013-03-21 Thread Kenneth Graunke
These haven't been used since we deleted NV_vertex_program support.

Signed-off-by: Kenneth Graunke kenn...@whitecape.org
---
 src/mesa/main/mtypes.h | 10 --
 1 file changed, 10 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 7900897..1a35e63 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -146,9 +146,6 @@ typedef enum
  * VERT_ATTRIB_TEX
  *   include the classic texture coordinate attributes.
  *   Is a subset of VERT_ATTRIB_FF.
- * VERT_ATTRIB_GENERIC_NV
- *   include the NV shader attributes.
- *   Is a subset of VERT_ATTRIB_FF.
  * VERT_ATTRIB_GENERIC
  *   include the OpenGL 2.0+ GLSL generic shader attributes.
  *   These alias the generic GL_ARB_vertex_shader attributes.
@@ -159,9 +156,6 @@ typedef enum
 #define VERT_ATTRIB_TEX(i)  (VERT_ATTRIB_TEX0 + (i))
 #define VERT_ATTRIB_TEX_MAX MAX_TEXTURE_COORD_UNITS
 
-#define VERT_ATTRIB_GENERIC_NV(i)   (VERT_ATTRIB_POS + (i))
-#define VERT_ATTRIB_GENERIC_NV_MAX  MAX_VERTEX_GENERIC_ATTRIBS
-
 #define VERT_ATTRIB_GENERIC(i)  (VERT_ATTRIB_GENERIC0 + (i))
 #define VERT_ATTRIB_GENERIC_MAX MAX_VERTEX_GENERIC_ATTRIBS
 
@@ -198,10 +192,6 @@ typedef enum
 #define VERT_BIT_TEX_ALL \
BITFIELD64_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
 
-#define VERT_BIT_GENERIC_NV(i)   VERT_BIT(VERT_ATTRIB_GENERIC_NV(i))
-#define VERT_BIT_GENERIC_NV_ALL  \
-   BITFIELD64_RANGE(VERT_ATTRIB_GENERIC_NV(0), VERT_ATTRIB_GENERIC_NV_MAX)
-
 #define VERT_BIT_GENERIC(i)  VERT_BIT(VERT_ATTRIB_GENERIC(i))
 #define VERT_BIT_GENERIC_ALL \
BITFIELD64_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
-- 
1.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: Rename gl_context::swtnl_im to vbo_context.

2013-03-21 Thread Kenneth Graunke
The main GL context's swtnl_im field is the VBO module's vbo_context
structure.  Having "swtnl" in the name is confusing since some drivers
use hardware transform and lighting, but still rely on the VBO module
for drawing.

Signed-off-by: Kenneth Graunke kenn...@whitecape.org
---
 src/mesa/main/mtypes.h | 2 +-
 src/mesa/vbo/vbo_context.c | 4 ++--
 src/mesa/vbo/vbo_context.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 8c38aa7..7900897 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3613,7 +3613,7 @@ struct gl_context
void *swrast_context;
void *swsetup_context;
void *swtnl_context;
-   void *swtnl_im;
+   void *vbo_context;
struct st_context *st;
void *aelt_context;
/*@}*/
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 7eda31e..a9e4a1e 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -152,7 +152,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
 {
struct vbo_context *vbo = CALLOC_STRUCT(vbo_context);
 
-   ctx-swtnl_im = (void *)vbo;
+   ctx-vbo_context = (void *) vbo;
 
/* Initialize the arrayelt helper
 */
@@ -224,7 +224,7 @@ void _vbo_DestroyContext( struct gl_context *ctx )
   if (ctx-API == API_OPENGL_COMPAT)
  vbo_save_destroy(ctx);
   free(vbo);
-  ctx-swtnl_im = NULL;
+  ctx-vbo_context = NULL;
}
 }
 
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 1ff6ec0..a8968b2 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -91,7 +91,7 @@ struct vbo_context {
 
 static inline struct vbo_context *vbo_context(struct gl_context *ctx) 
 {
-   return (struct vbo_context *)(ctx-swtnl_im);
+   return (struct vbo_context *) ctx-vbo_context;
 }
 
 
-- 
1.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa: Rename gl_context::swtnl_im to vbo_context.

2013-03-21 Thread Brian Paul

On 03/21/2013 12:19 PM, Kenneth Graunke wrote:

The main GL context's swtnl_im field is the VBO module's vbo_context
structure.  Having "swtnl" in the name is confusing since some drivers
use hardware transform and lighting, but still rely on the VBO module
for drawing.

Signed-off-by: Kenneth Graunkekenn...@whitecape.org
---
  src/mesa/main/mtypes.h | 2 +-
  src/mesa/vbo/vbo_context.c | 4 ++--
  src/mesa/vbo/vbo_context.h | 2 +-
  3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 8c38aa7..7900897 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3613,7 +3613,7 @@ struct gl_context
 void *swrast_context;
 void *swsetup_context;
 void *swtnl_context;
-   void *swtnl_im;
+   void *vbo_context;
 struct st_context *st;
 void *aelt_context;
 /*@}*/
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 7eda31e..a9e4a1e 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -152,7 +152,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
  {
 struct vbo_context *vbo = CALLOC_STRUCT(vbo_context);

-   ctx-swtnl_im = (void *)vbo;
+   ctx-vbo_context = (void *) vbo;

 /* Initialize the arrayelt helper
  */
@@ -224,7 +224,7 @@ void _vbo_DestroyContext( struct gl_context *ctx )
if (ctx-API == API_OPENGL_COMPAT)
   vbo_save_destroy(ctx);
free(vbo);
-  ctx-swtnl_im = NULL;
+  ctx-vbo_context = NULL;
 }
  }

diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 1ff6ec0..a8968b2 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -91,7 +91,7 @@ struct vbo_context {

  static inline struct vbo_context *vbo_context(struct gl_context *ctx)
  {
-   return (struct vbo_context *)(ctx-swtnl_im);
+   return (struct vbo_context *) ctx-vbo_context;
  }




Reviewed-by: Brian Paul bri...@vmware.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa: Delete VERT_ATTRIB_GENERIC_NV and VERT_BIT_GENERIC_NV macros.

2013-03-21 Thread Brian Paul

On 03/21/2013 12:10 PM, Kenneth Graunke wrote:

These haven't been used since we deleted NV_vertex_program support.

Signed-off-by: Kenneth Graunkekenn...@whitecape.org
---
  src/mesa/main/mtypes.h | 10 --
  1 file changed, 10 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 7900897..1a35e63 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -146,9 +146,6 @@ typedef enum
   * VERT_ATTRIB_TEX
   *   include the classic texture coordinate attributes.
   *   Is a subset of VERT_ATTRIB_FF.
- * VERT_ATTRIB_GENERIC_NV
- *   include the NV shader attributes.
- *   Is a subset of VERT_ATTRIB_FF.
   * VERT_ATTRIB_GENERIC
   *   include the OpenGL 2.0+ GLSL generic shader attributes.
   *   These alias the generic GL_ARB_vertex_shader attributes.
@@ -159,9 +156,6 @@ typedef enum
  #define VERT_ATTRIB_TEX(i)  (VERT_ATTRIB_TEX0 + (i))
  #define VERT_ATTRIB_TEX_MAX MAX_TEXTURE_COORD_UNITS

-#define VERT_ATTRIB_GENERIC_NV(i)   (VERT_ATTRIB_POS + (i))
-#define VERT_ATTRIB_GENERIC_NV_MAX  MAX_VERTEX_GENERIC_ATTRIBS
-
  #define VERT_ATTRIB_GENERIC(i)  (VERT_ATTRIB_GENERIC0 + (i))
  #define VERT_ATTRIB_GENERIC_MAX MAX_VERTEX_GENERIC_ATTRIBS

@@ -198,10 +192,6 @@ typedef enum
  #define VERT_BIT_TEX_ALL \
 BITFIELD64_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)

-#define VERT_BIT_GENERIC_NV(i)   VERT_BIT(VERT_ATTRIB_GENERIC_NV(i))
-#define VERT_BIT_GENERIC_NV_ALL  \
-   BITFIELD64_RANGE(VERT_ATTRIB_GENERIC_NV(0), VERT_ATTRIB_GENERIC_NV_MAX)
-
  #define VERT_BIT_GENERIC(i)  VERT_BIT(VERT_ATTRIB_GENERIC(i))
  #define VERT_BIT_GENERIC_ALL \
 BITFIELD64_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)


Reviewed-by: Brian Paul bri...@vmware.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] tgsi: fix regression introduced in 16caeff

2013-03-21 Thread Rob Clark
The recently added tgsi_declaration::Array field was not being
initialized to zero, resulting in nonsense shaders like:

   FRAG
   PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
   DCL IN[0], ARRAY(48), GENERIC[0], CONSTANT
- DCL NULL[1..0].
 0: MOV OUT[0], IN[0]
 1: END

Signed-off-by: Rob Clark robdcl...@gmail.com
---
 src/gallium/auxiliary/tgsi/tgsi_build.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c 
b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 435d94a..2e49671 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -109,6 +109,7 @@ tgsi_default_declaration( void )
declaration.Semantic = 0;
declaration.Invariant = 0;
declaration.Local = 0;
+   declaration.Array = 0;
declaration.Padding = 0;
 
return declaration;
-- 
1.8.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] tgsi: fix regression introduced in 16caeff

2013-03-21 Thread Brian Paul

I already fixed this in commit 460ae8a11a33340a979c7e0721f76e63a990

-Brian

On 03/21/2013 12:39 PM, Rob Clark wrote:

The recently added tgsi_declaration::Array field was not being
initialized to zero, resulting in nonsense shaders like:

FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], ARRAY(48), GENERIC[0], CONSTANT
-  DCL NULL[1..0].
  0: MOV OUT[0], IN[0]
  1: END

Signed-off-by: Rob Clarkrobdcl...@gmail.com
---
  src/gallium/auxiliary/tgsi/tgsi_build.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c 
b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 435d94a..2e49671 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -109,6 +109,7 @@ tgsi_default_declaration( void )
 declaration.Semantic = 0;
 declaration.Invariant = 0;
 declaration.Local = 0;
+   declaration.Array = 0;
 declaration.Padding = 0;

 return declaration;


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] tgsi: fix regression introduced in 16caeff

2013-03-21 Thread Rob Clark
ahh, so you did.. looks like I had rebased just before your patch

BR,
-R

On Thu, Mar 21, 2013 at 2:47 PM, Brian Paul bri...@vmware.com wrote:
 I already fixed this in commit 460ae8a11a33340a979c7e0721f76e63a990

 -Brian


 On 03/21/2013 12:39 PM, Rob Clark wrote:

 The recently added tgsi_declaration::Array field was not being
 initialized to zero, resulting in nonsense shaders like:

 FRAG
 PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
 DCL IN[0], ARRAY(48), GENERIC[0], CONSTANT
 -  DCL NULL[1..0].
   0: MOV OUT[0], IN[0]
   1: END

 Signed-off-by: Rob Clarkrobdcl...@gmail.com
 ---
   src/gallium/auxiliary/tgsi/tgsi_build.c | 1 +
   1 file changed, 1 insertion(+)

 diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c
 b/src/gallium/auxiliary/tgsi/tgsi_build.c
 index 435d94a..2e49671 100644
 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c
 +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
 @@ -109,6 +109,7 @@ tgsi_default_declaration( void )
  declaration.Semantic = 0;
  declaration.Invariant = 0;
  declaration.Local = 0;
 +   declaration.Array = 0;
  declaration.Padding = 0;

  return declaration;


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa: Delete VERT_ATTRIB_GENERIC_NV and VERT_BIT_GENERIC_NV macros.

2013-03-21 Thread Eric Anholt
Kenneth Graunke kenn...@whitecape.org writes:

 These haven't been used since we deleted NV_vertex_program support.

 Signed-off-by: Kenneth Graunke kenn...@whitecape.org

Reviewed-by: Eric Anholt e...@anholt.net


pgpvMo2WYmc2C.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa: Rename gl_context::swtnl_im to vbo_context.

2013-03-21 Thread Eric Anholt
Kenneth Graunke kenn...@whitecape.org writes:

 The main GL context's swtnl_im field is the VBO module's vbo_context
 structure.  Using "swtnl" in the name is confusing since
 some drivers use hardware transform and lighting, but still rely on the
 VBO module for drawing.

 Signed-off-by: Kenneth Graunke kenn...@whitecape.org
 ---
  src/mesa/main/mtypes.h | 2 +-
  src/mesa/vbo/vbo_context.c | 4 ++--
  src/mesa/vbo/vbo_context.h | 2 +-
  3 files changed, 4 insertions(+), 4 deletions(-)

 diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
 index 8c38aa7..7900897 100644
 --- a/src/mesa/main/mtypes.h
 +++ b/src/mesa/main/mtypes.h
 @@ -3613,7 +3613,7 @@ struct gl_context
 void *swrast_context;
 void *swsetup_context;
 void *swtnl_context;
 -   void *swtnl_im;
 +   void *vbo_context;
 struct st_context *st;
 void *aelt_context;
 /*@}*/

Could we forward declare struct vbo_context and use the actual type
here?  That would clarify things even further, and avoid gratuitous
casting.


pgpRpGeLsI2mN.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] r600g: Honour legacy debugging environment variables

2013-03-21 Thread Marek Olšák
Reviewed-by: Marek Olšák mar...@gmail.com

Marek

On Thu, Mar 21, 2013 at 5:59 PM, Michel Dänzer mic...@daenzer.net wrote:
 From: Michel Dänzer michel.daen...@amd.com

 This helps minimize confusion / effort when moving between branches or
 helping others.

 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
  src/gallium/drivers/r600/r600_pipe.c |   10 ++
  1 file changed, 10 insertions(+)

 diff --git a/src/gallium/drivers/r600/r600_pipe.c 
 b/src/gallium/drivers/r600/r600_pipe.c
 index 9ed8814..7a84f3d 100644
 --- a/src/gallium/drivers/r600/r600_pipe.c
 +++ b/src/gallium/drivers/r600/r600_pipe.c
 @@ -1088,6 +1088,16 @@ struct pipe_screen *r600_screen_create(struct 
 radeon_winsys *ws)
 ws-query_info(ws, rscreen-info);

 rscreen-debug_flags = debug_get_flags_option(R600_DEBUG, 
 debug_options, 0);
 +   if (debug_get_bool_option(R600_DEBUG_COMPUTE, FALSE))
 +   rscreen-debug_flags |= DBG_COMPUTE;
 +   if (debug_get_bool_option(R600_DUMP_SHADERS, FALSE))
 +   rscreen-debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | 
 DBG_CS;
 +   if (!debug_get_bool_option(R600_HYPERZ, TRUE))
 +   rscreen-debug_flags |= DBG_NO_HYPERZ;
 +   if (!debug_get_bool_option(R600_LLVM, TRUE))
 +   rscreen-debug_flags |= DBG_NO_LLVM;
 +   if (debug_get_bool_option(R600_PRINT_TEXDEPTH, FALSE))
 +   rscreen-debug_flags |= DBG_TEX_DEPTH;
 rscreen-family = rscreen-info.family;
 rscreen-chip_class = rscreen-info.chip_class;

 --
 1.7.10.4


 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] cso: add constant buffer save/restore feature for postprocessing

2013-03-21 Thread Marek Olšák
Postprocessing is an internal meta op and should restore the states
it changes.
---
 src/gallium/auxiliary/cso_cache/cso_context.c |   59 +
 src/gallium/auxiliary/cso_cache/cso_context.h |   13 ++
 src/gallium/auxiliary/postprocess/pp_mlaa.c   |4 +-
 src/gallium/auxiliary/postprocess/pp_run.c|4 ++
 src/mesa/state_tracker/st_atom_constbuf.c |8 ++--
 5 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c 
b/src/gallium/auxiliary/cso_cache/cso_context.c
index 3f6fd8c..e46f2ab 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -90,6 +90,9 @@ struct cso_context {
struct pipe_vertex_buffer aux_vertex_buffer_saved;
unsigned aux_vertex_buffer_index;
 
+   struct pipe_constant_buffer aux_constbuf_current[PIPE_SHADER_TYPES];
+   struct pipe_constant_buffer aux_constbuf_saved[PIPE_SHADER_TYPES];
+
unsigned nr_so_targets;
struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];
 
@@ -329,6 +332,11 @@ void cso_release_all( struct cso_context *ctx )
pipe_resource_reference(ctx-aux_vertex_buffer_current.buffer, NULL);
pipe_resource_reference(ctx-aux_vertex_buffer_saved.buffer, NULL);
 
+   for (i = 0; i  PIPE_SHADER_TYPES; i++) {
+  pipe_resource_reference(ctx-aux_constbuf_current[i].buffer, NULL);
+  pipe_resource_reference(ctx-aux_constbuf_saved[i].buffer, NULL);
+   }
+
for (i = 0; i  PIPE_MAX_SO_BUFFERS; i++) {
   pipe_so_target_reference(ctx-so_targets[i], NULL);
   pipe_so_target_reference(ctx-so_targets_saved[i], NULL);
@@ -1318,6 +1326,57 @@ cso_restore_stream_outputs(struct cso_context *ctx)
ctx-nr_so_targets_saved = 0;
 }
 
+/* constant buffers */
+
+void
+cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
+unsigned index, struct pipe_constant_buffer *cb)
+{
+   struct pipe_context *pipe = cso-pipe;
+
+   pipe-set_constant_buffer(pipe, shader_stage, index, cb);
+
+   if (index == 0) {
+  util_copy_constant_buffer(cso-aux_constbuf_current[shader_stage], cb);
+   }
+}
+
+void
+cso_set_constant_buffer_resource(struct cso_context *cso,
+ unsigned shader_stage,
+ unsigned index,
+ struct pipe_resource *buffer)
+{
+   if (buffer) {
+  struct pipe_constant_buffer cb;
+  cb.buffer = buffer;
+  cb.buffer_offset = 0;
+  cb.buffer_size = buffer-width0;
+  cb.user_buffer = NULL;
+  cso_set_constant_buffer(cso, shader_stage, index, cb);
+   } else {
+  cso_set_constant_buffer(cso, shader_stage, index, NULL);
+   }
+}
+
+void
+cso_save_constant_buffer_slot0(struct cso_context *cso,
+  unsigned shader_stage)
+{
+   util_copy_constant_buffer(cso-aux_constbuf_saved[shader_stage],
+ cso-aux_constbuf_current[shader_stage]);
+}
+
+void
+cso_restore_constant_buffer_slot0(struct cso_context *cso,
+ unsigned shader_stage)
+{
+   cso_set_constant_buffer(cso, shader_stage, 0,
+   cso-aux_constbuf_saved[shader_stage]);
+   pipe_resource_reference(cso-aux_constbuf_saved[shader_stage].buffer,
+   NULL);
+}
+
 /* drawing */
 
 void
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h 
b/src/gallium/auxiliary/cso_cache/cso_context.h
index e8f5a9f..20ab4ef 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -203,6 +203,19 @@ void
 cso_restore_sampler_views(struct cso_context *cso, unsigned shader_stage);
 
 
+/* constant buffers */
+
+void cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
+ unsigned index, struct pipe_constant_buffer *cb);
+void cso_set_constant_buffer_resource(struct cso_context *cso,
+  unsigned shader_stage,
+  unsigned index,
+  struct pipe_resource *buffer);
+void cso_save_constant_buffer_slot0(struct cso_context *cso,
+unsigned shader_stage);
+void cso_restore_constant_buffer_slot0(struct cso_context *cso,
+   unsigned shader_stage);
+
 
 /* drawing */
 
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.c 
b/src/gallium/auxiliary/postprocess/pp_mlaa.c
index 297f3e4..2ec328c 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -99,8 +99,8 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct 
pipe_resource *in,
   dimensions[1] = p-framebuffer.height;
}
 
-   pipe_set_constant_buffer(p-pipe, PIPE_SHADER_VERTEX, 0, constbuf);
-   pipe_set_constant_buffer(p-pipe, PIPE_SHADER_FRAGMENT, 0, constbuf);
+   

[Mesa-dev] [PATCH 2/4] r300g: fix crash while binding a NULL constant buffer

2013-03-21 Thread Marek Olšák
---
 src/gallium/drivers/r300/r300_state.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r300/r300_state.c 
b/src/gallium/drivers/r300/r300_state.c
index ad93510..2de0fd6 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -2056,7 +2056,7 @@ static void r300_set_constant_buffer(struct pipe_context 
*pipe,
 struct r300_constant_buffer *cbuf;
 uint32_t *mapped;
 
-if (!cb)
+if (!cb || (!cb-buffer  !cb-user_buffer))
 return;
 
 switch (shader) {
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] r600g: fix crash while binding a NULL constant buffer

2013-03-21 Thread Marek Olšák
---
 src/gallium/drivers/r600/r600_state_common.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index b0e66ac..34c70ed 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -926,7 +926,7 @@ static void r600_set_constant_buffer(struct pipe_context 
*ctx, uint shader, uint
/* Note that the state tracker can unbind constant buffers by
 * passing NULL here.
 */
-   if (unlikely(!input)) {
+   if (unlikely(!input || (!input-buffer  !input-user_buffer))) {
state-enabled_mask = ~(1  index);
state-dirty_mask = ~(1  index);
pipe_resource_reference(state-cb[index].buffer, NULL);
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] radeonsi: fix crash while binding a NULL constant buffer

2013-03-21 Thread Marek Olšák
---
 src/gallium/drivers/radeonsi/si_state.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index a395ec4..fee1b7f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2491,7 +2491,7 @@ static void si_set_constant_buffer(struct pipe_context 
*ctx, uint shader, uint i
/* Note that the state tracker can unbind constant buffers by
 * passing NULL here.
 */
-   if (cb == NULL)
+   if (cb == NULL || (!cb-buffer  !cb-user_buffer))
return;
 
pm4 = CALLOC_STRUCT(si_pm4_state);
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 62612] New: dispatch_sanity test lumps GL4.0+ functions into 4.3

2013-03-21 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=62612

  Priority: medium
Bug ID: 62612
  Assignee: mesa-dev@lists.freedesktop.org
   Summary: dispatch_sanity test lumps GL4.0+ functions into 4.3
  Severity: normal
Classification: Unclassified
OS: All
  Reporter: matts...@gmail.com
  Hardware: Other
Status: NEW
   Version: unspecified
 Component: Other
   Product: Mesa

I noticed:

   { glDrawTransformFeedback, 43, -1 },
   { glDrawTransformFeedbackStream, 43, -1 },

which are from ARB_transform_feedback2 and ARB_transform_feedback3 respectively,
are in GL 4.0, but the test says they must exist by 4.3.

I started to fix this, but then noticed that we don't have lists for 4.0, 4.1,
or 4.2, so the problem is probably much larger than just these two extensions.

Maybe we should strip out all of the lines that say "Add to xml" and fix up the
others.

I also notice

   /* GL_ARB_internalformat_query */
   { glGetInternalformativ, 30, -1 },

but ARB_internalformat_query is part of 4.2 and not a required part of 3.0. I
think there's some general confusion about what the version field is supposed
to mean.

IIRC, this is why Ian suggested having separate structs for each version, like
how GL ES 2 and 3 are handled.

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 62612] dispatch_sanity test lumps GL4.0+ functions into 4.3

2013-03-21 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=62612

Matt Turner matts...@gmail.com changed:

   What|Removed |Added

 CC||i...@freedesktop.org,
   ||jljus...@gmail.com

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/4] i965: Create a pointer in brw_context to the geometry output VUE map.

2013-03-21 Thread Paul Berry
On 21 March 2013 08:56, Eric Anholt e...@anholt.net wrote:

 Paul Berry stereotype...@gmail.com writes:

  Currently, the GPU pipeline has one active VUE map in effect at any
  given time--the one representing the layout of vertex data coming from
  the vertex shader.  However, when geometry shaders are added, they
  will have their own independent VUE map.  Later pipeline stages (clip,
  sf, fs) will need to consult the geometry shader VUE map if a geometry
  shader is in use, and the vertex shader VUE map otherwise.
 
  This patch adds a new field to brw_context, vue_map_geom_out, which
  points to whichever VUE map should be used by later pipeline stages.
  It also adds a new state flag, BRW_NEW_VUE_MAP_GEOM_OUT, which is
  signalled whenever this pointer changes.
 
  Since we don't support geometry shaders yet, vue_map_geom_out is
  currently set only by the brw_vs_prog state atom.
  ---

  diff --git a/src/mesa/drivers/dri/i965/brw_vs.c
 b/src/mesa/drivers/dri/i965/brw_vs.c
  index d875703..214730d 100644
  --- a/src/mesa/drivers/dri/i965/brw_vs.c
  +++ b/src/mesa/drivers/dri/i965/brw_vs.c
  @@ -314,6 +314,8 @@ do_vs_prog(struct brw_context *brw,
program, program_size,
c.prog_data, sizeof(c.prog_data),
brw-vs.prog_offset, brw-vs.prog_data);
  +   brw-vue_map_geom_out = brw-vs.prog_data-vue_map;
  +   brw-state.dirty.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
  ralloc_free(mem_ctx);

 I think the one below in upload_vs_prog should be sufficient, since it
 always happens immediately after this.


Oops, you're right.  In fact, I think it may lead to bugs to update the VUE
map from do_vs_prog(), since do_vs_prog() is also called during
precompilation.



 
  return true;
  @@ -488,6 +490,8 @@ static void brw_upload_vs_prog(struct brw_context
 *brw)
 
 assert(success);
  }
  +   brw-vue_map_geom_out = brw-vs.prog_data-vue_map;
  +   brw-state.dirty.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
   }

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

2013-03-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

New conversion code to handle conversion from/to r11g11b10 AoS to/from
SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
(which works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of
relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion
code running in llvmpipe's generate_unswizzled_blend, which isn't well
suited for formats where the storage representation has nothing to do
with what's needed for blending (moreover, the conversion will convert
from packed AoS values, which is the storage format, to float SoA values,
because this is much more natural for the conversion, and likewise from
SoA values to packed AoS values - but the blend (which includes
trivial things like partial mask) works on AoS values, so incoming fs
values will go SoA->AoS, values from destination will go packed
AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably
isn't the most efficient way though the shuffles are probably bearable).

Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the
conversion comes from actually).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |  314 +++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |   14 ++
 src/gallium/drivers/llvmpipe/lp_screen.c|6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 +++
 4 files changed, 458 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..4fce1bc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The exponent is still biased, and the mantissa still has an implied 1,
+ * but there's no sign bit.
+ *
+ * @param src (vector) float value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ *
+ * Unlike float_to_half using accurate method here.
+ * This implements round-towards-zero (trunc) hence too large numbers get
+ * converted to largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references, below, we skip any rounding bias
+ * and do strict rounding towards zero (if I got the constants right...)
+ * - OpenGL allows rounding towards zero (though not preferred) and
+ * DX10 even seems to require it.
+ * Note that this will not try to pack the values somehow - they will
+ * look like rescaled floats (except for Inf/NaN) (but returned as
+ * (vector) int32).
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+LLVMValueRef src,
+unsigned mantissa_bits,
+unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(f32_bld, gallivm, f32_type);
+   lp_build_context_init(i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+ ((1  exponent_bits) - 1)  23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff  23);
+
+   /* ordinary number */
+   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
+   clamped = lp_build_max(f32_bld, src, zero);
+   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, );
+   /* get rid of excess mantissa bits */
+   /* really not sure about that constant */
+   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
+  ~((1  (23 - mantissa_bits)) - 1));
+
+   tmp = lp_build_and(i32_bld, clamped, i32_roundmask);
+   tmp = LLVMBuildBitCast(builder, tmp, 

Re: [Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

2013-03-21 Thread Roland Scheidegger
Ok so before someone else notices that, ignore the rgb9e5 part.
The format isn't quite what I thought it was...

Roland

Am 21.03.2013 23:28, schrieb srol...@vmware.com:
 From: Roland Scheidegger srol...@vmware.com
 
 New conversion code to handle conversion from/to r11g11b10 AoS to/from
 SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
 (which works pretty much the same as r11g11b10 except for the packing).
 (This code should also be used for texture sampling instead of
 relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
 Unfortunately a crazy amount of hacks is necessary to get the conversion
 code running in llvmpipe's generate_unswizzled_blend, which isn't well
 suited for formats where the storage representation has nothing to do
 with what's needed for blending (moreover, the conversion will convert
 from packed AoS values, which is the storage format, to float SoA values,
 because this is much more natural for the conversion, and likewise from
 SoA values to packed AoS values - but the blend (which includes
 trivial things like partial mask) works on AoS values, so incoming fs
 values will go SoA->AoS, values from destination will go packed
 AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably
 isn't the most efficient way though the shuffles are probably bearable).
 
 Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
 still need to verify Inf/NaNs (where most of the complexity in the
 conversion comes from actually).
 ---
  src/gallium/auxiliary/gallivm/lp_bld_conv.c |  314 
 +++
  src/gallium/auxiliary/gallivm/lp_bld_conv.h |   14 ++
  src/gallium/drivers/llvmpipe/lp_screen.c|6 +-
  src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 +++
  4 files changed, 458 insertions(+), 2 deletions(-)
 
 diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
 b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
 index dc3649d..4fce1bc 100644
 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
 +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
 @@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
  
  
  /**
 + * Convert float32 to a float-like value with less exponent and mantissa
 + * bits. The exponent is still biased, and the mantissa still has an implied 
 1,
 + * but there's no sign bit.
 + *
 + * @param src (vector) float value to convert
 + * @param mantissa_bits   the number of mantissa bits
 + * @param exponent_bits   the number of exponent bits
 + *
 + * Unlike float_to_half using accurate method here.
 + * This implements round-towards-zero (trunc) hence too large numbers get
 + * converted to largest representable number, not infinity.
 + * Small numbers may get converted to denorms, depending on normal
 + * float denorm handling of the cpu.
 + * Note that compared to the references, below, we skip any rounding bias
 + * and do strict rounding towards zero (if I got the constants right...)
 + * - OpenGL allows rounding towards zero (though not preferred) and
 + * DX10 even seems to require it.
 + * Note that this will not try to pack the values somehow - they will
 + * look like rescaled floats (except for Inf/NaN) (but returned as
 + * (vector) int32).
 + *
 + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 + * ref https://gist.github.com/rygorous/2156668
 + */
 +static LLVMValueRef
 +lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
 +LLVMValueRef src,
 +unsigned mantissa_bits,
 +unsigned exponent_bits)
 +{
 +   LLVMBuilderRef builder = gallivm-builder;
 +   LLVMTypeRef src_type = LLVMTypeOf(src);
 +   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
 +   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
 +   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
 +   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
 +LLVMGetVectorSize(src_type) : 1;
 +   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
 +   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
 +   struct lp_build_context f32_bld, i32_bld;
 +   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
 +
 +   lp_build_context_init(f32_bld, gallivm, f32_type);
 +   lp_build_context_init(i32_bld, gallivm, i32_type);
 +
 +   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
 + ((1  exponent_bits) - 1)  
 23);
 +   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff  23);
 +
 +   /* ordinary number */
 +   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) 
 */
 +   clamped = lp_build_max(f32_bld, src, zero);
 +   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, );
 +   /* get rid of excess mantissa bits */
 +   /* really 

[Mesa-dev] [PATCH v2 1/5] i965: Move brw_vs_prog_data::outputs_written into VUE map.

2013-03-21 Thread Paul Berry
Future patches will allow for there to be separate VUE maps when both
a geometry shader and a vertex shader are in use.  When this happens,
we will want to have correspondingly separate outputs_written
bitfields.  Moving outputs_written into the VUE map will make this
easy.

For consistency with the terminology used in the VUE map, the bitfield
is renamed to slots_valid in the process.

Reviewed-by: Eric Anholt e...@anholt.net
---
 src/mesa/drivers/dri/i965/brw_clip.c   |  2 +-
 src/mesa/drivers/dri/i965/brw_context.h|  8 +++-
 src/mesa/drivers/dri/i965/brw_gs.c |  2 +-
 src/mesa/drivers/dri/i965/brw_sf.c |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp |  9 -
 src/mesa/drivers/dri/i965/brw_vs.c | 23 ---
 src/mesa/drivers/dri/i965/brw_wm.c |  2 +-
 7 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_clip.c 
b/src/mesa/drivers/dri/i965/brw_clip.c
index d411208..e20f7c2 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -146,7 +146,7 @@ brw_upload_clip_prog(struct brw_context *brw)
/* BRW_NEW_REDUCED_PRIMITIVE */
key.primitive = brw-intel.reduced_primitive;
/* CACHE_NEW_VS_PROG (also part of VUE map) */
-   key.attrs = brw-vs.prog_data-outputs_written;
+   key.attrs = brw-vs.prog_data-vue_map.slots_valid;
/* _NEW_LIGHT */
key.do_flat_shading = (ctx-Light.ShadeModel == GL_FLAT);
key.pv_first = (ctx-Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 9f1aaf5..fe6e639 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -354,6 +354,13 @@ typedef enum
  */
 struct brw_vue_map {
/**
+* Bitfield representing all varying slots that are (a) stored in this VUE
+* map, and (b) actually written by the shader.  Does not include any of
+* the additional varying slots defined in brw_varying_slot.
+*/
+   GLbitfield64 slots_valid;
+
+   /**
 * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that 
are
 * not stored in a slot (because they are not written, or because
 * additional processing is applied before storing them in the VUE), the
@@ -437,7 +444,6 @@ struct brw_vs_prog_data {
GLuint curb_read_length;
GLuint urb_read_length;
GLuint total_grf;
-   GLbitfield64 outputs_written;
GLuint nr_params;   /** number of float params/constants */
GLuint nr_pull_params; /** number of dwords referenced by pull_param[] */
GLuint total_scratch;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c 
b/src/mesa/drivers/dri/i965/brw_gs.c
index 1328984..e755a10 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -167,7 +167,7 @@ static void populate_key( struct brw_context *brw,
memset(key, 0, sizeof(*key));
 
/* CACHE_NEW_VS_PROG (part of VUE map) */
-   key-attrs = brw-vs.prog_data-outputs_written;
+   key-attrs = brw-vs.prog_data-vue_map.slots_valid;
 
/* BRW_NEW_PRIMITIVE */
key-primitive = brw-primitive;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c 
b/src/mesa/drivers/dri/i965/brw_sf.c
index fdc6bd7..c8b7033 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -145,7 +145,7 @@ brw_upload_sf_prog(struct brw_context *brw)
/* Populate the key, noting state dependencies:
 */
/* CACHE_NEW_VS_PROG */
-   key.attrs = brw-vs.prog_data-outputs_written; 
+   key.attrs = brw-vs.prog_data-vue_map.slots_valid;
 
/* BRW_NEW_REDUCED_PRIMITIVE */
switch (brw-intel.reduced_primitive) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 60575d7..b0a0dd6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2402,7 +2402,7 @@ void
 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
 {
if (intel-gen  6 
-   ((c-prog_data.outputs_written  BITFIELD64_BIT(VARYING_SLOT_PSIZ)) ||
+   ((c-prog_data.vue_map.slots_valid  VARYING_BIT_PSIZ) ||
 c-key.userclip_active || brw-has_negative_rhw_bug)) {
   dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
   dst_reg header1_w = header1;
@@ -2411,7 +2411,7 @@ vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
 
   emit(MOV(header1, 0u));
 
-  if (c-prog_data.outputs_written  BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
+  if (c-prog_data.vue_map.slots_valid  VARYING_BIT_PSIZ) {
 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
 
 current_annotation = Point size;
@@ -2456,7 +2456,7 @@ vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
   emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
} else {
   emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
-  if 

[Mesa-dev] [PATCH v2 2/5] i965: Store the geometry output VUE map in brw_context.

2013-03-21 Thread Paul Berry
Currently, the GPU pipeline has one active VUE map in effect at any
given time--the one representing the layout of vertex data coming from
the vertex shader.  However, when geometry shaders are added, they
will have their own independent VUE map.  Later pipeline stages (clip,
sf, fs) will need to consult the geometry shader VUE map if a geometry
shader is in use, and the vertex shader VUE map otherwise.

This patch adds a new field to brw_context, vue_map_geom_out, which
contains the VUE map that should be used by later pipeline stages.  It
also adds a new state flag, BRW_NEW_VUE_MAP_GEOM_OUT, which is
signalled whenever the contents of the VUE map change.

Since we don't support geometry shaders yet, vue_map_geom_out is
currently set only by the brw_vs_prog state atom.

v2: Don't set vue_map_geom_out in do_vs_prog--that's redundant and
possibly problematic for precompiles.  Only set it in
brw_upload_vs_prog.  Also, make a copy instead of using a
pointer--this makes it possible to detect when the VUE map hasn't
changed, so we can avoid redundant state uploads.
---
 src/mesa/drivers/dri/i965/brw_context.h  | 11 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c |  1 +
 src/mesa/drivers/dri/i965/brw_vs.c   |  5 +
 3 files changed, 17 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index fe6e639..11722e7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -153,6 +153,7 @@ enum brw_state_id {
BRW_STATE_PROGRAM_CACHE,
BRW_STATE_STATE_BASE_ADDRESS,
BRW_STATE_SOL_INDICES,
+   BRW_STATE_VUE_MAP_GEOM_OUT,
 };
 
 #define BRW_NEW_URB_FENCE   (1  BRW_STATE_URB_FENCE)
@@ -182,6 +183,7 @@ enum brw_state_id {
 #define BRW_NEW_PROGRAM_CACHE  (1  BRW_STATE_PROGRAM_CACHE)
 #define BRW_NEW_STATE_BASE_ADDRESS (1  BRW_STATE_STATE_BASE_ADDRESS)
 #define BRW_NEW_SOL_INDICES(1  BRW_STATE_SOL_INDICES)
+#define BRW_NEW_VUE_MAP_GEOM_OUT   (1  BRW_STATE_VUE_MAP_GEOM_OUT)
 
 struct brw_state_flags {
/** State update flags signalled by mesa internals */
@@ -917,6 +919,15 @@ struct brw_context
   uint32_t offset;
} sampler;
 
+   /**
+* Layout of vertex data exiting the geometry portion of the pipeline.
+* This comes from the geometry shader if one exists, otherwise from the
+* vertex shader.
+*
+* BRW_NEW_VUE_MAP_GEOM_OUT is flagged when the VUE map changes.
+*/
+   struct brw_vue_map vue_map_geom_out;
+
struct {
   struct brw_vs_prog_data *prog_data;
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 41dfdc3..5c5c05e 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -376,6 +376,7 @@ static struct dirty_bit_map brw_bits[] = {
DEFINE_BIT(BRW_NEW_PROGRAM_CACHE),
DEFINE_BIT(BRW_NEW_STATE_BASE_ADDRESS),
DEFINE_BIT(BRW_NEW_SOL_INDICES),
+   DEFINE_BIT(BRW_NEW_VUE_MAP_GEOM_OUT),
{0, 0, 0}
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c 
b/src/mesa/drivers/dri/i965/brw_vs.c
index d875703..c8ca018 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -488,6 +488,11 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 
   assert(success);
}
+   if (memcmp(brw-vs.prog_data-vue_map, brw-vue_map_geom_out,
+  sizeof(brw-vue_map_geom_out)) != 0) {
+  brw-vue_map_geom_out = brw-vs.prog_data-vue_map;
+  brw-state.dirty.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
+   }
 }
 
 /* See brw_vs.c:
-- 
1.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 3/5] i965: Use brw.vue_map_geom_out instead of VS output VUE map where appropriate.

2013-03-21 Thread Paul Berry
This patch modifies post-GS pipeline stages (transform feedback, clip,
sf, fs) to refer to the VUE map through brw->vue_map_geom_out rather
than brw->vs.prog_data->vue_map.  This ensures that when geometry
shader support is added, these pipeline stages will consult the
geometry shader output VUE map when appropriate, rather than the
vertex shader output VUE map.
---
 src/mesa/drivers/dri/i965/brw_clip.c   |  7 +++
 src/mesa/drivers/dri/i965/brw_sf.c |  7 +++
 src/mesa/drivers/dri/i965/brw_state.h  |  2 +-
 src/mesa/drivers/dri/i965/brw_wm.c |  6 +++---
 src/mesa/drivers/dri/i965/gen6_sf_state.c  | 10 +-
 src/mesa/drivers/dri/i965/gen7_sf_state.c  |  8 
 src/mesa/drivers/dri/i965/gen7_sol_state.c | 14 +++---
 7 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_clip.c 
b/src/mesa/drivers/dri/i965/brw_clip.c
index e20f7c2..fa7e85d 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -69,7 +69,7 @@ static void compile_clip_prog( struct brw_context *brw,
c.func.single_program_flow = 1;
 
c.key = *key;
-   c.vue_map = brw-vs.prog_data-vue_map;
+   c.vue_map = brw-vue_map_geom_out;
 
/* nr_regs is the number of registers filled by reading data from the VUE.
 * This program accesses the entire VUE, so nr_regs needs to be the size of
@@ -146,7 +146,7 @@ brw_upload_clip_prog(struct brw_context *brw)
/* BRW_NEW_REDUCED_PRIMITIVE */
key.primitive = brw-intel.reduced_primitive;
/* CACHE_NEW_VS_PROG (also part of VUE map) */
-   key.attrs = brw-vs.prog_data-vue_map.slots_valid;
+   key.attrs = brw-vue_map_geom_out.slots_valid;
/* _NEW_LIGHT */
key.do_flat_shading = (ctx-Light.ShadeModel == GL_FLAT);
key.pv_first = (ctx-Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
@@ -258,8 +258,7 @@ const struct brw_tracked_state brw_clip_prog = {
_NEW_TRANSFORM |
_NEW_POLYGON | 
_NEW_BUFFERS),
-  .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
-  .cache = CACHE_NEW_VS_PROG
+  .brw   = (BRW_NEW_REDUCED_PRIMITIVE | BRW_NEW_VUE_MAP_GEOM_OUT)
},
.emit = brw_upload_clip_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c 
b/src/mesa/drivers/dri/i965/brw_sf.c
index c8b7033..fc36961 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -65,7 +65,7 @@ static void compile_sf_prog( struct brw_context *brw,
brw_init_compile(brw, c.func, mem_ctx);
 
c.key = *key;
-   c.vue_map = brw-vs.prog_data-vue_map;
+   c.vue_map = brw-vue_map_geom_out;
if (c.key.do_point_coord) {
   /*
* gl_PointCoord is a FS instead of VS builtin variable, thus it's
@@ -145,7 +145,7 @@ brw_upload_sf_prog(struct brw_context *brw)
/* Populate the key, noting state dependencies:
 */
/* CACHE_NEW_VS_PROG */
-   key.attrs = brw-vs.prog_data-vue_map.slots_valid;
+   key.attrs = brw-vue_map_geom_out.slots_valid;
 
/* BRW_NEW_REDUCED_PRIMITIVE */
switch (brw-intel.reduced_primitive) {
@@ -216,8 +216,7 @@ const struct brw_tracked_state brw_sf_prog = {
.dirty = {
   .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT |
 _NEW_TRANSFORM | _NEW_BUFFERS | _NEW_PROGRAM),
-  .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
-  .cache = CACHE_NEW_VS_PROG
+  .brw   = (BRW_NEW_REDUCED_PRIMITIVE | BRW_NEW_VUE_MAP_GEOM_OUT)
},
.emit = brw_upload_sf_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
b/src/mesa/drivers/dri/i965/brw_state.h
index 02ce57b..1f5e18a 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -227,7 +227,7 @@ void upload_default_color(struct brw_context *brw,
 
 /* gen6_sf_state.c */
 uint32_t
-get_attr_override(struct brw_vue_map *vue_map, int urb_entry_read_offset,
+get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
   int fs_attr, bool two_side_color, uint32_t *max_source_attr);
 
 #ifdef __cplusplus
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c 
b/src/mesa/drivers/dri/i965/brw_wm.c
index e7e9ddc..6053f94 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -481,7 +481,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
/* CACHE_NEW_VS_PROG */
if (intel-gen  6)
-  key-vp_outputs_written = brw-vs.prog_data-vue_map.slots_valid;
+  key-vp_outputs_written = brw-vue_map_geom_out.slots_valid;
 
/* The unique fragment program ID */
key-program_string_id = fp-id;
@@ -524,8 +524,8 @@ const struct brw_tracked_state brw_wm_prog = {
_NEW_MULTISAMPLE),
   .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
BRW_NEW_WM_INPUT_DIMENSIONS |
-   BRW_NEW_REDUCED_PRIMITIVE),
-  .cache = CACHE_NEW_VS_PROG,
+   BRW_NEW_REDUCED_PRIMITIVE |
+BRW_NEW_VUE_MAP_GEOM_OUT)
},
.emit = 

[Mesa-dev] [PATCH v2 4/5] i965/fs: Rename vp_outputs_written to input_slots_valid.

2013-03-21 Thread Paul Berry
With the introduction of geometry shaders, fragment inputs will no
longer come exclusively from the vertex shader; sometimes they come
from the geometry shader.  So the name vp_outputs_written will
become a misnomer.  This patch renames vp_outputs_written to
input_slots_valid, to reflect the true meaning of the bitfield from
the fragment shader's point of view: it indicates which of the
possible input slots contain valid data that was written by the
previous shader stage.

Reviewed-by: Eric Anholt e...@anholt.net
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 +++---
 src/mesa/drivers/dri/i965/brw_wm.c   | 6 +++---
 src/mesa/drivers/dri/i965/brw_wm.h   | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5a5bfeb..ecce66b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1264,7 +1264,7 @@ fs_visitor::calculate_urb_setup()
  if (i == VARYING_SLOT_PSIZ)
 continue;
 
-if (c-key.vp_outputs_written  BITFIELD64_BIT(i)) {
+if (c-key.input_slots_valid  BITFIELD64_BIT(i)) {
/* The back color slot is skipped when the front color is
 * also written to.  In addition, some slots can be
 * written in the vertex shader and not read in the
@@ -2995,7 +2995,7 @@ brw_fs_precompile(struct gl_context *ctx, struct 
gl_shader_program *prog)
}
 
if (intel-gen  6)
-  key.vp_outputs_written |= BITFIELD64_BIT(VARYING_SLOT_POS);
+  key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
 
for (int i = 0; i  VARYING_SLOT_MAX; i++) {
   if (!(fp-Base.InputsRead  BITFIELD64_BIT(i)))
@@ -3006,7 +3006,7 @@ brw_fs_precompile(struct gl_context *ctx, struct 
gl_shader_program *prog)
 
   if (intel-gen  6) {
  if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
-key.vp_outputs_written |= BITFIELD64_BIT(i);
+key.input_slots_valid |= BITFIELD64_BIT(i);
   }
}
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c 
b/src/mesa/drivers/dri/i965/brw_wm.c
index 6053f94..19a95bd 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -289,8 +289,8 @@ brw_wm_debug_recompile(struct brw_context *brw,
   old_key-proj_attrib_mask, key-proj_attrib_mask);
found |= key_debug(intel, renderbuffer height,
   old_key-drawable_height, key-drawable_height);
-   found |= key_debug(intel, vertex shader outputs,
-  old_key-vp_outputs_written, key-vp_outputs_written);
+   found |= key_debug(intel, input slots valid,
+  old_key-input_slots_valid, key-input_slots_valid);
 
found |= brw_debug_recompile_sampler_key(intel, old_key-tex, key-tex);
 
@@ -481,7 +481,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
/* CACHE_NEW_VS_PROG */
if (intel-gen  6)
-  key-vp_outputs_written = brw-vue_map_geom_out.slots_valid;
+  key-input_slots_valid = brw-vue_map_geom_out.slots_valid;
 
/* The unique fragment program ID */
key-program_string_id = fp-id;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h 
b/src/mesa/drivers/dri/i965/brw_wm.h
index 8eb71de..f43d42c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -70,7 +70,7 @@ struct brw_wm_prog_key {
GLbitfield64 proj_attrib_mask; /** one bit per fragment program attribute 
*/
 
GLushort drawable_height;
-   GLbitfield64 vp_outputs_written;
+   GLbitfield64 input_slots_valid;
GLuint program_string_id:32;
 
struct brw_sampler_prog_key_data tex;
-- 
1.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 5/5] i965: Shrink brw_vue_map struct.

2013-03-21 Thread Paul Berry
This patch changes the arrays in brw_vue_map (which only ever contain
values from -1 to 58) from ints to signed chars.  This reduces the
size of the struct from 488 bytes to 136 bytes.
---
 src/mesa/drivers/dri/i965/brw_context.h | 4 ++--
 src/mesa/drivers/dri/i965/brw_vs.c  | 6 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 11722e7..506a957 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -368,7 +368,7 @@ struct brw_vue_map {
 * additional processing is applied before storing them in the VUE), the
 * value is -1.
 */
-   int vert_result_to_slot[BRW_VARYING_SLOT_MAX];
+   signed char vert_result_to_slot[BRW_VARYING_SLOT_MAX];
 
/**
 * Map from VUE slot to gl_varying_slot value.  For slots that do not
@@ -379,7 +379,7 @@ struct brw_vue_map {
 * simplifies code that uses the value stored in slot_to_vert_result to
 * create a bit mask).
 */
-   int slot_to_vert_result[BRW_VARYING_SLOT_MAX];
+   signed char slot_to_vert_result[BRW_VARYING_SLOT_MAX];
 
/**
 * Total number of VUE slots in use
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c 
b/src/mesa/drivers/dri/i965/brw_vs.c
index c8ca018..7e941dd 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -66,6 +66,12 @@ brw_compute_vue_map(struct brw_context *brw, struct 
brw_vs_compile *c,
vue_map-slots_valid = slots_valid;
int i;
 
+   /* Make sure that the values we store in vue_map->vert_result_to_slot and
+* vue_map->slot_to_vert_result won't overflow the signed chars that are
+* used to store them.
+*/
+   STATIC_ASSERT(BRW_VARYING_SLOT_MAX = 128);
+
vue_map-num_slots = 0;
for (i = 0; i  BRW_VARYING_SLOT_MAX; ++i) {
   vue_map-vert_result_to_slot[i] = -1;
-- 
1.8.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

2013-03-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

New conversion code to handle conversion from/to r11g11b10 AoS to/from
SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
(which works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of
relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion
code running in llvmpipe's generate_unswizzled_blend, which isn't well
suited for formats where the storage representation has nothing to do
with what's needed for blending (moreover, the conversion will convert
from packed AoS values, which is the storage format, to float SoA values,
because this is much more natural for the conversion, and likewise from
SoA values to packed AoS values - but the blend (which includes
trivial things like partial mask) works on AoS values, so incoming fs
values will go SoA->AoS, values from destination will go packed
AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably
isn't the most efficient way though the shuffles are probably bearable).

Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the
conversion comes from actually).

v2: drop the (very bogus) rgb9e5 part, and do component extraction
in the helper code for r11g11b10 to float conversion, making the code
slightly more compact (suggested by Jose), now that there are no other
callers left this works quite well. (Could do the same for the
opposite way but it's less than ideal there, final part of packing
needs to be done in caller anyway and there'd be another conditional.)
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |  250 +++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |9 +
 src/gallium/drivers/llvmpipe/lp_screen.c|6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 ++
 4 files changed, 389 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..06d64c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,6 +155,256 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The exponent is still biased, and the mantissa still has an implied 1,
+ * but there's no sign bit.
+ *
+ * @param src (vector) float value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ *
+ * Unlike float_to_half using accurate method here.
+ * This implements round-towards-zero (trunc) hence too large numbers get
+ * converted to largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references, below, we skip any rounding bias
+ * and do strict rounding towards zero (if I got the constants right...)
+ * - OpenGL allows rounding towards zero (though not preferred) and
+ * DX10 even seems to require it.
+ * Note that this will not try to pack the values somehow - they will
+ * look like rescaled floats (except for Inf/NaN) (but returned as
+ * (vector) int32).
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+LLVMValueRef src,
+unsigned mantissa_bits,
+unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(f32_bld, gallivm, f32_type);
+   lp_build_context_init(i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+ ((1  exponent_bits) - 1)  23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff  23);
+
+   /* ordinary number */
+   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
+   clamped = lp_build_max(f32_bld,