Re: [Mesa-dev] [PATCH] ac, radv: fix removing the vec3 restriction on SI

2019-06-03 Thread Matt Arsenault


> On Jun 3, 2019, at 9:13 AM, Samuel Pitoiset  wrote:
> 
> I thought LLVM was able to handle that itself but actually it
> does not. That means we shouldn't try to emit vec3 on SI because
> it's unsupported.
> 

It should. Can you file a bug with an example that doesn’t work?



> Fixes: 6970a9a6ca9 ("ac,radv: remove the vec3 restriction with LLVM 9+")
> Signed-off-by: Samuel Pitoiset 
> ---
> src/amd/common/ac_llvm_build.c| 12 ++--
> src/amd/common/ac_llvm_util.h |  9 +
> src/amd/common/ac_nir_to_llvm.c   |  3 ++-
> src/amd/vulkan/radv_nir_to_llvm.c |  2 +-
> 4 files changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 613c1eef942..7f5c8ef873c 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -1167,7 +1167,7 @@ ac_build_llvm8_buffer_store_common(struct 
> ac_llvm_context *ctx,
>   args[idx++] = voffset ? voffset : ctx->i32_0;
>   args[idx++] = soffset ? soffset : ctx->i32_0;
>   args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
> - unsigned func = HAVE_LLVM < 0x900 && num_channels == 3 ? 4 : 
> num_channels;
> + unsigned func = !ac_has_vec3_support(ctx->chip_class) && num_channels 
> == 3 ? 4 : num_channels;
>   const char *indexing_kind = structurized ? "struct" : "raw";
>   char name[256], type_name[8];
> 
> @@ -1227,7 +1227,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
> {
>   /* Split 3 channel stores, because only LLVM 9+ support 3-channel
>* intrinsics. */
> - if (num_channels == 3 && HAVE_LLVM < 0x900) {
> + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class)) {
>   LLVMValueRef v[3], v01;
> 
>   for (int i = 0; i < 3; i++) {
> @@ -1354,7 +1354,7 @@ ac_build_llvm8_buffer_load_common(struct 
> ac_llvm_context *ctx,
>   args[idx++] = voffset ? voffset : ctx->i32_0;
>   args[idx++] = soffset ? soffset : ctx->i32_0;
>   args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
> - unsigned func = HAVE_LLVM < 0x900 && num_channels == 3 ? 4 : 
> num_channels;
> + unsigned func = !ac_has_vec3_support(ctx->chip_class) && num_channels 
> == 3 ? 4 : num_channels;
>   const char *indexing_kind = structurized ? "struct" : "raw";
>   char name[256], type_name[8];
> 
> @@ -1420,7 +1420,7 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
>   if (num_channels == 1)
>   return result[0];
> 
> - if (num_channels == 3 && HAVE_LLVM < 0x900)
> + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class))
>   result[num_channels++] = LLVMGetUndef(ctx->f32);
>   return ac_build_gather_values(ctx, result, num_channels);
>   }
> @@ -1512,7 +1512,7 @@ ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
>   args[idx++] = soffset ? soffset : ctx->i32_0;
>   args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
>   args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
> - unsigned func = HAVE_LLVM < 0x900 && num_channels == 3 ? 4 : 
> num_channels;
> + unsigned func = !ac_has_vec3_support(ctx->chip_class) && num_channels 
> == 3 ? 4 : num_channels;
>   const char *indexing_kind = structurized ? "struct" : "raw";
>   char name[256], type_name[8];
> 
> @@ -2011,7 +2011,7 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context 
> *ctx,
>   args[idx++] = soffset ? soffset : ctx->i32_0;
>   args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
>   args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
> - unsigned func = HAVE_LLVM < 0x900 && num_channels == 3 ? 4 : 
> num_channels;
> + unsigned func = !ac_has_vec3_support(ctx->chip_class) && num_channels 
> == 3 ? 4 : num_channels;
>   const char *indexing_kind = structurized ? "struct" : "raw";
>   char name[256], type_name[8];
> 
> diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
> index ca00540da80..a45647a3360 100644
> --- a/src/amd/common/ac_llvm_util.h
> +++ b/src/amd/common/ac_llvm_util.h
> @@ -146,6 +146,15 @@ bool ac_compile_module_to_binary(struct 
> ac_compiler_passes *p, LLVMModuleRef mod
> void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr);
> void ac_enable_global_isel(LLVMTargetMachineRef tm);
> 
> +static inline bool
> +ac_has_vec3_support(enum chip_class chip)
> +{
> + if (chip == GFX6)
> + return false;
> +
> + return HAVE_LLVM >= 0x900;
> +}
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 51f92a6b062..429dac63d63 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -1576,7 +1576,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
> 
>   /* Due to an LLVM 

Re: [Mesa-dev] [PATCH] radv: enable denorms for 64-bit and 16-bit floats

2017-12-28 Thread Matt Arsenault


> On Dec 28, 2017, at 16:55, Samuel Pitoiset  wrote:
> 
> Similar to RadeonSI.
> 
> This fixes:
> dEQP-VK.image.texel_view_compatible.graphic.basic.attachment_read.bc*r16g16b16a16_sfloat
> dEQP-VK.image.extended_usage_bit.attachment_write.r16_sfloat
> 
> Signed-off-by: Samuel Pitoiset 
> ---
> src/amd/common/ac_nir_to_llvm.c | 14 ++
> 1 file changed, 14 insertions(+)
> 
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index d9f2cb408c..9d9a1f911b 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -6879,6 +6879,20 @@ static void 
> ac_compile_llvm_module(LLVMTargetMachineRef tm,
>   /* +3 for scratch wave offset and VCC */
>   config->num_sgprs = MAX2(config->num_sgprs,
>shader_info->num_input_sgprs + 3);
> +
> + /* Enable 64-bit and 16-bit denormals, because there is no performance
> +  * cost.
> +  *
> +  * If denormals are enabled, all floating-point output modifiers are
> +  * ignored.
> +  *
> +  * Don't enable denormals for 32-bit floats, because:
> +  * - Floating-point output modifiers would be ignored by the hw.
> +  * - Some opcodes don't support denormals, such as v_mad_f32. We would
> +  *   have to stop using those.
> +  * - SI & CI would be very slow.
> +  */
> + config->float_mode |= V_00B028_FP_64_DENORMS;
> }

This is set in the program binary. You should use that directly rather than 
ignoring it.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 0/5] Volatile and invariant LDS memory ops

2017-11-09 Thread Matt Arsenault

> On Nov 10, 2017, at 07:41, Marek Olšák  wrote:
> 
> Hi,
> 
> This fixes the TCS gl_ClipDistance piglit failure that was uncovered
> by a recent LLVM change. The solution is to set volatile on loads
> and stores to enforce proper ordering.
> 
> Please review.
> 


LDS accesses certainly should not all be volatile. This kills all 
optimizations, such as the formation of ds_read2_b32. What ordering issue are you 
having?

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv: emit fmuladd instead of fma to llvm.

2017-10-04 Thread Matt Arsenault

> On Oct 4, 2017, at 12:50, Marek Olšák  wrote:
> 
> The LLVM backend selects MAD (unfused) for fmuladd, and FMA (fused) for fma.

For f64 and f16, by default it will emit an FMA since mad doesn’t support 
denorms.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv: lower ffma in nir.

2017-10-03 Thread Matt Arsenault

> On Oct 3, 2017, at 13:58, Dave Airlie  wrote:
> 
> From: Dave Airlie 
> 
> So it appears the Vulkan SPIR-V fma opcode can be equivalent to a
> mad operation, and the fma hw opcode on AMD hw is issued like a double
> opcode so is slower. Also the radeonsi stack does this.
> 
> This appears to improve performance on a number of games from Feral,
> and thanks to Feral for noticing the problem.
> 
> Signed-off-by: Dave Airlie 
> ---
> src/amd/vulkan/radv_shader.c | 1 +
> 1 file changed, 1 insertion(+)
> 
> diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
> index ca0ad2d..a37345b 100644
> --- a/src/amd/vulkan/radv_shader.c
> +++ b/src/amd/vulkan/radv_shader.c
> @@ -64,6 +64,7 @@ static const struct nir_shader_compiler_options nir_options 
> = {
>   .lower_unpack_unorm_4x8 = true,
>   .lower_extract_byte = true,
>   .lower_extract_word = true,
> + .lower_ffma = true,
>   .max_unroll_iterations = 32
> };
> 
> -- 
> 2.9.5
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

If it doesn’t matter, this should emit llvm.fmuladd. The backend decides what’s 
best to do based on the specific target, whether it’s an FMA or MAD.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/6] ac/nir: rewrite local variable handling

2017-07-07 Thread Matt Arsenault

> On Jul 6, 2017, at 19:02, Connor Abbott <cwabbo...@gmail.com> wrote:
> 
> On Thu, Jul 6, 2017 at 6:36 PM, Matt Arsenault <arse...@gmail.com> wrote:
>> 
>> On Jul 6, 2017, at 18:31, Connor Abbott <cwabbo...@gmail.com> wrote:
>> 
>> After looking into it some more, I think LLVM won't promote allocas to
>> registers at all when there are non-constant indices in the mix, and
>> fixing it seems kinda involved. I guess a better solution for now
>> 
>> 
>> AMDGPUPromoteAlloca does this, but it doesn’t happen very often
> 
> Could we just bump the heuristic on maximum size there to match
> radeonsi? That way we wouldn't need to have these heuristics in
> radeonsi and radv. As-is, we're second-guessing the backend.

There’s also a vague plan to replace this with a machine level pass later that 
has more knowledge of register pressure. The current pass is pretty stupid and 
doesn’t even attempt to decide if it’s a good idea for a specific alloca.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/6] ac/nir: rewrite local variable handling

2017-07-06 Thread Matt Arsenault

> On Jul 6, 2017, at 18:31, Connor Abbott  wrote:
> 
> After looking into it some more, I think LLVM won't promote allocas to
> registers at all when there are non-constant indices in the mix, and
> fixing it seems kinda involved. I guess a better solution for now

AMDGPUPromoteAlloca does this, but it doesn’t happen very often.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/4] ac/llvm: set xnack like radeonsi does.

2017-07-06 Thread Matt Arsenault

> On Jul 6, 2017, at 13:08, Dave Airlie <airl...@gmail.com> wrote:
> 
> On 7 July 2017 at 05:07, Matt Arsenault <arse...@gmail.com> wrote:
>> 
>>> On Jul 5, 2017, at 19:09, Dave Airlie <airl...@gmail.com> wrote:
>>> 
>>> From: Dave Airlie <airl...@redhat.com>
>>> 
>>> Use family, but only set xnack+ for gfx9.
>>> 
>> 
>> The driver shouldn’t be explicitly setting this. This should be set as part 
>> of the subtarget chosen
> 
> Well I expect Marek knows more (I'm just aligning the drivers).
> 
> If I had to guess, it's probably because Carrizo sets xnack in the
> subtarget but we don't want to use it.
> 
> Dave.

This is the problematic part: if you explicitly disable a subtarget feature 
when it is set in the subtarget’s feature list, it for some reason disables all 
of the features in the subtarget.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/4] ac/llvm: set xnack like radeonsi does.

2017-07-06 Thread Matt Arsenault

> On Jul 5, 2017, at 19:09, Dave Airlie  wrote:
> 
> From: Dave Airlie 
> 
> Use family, but only set xnack+ for gfx9.
> 

The driver shouldn’t be explicitly setting this. This should be set as part of 
the subtarget chosen.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi/gfx9: compile shaders with +xnack

2017-05-18 Thread Matt Arsenault

> On May 18, 2017, at 22:46, Marek Olšák  wrote:
> 
> From: Marek Olšák 
> 
> so that LLVM doesn't allocate SGPRs where XNACK is.
> 
> Cc: 17.1 

You shouldn’t be explicitly enabling xnack. This sounds like a workaround for a 
backend bug, and this has other consequences than changing the reserved 
registers. Do you have an example of where this is a problem?
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv: flush f32->f16 conversion denormals to zero.

2017-03-16 Thread Matt Arsenault

> On Mar 16, 2017, at 20:02, Dave Airlie <airl...@gmail.com> wrote:
> 
> From: Dave Airlie <airl...@redhat.com>
> 
> SPIR-V defines the f32->f16 operation as flushing denormals to 0,
> this compares the class using amd class opcode.
> 
> Thanks to Matt Arsenault for figuring it out.
> 
> This fixes:
> dEQP-VK.spirv_assembly.instruction.compute.opquantize.flush_to_zero
> 
> Signed-off-by: Dave Airlie <airl...@redhat.com>
> ---
> src/amd/common/ac_nir_to_llvm.c |  9 -
> src/amd/common/sid.h| 13 +
> 2 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 77e3a85..ac80677 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -1027,11 +1027,18 @@ static LLVMValueRef emit_f2f16(struct 
> nir_to_llvm_context *ctx,
> {
>   LLVMValueRef result;
>   LLVMValueRef cond;
> + LLVMValueRef args[2];
>   src0 = to_float(ctx, src0);
>   result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, "");
> - result = ac_build_intrinsic(&ctx->ac, "llvm.canonicalize.f16", 
> ctx->f16, &result, 1, AC_FUNC_ATTR_READNONE);
> + LLVMValueRef mask = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, 
> false);
> + 

I don’t think you need the canonicalize here. This will also only work on VI+, 
which supports f16 instructions.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 10/24] radeonsi: replace SI.packf16 with amdgcn.cvt.pkrtz

2017-02-25 Thread Matt Arsenault

> On Feb 25, 2017, at 15:58, Marek Olšák  wrote:
> 
> }
> +
> +LLVMValueRef ac_emit_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
> +LLVMValueRef args[2])
> +{
> + if (HAVE_LLVM >= 0x0500) {
> + LLVMTypeRef v2f16 =
> + LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
> + LLVMValueRef res =
> + ac_emit_llvm_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
> +v2f16, args, 2,
> +AC_FUNC_ATTR_READNONE);
> + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
> + }
> +
> + return ac_emit_llvm_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
> +   AC_FUNC_ATTR_READNONE |
> +   AC_FUNC_ATTR_LEGACY);
> +}
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index f57acc2..e6bb

I would probably swap where you do the bitcast here, and cast the legacy 
intrinsic to <2 x half>. With the pkrtz -> exp.compr pattern, you’ll be 
emitting a cast to i32 and back. While that will be cleaned up, you’re wasting 
some compile time/memory doing so.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv/ac: enable loop unrolling.

2017-02-24 Thread Matt Arsenault

> On Feb 24, 2017, at 14:39, Marek Olšák <mar...@gmail.com> wrote:
> 
> On Fri, Feb 24, 2017 at 7:20 PM, Matt Arsenault <arse...@gmail.com> wrote:
>> 
>> On Feb 24, 2017, at 01:45, Marek Olšák <mar...@gmail.com> wrote:
>> 
>> The main requirement is that if there is indirect indexing inside a
>> loop, we always want to unroll the whole loop to get rid of the
>> indexing, which can decrease scratch usage.
>> 
>> Marek
>> 
>> We boost the unroll thresholds when there is private memory indexed by the
>> induction variable. See AMDGPUTTIImpl::getUnrollingPreferences
> 
> When Samuel Pitoiset was experimenting with the same code as this
> patch but for radeonsi, getUnrollingPreferences wasn't even getting
> called when unrolling. I guess he eventually gave up or didn't see any
> positive effect from it.
> 
> Marek

Then there’s a bug somewhere. It should be getting called.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv/ac: enable loop unrolling.

2017-02-24 Thread Matt Arsenault

> On Feb 24, 2017, at 01:45, Marek Olšák  wrote:
> 
> The main requirement is that if there is indirect indexing inside a
> loop, we always want to unroll the whole loop to get rid of the
> indexing, which can decrease scratch usage.
> 
> Marek
We boost the unroll thresholds when there is private memory indexed by the 
induction variable. See AMDGPUTTIImpl::getUnrollingPreferences.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv/ac: enable loop unrolling.

2017-02-23 Thread Matt Arsenault

> On Feb 23, 2017, at 19:44, Dave Airlie <airl...@gmail.com> wrote:
> 
> On 24 February 2017 at 13:36, Matt Arsenault <arse...@gmail.com 
> <mailto:arse...@gmail.com>> wrote:
>> 
>> On Feb 23, 2017, at 19:27, Dave Airlie <airl...@gmail.com> wrote:
>> 
>> +static void set_unroll_metadata(struct nir_to_llvm_context *ctx,
>> +LLVMValueRef br)
>> +{
>> + unsigned kind = LLVMGetMDKindIDInContext(ctx->context, "llvm.loop", 9);
>> + LLVMValueRef md_unroll;
>> + LLVMValueRef full_arg = LLVMMDStringInContext(ctx->context,
>> "llvm.loop.unroll.full", 21);
>> + LLVMValueRef full = LLVMMDNodeInContext(ctx->context, &full_arg, 1);
>> +
>> + LLVMValueRef md_args[] = {NULL, full};
>> + md_unroll = LLVMMDNodeInContext(ctx->context, md_args, 2);
>> + ac_metadata_point_op0_to_itself(md_unroll);
>> +
>> + LLVMSetMetadata(br, kind, md_unroll);
>> +}
>> +
>> 
>> 
>> Why are you forcing full unrolling of all loops?
> 
> Because I copied Marek's code with little idea of what llvm does.
> 
> Should I just drop the full bits, perhaps set a llvm.loop.unroll.count = 32?
> 
> Dave.

The question is more: why are you using the unroll metadata at all? It’s for 
implementing user hints like pragma unroll. By default the backend heuristics 
should be making these decisions. If this is helping benchmarks, then that’s a 
data point that we need to play with those heuristics and increase the 
thresholds or something.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv/ac: enable loop unrolling.

2017-02-23 Thread Matt Arsenault

> On Feb 23, 2017, at 19:27, Dave Airlie  wrote:
> 
> +static void set_unroll_metadata(struct nir_to_llvm_context *ctx,
> +LLVMValueRef br)
> +{
> + unsigned kind = LLVMGetMDKindIDInContext(ctx->context, "llvm.loop", 9);
> + LLVMValueRef md_unroll;
> + LLVMValueRef full_arg = LLVMMDStringInContext(ctx->context, 
> "llvm.loop.unroll.full", 21);
> + LLVMValueRef full = LLVMMDNodeInContext(ctx->context, &full_arg, 1);
> +
> + LLVMValueRef md_args[] = {NULL, full};
> + md_unroll = LLVMMDNodeInContext(ctx->context, md_args, 2);
> + ac_metadata_point_op0_to_itself(md_unroll);
> +
> + LLVMSetMetadata(br, kind, md_unroll);
> +}
> +

Why are you forcing full unrolling of all loops?

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/1] clover: Dump linked module to a different file

2017-02-22 Thread Matt Arsenault

> On Feb 22, 2017, at 07:51, Jan Vesely  wrote:
> 
> This allows passing the generated files directly to llc or bugpoint.
> Note that if a program links multiple binaries, they will still be in the same 
> file; the module name is "link".

Can you add a counter ID or something to ensure unique files?

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: allow unaligned vertex buffer offsets and strides on CIK-VI

2017-02-14 Thread Matt Arsenault

> On Feb 13, 2017, at 09:01, Marek Olšák  wrote:
> 
> So that we can disable u_vbuf for GL core profiles.
> 
> This is a v2 of the previous VI-only patch.
> It requires SH_MEM_CONFIG.ALIGNMENT_MODE = UNALIGNED on CIK-VI.

Is this enabled? I wasn’t sure, so currently LLVM assumes no. You can start 
adding the +unaligned-buffer-access subtarget feature if it is.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] GLSL IR & TGSI on-disk shader cache

2017-02-13 Thread Matt Arsenault

> On Feb 6, 2017, at 19:42, Timothy Arceri  wrote:
> 
> This series does not include the patch that adds cache support
> to the radeonsi backend, the main reason for this is that llvm
> currently doesn't allow the version to be queried at runtime
> (as far as I'm aware) although it seems like others are interested
> in this feature [1] so I will follow up on that.

This should not be necessary. Old bitcode is supposed to be forward compatible 
and should be autoupgraded. You should not need to check that the runtime 
version matches. The sticking point for AMDGPU specifically is I do not want to 
guarantee compatibility for the legacy intrinsics still in use. They have 
various problems, and some of them make it difficult to support auto upgrading 
them. Going forward I would like to maintain compatibility with new properly 
defined intrinsics.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Mesa (master): Revert "radeon/llvm: Use alloca instructions for larger arrays"

2016-07-26 Thread Matt Arsenault

> On Jul 26, 2016, at 14:37, Marek Olšák <mar...@gmail.com> wrote:
> 
> On Sat, Jul 23, 2016 at 4:07 PM, Nicolai Hähnle <nhaeh...@gmail.com 
> <mailto:nhaeh...@gmail.com>> wrote:
>> On 22.07.2016 12:08, Michel Dänzer wrote:
>>> 
>>> On 21.07.2016 18:17, Matt Arsenault wrote:
>>>>> 
>>>>> On Jul 21, 2016, at 01:03, Michel Dänzer <mic...@daenzer.net
>>>>> <mailto:mic...@daenzer.net>> wrote:
>>>>> 
>>>>> On 21.07.2016 00:04, Michel Dänzer wrote:
>>>>>> 
>>>>>> On 15.07.2016 05:15, Marek =?UNKNOWN?B?T2zFocOhaw==?= wrote:
>>>>>>> 
>>>>>>> Module: Mesa
>>>>>>> Branch: master
>>>>>>> Commit: f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>>>>>> URL:
>>>>>>> 
>>>>>>> http://cgit.freedesktop.org/mesa/mesa/commit/?id=f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>>>>>> 
>>>>>>> Author: Marek Olšák <marek.ol...@amd.com <mailto:marek.ol...@amd.com>>
>>>>>>> Date:   Thu Jul 14 22:07:46 2016 +0200
>>>>>>> 
>>>>>>> Revert "radeon/llvm: Use alloca instructions for larger arrays"
>>>>>>> 
>>>>>>> This reverts commit 513fccdfb68e6a71180e21827f071617c93fd09b.
>>>>>>> 
>>>>>>> Bioshock Infinite hangs with that.
>>>>>> 
>>>>>> 
>>>>>> Unfortunately, this change caused the piglit test
>>>>>> shaders@glsl-fs-vec4-indexing-temp-dst-in-loop (and possibly others) to
>>>>>> hang my Kaveri. Any ideas for how we can get out of this conundrum?
>>>>> 
>>>>> 
>>>>> The hang was introduced by LLVM SVN r275934 ("AMDGPU: Expand register
>>>>> indexing pseudos in custom inserter"). The good/bad (without/with
>>>>> r275934) shader dumps and the GALLIUM_DDEBUG=800 dump corresponding to
>>>>> the hang are attached.
>>>>> 
>>>>> 
>>>>> BTW, even with Marek's change above reverted, I still see some piglit
>>>>> regressions compared to last week, but I'm not sure if those are all
>>>>> related to the same LLVM change.
>>>>> 
>>>>> 
>>>>> --
>>>>> Earthling Michel Dänzer   |
>>>>>  http://www.amd.com <http://www.amd.com/>
>>>>> Libre software enthusiast | Mesa and X developer
>>>>> 
>>>>> 
>>>> 
>>>> 
>>>> This fixes the verifier error in it: https://reviews.llvm.org/D22616
>>> 
>>> 
>>> This seems to fix the hang, thanks!
>>> 
>>> 
>>>> This fixes another issue which may be
>>>> related: https://reviews.llvm.org/D22556
>>> 
>>> 
>>> Even with that applied as well, there are still piglit regressions
>>> compared to early last week, see the attached dumps (look for "LLVM
>>> triggered Diagnostic Handler:").
>> 
>> 
>> Looks like the "rewrite undef" part of the Two Address Instruction Pass also
>> needs to be adjusted -- I've attached a bugpoint-reduced test case.
>> 
>> Also, the hang that motivated the original revert in Mesa should be fixed
>> with https://reviews.llvm.org/D22673 (and the related
>> https://reviews.llvm.org/D22675 is also needed for correctness, though
>> probably not for fixing the hang).
> 
> FYI, I've reverted the revert.
> 
> Marek


It might be nice if this could be an option, since this was probably the main 
stressor of the register indexing code.

-Matt

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Mesa (master): Revert "radeon/llvm: Use alloca instructions for larger arrays"

2016-07-21 Thread Matt Arsenault

> On Jul 21, 2016, at 01:03, Michel Dänzer  wrote:
> 
> On 21.07.2016 00:04, Michel Dänzer wrote:
>> On 15.07.2016 05:15, Marek =?UNKNOWN?B?T2zFocOhaw==?= wrote:
>>> Module: Mesa
>>> Branch: master
>>> Commit: f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>> URL:
>>> http://cgit.freedesktop.org/mesa/mesa/commit/?id=f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>> 
>>> Author: Marek Olšák 
>>> Date:   Thu Jul 14 22:07:46 2016 +0200
>>> 
>>> Revert "radeon/llvm: Use alloca instructions for larger arrays"
>>> 
>>> This reverts commit 513fccdfb68e6a71180e21827f071617c93fd09b.
>>> 
>>> Bioshock Infinite hangs with that.
>> 
>> Unfortunately, this change caused the piglit test
>> shaders@glsl-fs-vec4-indexing-temp-dst-in-loop (and possibly others) to
>> hang my Kaveri. Any ideas for how we can get out of this conundrum?
> 
> The hang was introduced by LLVM SVN r275934 ("AMDGPU: Expand register
> indexing pseudos in custom inserter"). The good/bad (without/with
> r275934) shader dumps and the GALLIUM_DDEBUG=800 dump corresponding to
> the hang are attached.
> 
> 
> BTW, even with Marek's change above reverted, I still see some piglit
> regressions compared to last week, but I'm not sure if those are all
> related to the same LLVM change.
> 
> 
> -- 
> Earthling Michel Dänzer   |   http://www.amd.com 
> 
> Libre software enthusiast | Mesa and X developer
> 

This fixes the verifier error in it: https://reviews.llvm.org/D22616 

This fixes another issue which may be related: https://reviews.llvm.org/D22556 


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/5] radeonsi: set dereferenceable attribute on descriptor arrays

2016-07-13 Thread Matt Arsenault

> On Jul 13, 2016, at 12:36, Marek Olšák  wrote:
> 
> On Wed, Jul 13, 2016 at 9:25 PM, Tom Stellard wrote:
>> On Wed, Jul 13, 2016 at 03:20:55PM -0400, Tom Stellard wrote:
>>> On Tue, Jul 12, 2016 at 10:52:35PM +0200, Marek Olšák wrote:
 From: Marek Olšák 
 
 This allows moving the loads arbitrarily in the Sinking pass.
 
 26002 shaders in 14643 tests
 Totals:
 SGPRS: 2080160 -> 2080160 (0.00 %)
 VGPRS: 798875 -> 797826 (-0.13 %)
 Spilled SGPRs: 108485 -> 79165 (-27.03 %)
 Spilled VGPRs: 327 -> 327 (0.00 %)
 Scratch VGPRs: 1656 -> 1652 (-0.24 %) dwords per thread
 Code Size: 36127192 -> 35559780 (-1.57 %) bytes
 LDS: 767 -> 767 (0.00 %) blocks
 Max Waves: 212464 -> 212672 (0.10 %)
 Wait states: 0 -> 0 (0.00 %)
 
 PERCENTAGES / AppShadersSGPRs VGPRs  SpillSGPR SpillVGPR  
 Scratch   CodeSize  MaxWavesWaits
 (unknown)  4 . . . . . 
 . . .
 0ad6 . . . . . 
 . . .
 alien_isolation 2938 .0.04 %   -8.53 % . . 
   -0.71 %   -0.06 % .
 anholt10 . . . . . 
 . . .
 batman_arkham_origins589 .   -0.58 %  -79.54 % . . 
   -6.72 %0.57 % .
 bioshock-infinite   1769 .   -0.65 %  -89.32 % . . 
   -4.73 %0.48 % .
 borderlands23968 .   -0.31 %  -51.21 % . . 
   -4.09 %0.22 % .
 brutal-legend338 .   -0.03 %   -2.95 % . . 
   -0.06 % . .
 civilization_beyond..116 . .  -14.17 % . . 
   -0.88 % . .
 counter_strike_glob..   1142 . . . . . 
 . . .
 dirt-showdown541 .   -0.56 %  -40.14 % .   
 -3.45 %   -1.82 %0.35 % .
 dolphin   22 . . . . . 
0.16 % . .
 dota2   1747 . . . . . 
0.01 % . .
 europa_universalis_4  76 .   -0.23 %  -42.11 % . . 
   -0.96 % . .
 f1-2015  774 .   -0.09 %  -28.89 % . . 
   -2.60 %0.09 % .
 furmark-0.7.0  4 . . . . . 
 . . .
 gimark-0.7.0  10 . . . . . 
 . . .
 glamor16 . . . . . 
 . . .
 humus-celshading   4 . . . . . 
 . . .
 humus-domino   6 . . . . . 
 . . .
 humus-dynamicbranching24 .0.71 % . . . 
0.29 %   -0.45 % .
 humus-hdr 10 . . . . . 
 . . .
 humus-portals  2 . . . . . 
 . . .
 humus-volumetricfog..  6 . . . . . 
 . . .
 left_4_dead_2   1762 . . . . . 
 . . .
 metro_2033_redux2670 .   -0.10 %   -7.15 % . . 
   -0.03 % . .
 nexuiz80 . . . . . 
 . . .
 pixmark-julia-fp32 2 . . . . . 
 . . .
 pixmark-julia-fp64 2 . . . . . 
 . . .
 pixmark-piano-0.7.02 . . . . . 
 . . .
 pixmark-volplosion-..  2 . . . . . 
 . . .
 plot3d-0.7.0   8 . . . . . 
 . . .
 portal   474 . . . . . 
 . . .
 sauerbraten7 . . . . . 
 . . .
 serious_sam_3_bfe392 . .  

Re: [Mesa-dev] [PATCH] radeonsi: add a debug flag for unsafe math LLVM optimizations

2016-06-13 Thread Matt Arsenault

> On Jun 13, 2016, at 09:27, Marek Olšák  wrote:
> 
> + { "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader 
> optimizations" },

Perhaps one for each of the individual fast math options as well (no NaNs, no 
signed zeros, etc.)?
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: enable denorms for 64-bit and 16-bit floats

2016-02-09 Thread Matt Arsenault

> On Feb 9, 2016, at 11:23, Tom Stellard  wrote:
> 
> We should still add +fp64-denormals even if the backend doesn't do
> anything with it now.

This is the default, so it doesn’t really matter anyway.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: enable denorms for 64-bit and 16-bit floats

2016-02-08 Thread Matt Arsenault

> On Feb 8, 2016, at 08:08, Tom Stellard  wrote:
> 
> Do SI/CI support fp64 denorms?  If so, won't this hurt performance?
This is the only mode that should ever be used. I’m not sure why these are 
options. There technically are separate flush-on-input or flush-on-output 
options, but I’m not sure why they would be used.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: enable denorms for 64-bit and 16-bit floats

2016-02-08 Thread Matt Arsenault

> On Feb 8, 2016, at 08:08, Tom Stellard  wrote:
> 
> Do SI/CI support fp64 denorms?  If so, won't this hurt performance?
> 
> We should tell the compiler we are enabling fp-64 denorms by adding
> +fp64-denormals to the feature string.  It would also be better to
> read the float_mode value from the config registers emitted by the
> compiler.

Yes, the runtime here should read the value out of the binary and enable it in 
the compiler rather than the runtime hardcoding it. If you wanted to load a 
shader with different FP rules for example it should be able to switch.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: enable denorms for 64-bit and 16-bit floats

2016-02-08 Thread Matt Arsenault

> On Feb 8, 2016, at 12:38, Marek Olšák  wrote:
> 
>> 
>> We should tell the compiler we are enabling fp-64 denorms by adding
>> +fp64-denormals to the feature string.  It would also be better to
>> read the float_mode value from the config registers emitted by the
>> compiler.
> 
> Yes, I agree, but LLVM only sets these parameters for compute or even
> HSA-only kernels, not for graphics shaders. We need to set the mode
> for all users _now_, not in 6 months. Last time I looked,
> +fp64-denormals had no effect on graphics shaders.

This is a bug. I think I left these because the config register macro names 
were different for the other shader types, even though they appeared to be the 
same thing.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] radeon/llvm: Set the target triple on the module

2016-02-04 Thread Matt Arsenault

> On Feb 4, 2016, at 13:02, Tom Stellard  wrote:
> 
> + LLVMSetTarget(ctx->gallivm.module,
> +
> +#if HAVE_LLVM < 0x0306
> + "r600--");
> +#else
> + triple);
> +#endif

This alone does not set the datalayout, which should also be set here.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] radeonsi: Allow dumping LLVM IR before optimization passes

2016-02-04 Thread Matt Arsenault

> On Feb 4, 2016, at 00:15, Nicolai Hähnle  wrote:
> 
> From: Nicolai Hähnle 
> 
> Set R600_DEBUG=preoptir to dump the LLVM IR before optimization passes,
> to allow diagnosing problems caused by optimization passes.
> 
> Note that in order to compile the resulting IR with llc, you will first
> have to run at least the mem2reg pass, e.g.
> 
> opt -mem2reg -S < shader.ll | llc -march=amdgcn -mcpu=bonaire
> 
> Signed-off-by: Michel Dänzer  (original patch)
> Signed-off-by: Nicolai Hähnle  (w/ debug flag)
> ---
> Having the option is a good idea, but I prefer to have a separate debug
> flag for it so that when you try to analyze bugs in codegen (which in
> my experience happens more often) you don't have to worry about
> replicating the exact same sequence of optimizations manually via the
> command line to reproduce the problem there.
> 
> src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
> src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
> src/gallium/drivers/radeonsi/si_shader.c  | 16 ++--
> 3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
> b/src/gallium/drivers/radeon/r600_pipe_common.c
> index c827dbd..a1432ed 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -393,6 +393,7 @@ static const struct debug_named_value 
> common_debug_options[] = {
>   { "noir", DBG_NO_IR, "Don't print the LLVM IR"},
>   { "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
>   { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
> + { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial 
> optimizations" },
> 
>   /* features */
>   { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
> b/src/gallium/drivers/radeon/r600_pipe_common.h
> index c7e4c44..4e36631 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -71,6 +71,7 @@
> #define DBG_NO_IR (1 << 12)
> #define DBG_NO_TGSI   (1 << 13)
> #define DBG_NO_ASM(1 << 14)
> +#define DBG_PREOPT_IR(1 << 15)
> /* Bits 21-31 are reserved for the r600g driver. */
> /* features */
> #define DBG_NO_ASYNC_DMA  (1llu << 32)
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
> b/src/gallium/drivers/radeonsi/si_shader.c
> index 8b524cf..d9ed6b2 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -4092,7 +4092,7 @@ int si_compile_llvm(struct si_screen *sscreen,
>   if (r600_can_dump_shader(>b, processor)) {
>   fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
> 
> - if (!(sscreen->b.debug_flags & DBG_NO_IR))
> + if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR)))
>   LLVMDumpModule(mod);
>   }
> 
> @@ -4178,6 +4178,12 @@ static int si_generate_gs_copy_shader(struct si_screen 
> *sscreen,
>   si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
> 
>   LLVMBuildRetVoid(bld_base->base.gallivm->builder);

You might want to run the verifier if debugging (or maybe that should be a 
separate option)

> +
> + /* Dump LLVM IR before any optimization passes */
> + if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
> + r600_can_dump_shader(>b, TGSI_PROCESSOR_GEOMETRY))
> + LLVMDumpModule(bld_base->base.gallivm->module);
> +
>   radeon_llvm_finalize_module(_shader_ctx->radeon_bld);
> 
>   if (dump)
> @@ -4385,9 +4391,15 @@ int si_shader_create(struct si_screen *sscreen, 
> LLVMTargetMachineRef tm,
>   }
> 
>   LLVMBuildRetVoid(bld_base->base.gallivm->builder);
> + mod = bld_base->base.gallivm->module;
> +
> + /* Dump LLVM IR before any optimization passes */
> + if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
> + r600_can_dump_shader(>b, si_shader_ctx.type))
> + LLVMDumpModule(mod);
> +
>   radeon_llvm_finalize_module(_shader_ctx.radeon_bld);
> 
> - mod = bld_base->base.gallivm->module;
>   r = si_compile_llvm(sscreen, >binary, >config, tm,
>   mod, debug, si_shader_ctx.type);
>   if (r) {
> -- 
> 2.5.0
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH shader-db] si-report: Track max waves per CU

2016-01-05 Thread Matt Arsenault

> On Jan 5, 2016, at 07:28, Marek Olšák  wrote:
> 
> Hi,
> 
> I'd like us to do this computation in Mesa, because it can be more
> accurate there. The pixel shader wave count depends heavily on LDS,
> because each interpolated input occupies 12 dwords of LDS per
> primitive and there can be 1-16 primitives per wave. With 32 inputs
> and 16 primitives, you can get 6144 dwords of LDS per wave and this is
> without the extra LDS storage needed by derivatives. In a nutshell,
> you need to know the number of interpolated inputs, then you can
> compute the best case (1 primitive) scenario and the worst case (16
> primitives).
> 
> Marek
> 

Are these dynamically known counts? I think having a statically known max wave 
count would be useful (noting that it is the statically known maximum) in 
addition to the dynamic value derived from the runtime number of inputs.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 09/10] radeonsi: don't emit AMDGPU intrinsics for RSQ opcodes

2015-10-11 Thread Matt Arsenault

> On Oct 10, 2015, at 6:29 PM, Marek Olšák  wrote:
> 
> +/* This requires "unsafe-fp-math" for LLVM to convert it to RSQ. */
> +static void emit_rsq(const struct lp_build_tgsi_action *action,
> +  struct lp_build_tgsi_context *bld_base,
> +  struct lp_build_emit_data *emit_data)
> +{
> + LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> + LLVMValueRef src = emit_data->args[0];
> + bool is_f64 = LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMDoubleTypeKind;
> +
> + LLVMValueRef sqrt =
> + lp_build_emit_llvm_unary(bld_base,
> +  is_f64 ? TGSI_OPCODE_DSQRT
> + : TGSI_OPCODE_SQRT,
> +  src);
> +
> + emit_data->output[emit_data->chan] =
> + LLVMBuildFDiv(builder,
> +   is_f64 ? bld_base->dbl_bld.one
> +  : bld_base->base.one,
> +   sqrt, "");
> +}

You should add the per-instruction fast math flags here for nnan instead of 
just relying on the function attribute (although to get the codegen effect 
currently you will still need the attribute). I’m also not sure how to do this 
with the C API (it might need new functions to do it).
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 07/10] radeonsi: don't use the AMDGPU intrinsic for CMP

2015-10-11 Thread Matt Arsenault

> On Oct 10, 2015, at 6:29 PM, Marek Olšák  wrote:
> 
> The increase in VGPRs in unfortunate, but the decrease in the scratch size
> is always welcome.

Do you have a specific example where this happens that you can post?
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] clover: Return the minimum required value for CL_DEVICE_SINGLE_FP_CONFIG

2015-03-06 Thread Matt Arsenault

 On Mar 6, 2015, at 8:56 AM, Francisco Jerez curroje...@riseup.net wrote:
 
 Tom Stellard t...@stellard.net mailto:t...@stellard.net writes:
 
 On Thu, Mar 05, 2015 at 08:42:25PM +0200, Francisco Jerez wrote:
 Tom Stellard thomas.stell...@amd.com writes:
 
 This means dropping CL_FP_DENORM from the current return value.
 ---
 src/gallium/state_trackers/clover/api/device.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index b1f556f..db3b931 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -201,8 +201,10 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info 
 param,
   break;
 
case CL_DEVICE_SINGLE_FP_CONFIG:
 +  // This is the mandated minimum single precision floating-point
 +  // capability
 
 Could you add that this is according to the OpenCL 1.1 specification?
 OpenCL 1.2 is even weaker (CL_FP_INF_NAN is not required, only one of
 CL_FP_ROUND_TO_ZERO or CL_FP_ROUND_TO_NEAREST is required, and no FP
 capabilities at all are required for custom devices as Jan pointed out).
 
   buf.as_scalarcl_device_fp_config() =
 - CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 + CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 
 I'm okay with this change, but I'm curious, is this motivated by your
 architecture not supporting denorms?
 
 
 It can, but supporting them hurts performance.
 
 Sounds like you want to advertise denorm support and rely on the
 -cl-denorms-are-zero compiler option to decide whether to flush them to
 zero or not?

This is true for newer devices, where more instructions run just as fast with 
denormal support enabled. For the currently supported devices, the performance 
difference is quite extreme and the denormal support is not that useful.





 
 -Tom
 
   break;
 
case CL_DEVICE_DOUBLE_FP_CONFIG:
 -- 
 2.0.4
 
 
 
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] clover: Return the minimum required value for CL_DEVICE_SINGLE_FP_CONFIG

2015-03-05 Thread Matt Arsenault

 On Mar 5, 2015, at 10:42 AM, Francisco Jerez curroje...@riseup.net wrote:
 
 Could you add that this is according to the OpenCL 1.1 specification?
 OpenCL 1.2 is even weaker (CL_FP_INF_NAN is not required, only one of
 CL_FP_ROUND_TO_ZERO or CL_FP_ROUND_TO_NEAREST is required, and no FP
 capabilities at all are required for custom devices as Jan pointed out).

This should come from a device check then. The weakest is a reasonable default, 
but it shouldn’t be removed from devices which do support CL_FP_INF_NAN.


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/9] radeonsi: use V_BFE for extracting a sample index

2015-03-05 Thread Matt Arsenault

 On Mar 5, 2015, at 6:50 AM, Tom Stellard t...@stellard.net wrote:
 
 On Mon, Mar 02, 2015 at 02:09:29PM -0800, Matt Arsenault wrote:
 
 On Mar 2, 2015, at 1:19 PM, Tom Stellard t...@stellard.net wrote:
 
 On Mon, Mar 02, 2015 at 10:14:00PM +0100, Marek Olšák wrote:
 On Mon, Mar 2, 2015 at 10:05 PM, Tom Stellard t...@stellard.net wrote:
 On Mon, Mar 02, 2015 at 12:54:16PM +0100, Marek Olšák wrote:
 From: Marek Olšák marek.ol...@amd.com
 
 ---
 src/gallium/drivers/radeonsi/si_shader.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)
 
 diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
 b/src/gallium/drivers/radeonsi/si_shader.c
 index b0417ed..f125483 100644
 --- a/src/gallium/drivers/radeonsi/si_shader.c
 +++ b/src/gallium/drivers/radeonsi/si_shader.c
 @@ -192,6 +192,20 @@ static int get_param_index(unsigned semantic_name, 
 unsigned index,
 }
 
 /**
 + * BitField Extract: ((value  rshift)  ((1  bitwidth) - 1))
 + */
 
 Ideally, we would just add a pattern for this in the backend and emit 
 generic
 LLVM IR here.  This would also make it possible to share the code with 
 llvmpipe.
 
 I think the best place to do this would be in 
 AMDGPUTargetLowering::performDAGCombine().
 
 Why not SIInstructions.td?
 
 
 Because for patterns like this, I think it is important to match them as
 early as possible, because there may be another optimization which reduces
 the sequence from 5 to 4 instructions which would cause the pattern not to 
 match.
 
 -Tom
 
 
 I think the opposite in this case. The basic bit operations have a lot of 
 existing combines on them, and the computeKnownBits implementations are more 
 complete. The BFE nodes are not as well understood, and trickier to deal 
 with. AArch64 and NVPTX both have essentially the same instruction, and they 
 have a large bit of code to match them in their ISelDAGToDAGs. I’ve wanted 
 to add a generic BFE node to be matched after legalization, but I haven’t 
 had time to do it.
 
 
 Isn't there a potential for a DAG combine on bit operations, to 'break'
 the pattern so it can't be recognized?
 
 -Tom
 

Yes and no. There isn’t really only one pattern for this. Ideally the different 
ways to use this would each form a single canonical form the pattern would need 
to match, but that isn’t always possible. The AArch64 code for example tries 
many different patterns (see isBitfieldExtractOp in AArch64ISelDAGToDAG)

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/9] radeonsi: use V_BFE for extracting a sample index

2015-03-02 Thread Matt Arsenault

 On Mar 2, 2015, at 1:19 PM, Tom Stellard t...@stellard.net wrote:
 
 On Mon, Mar 02, 2015 at 10:14:00PM +0100, Marek Olšák wrote:
 On Mon, Mar 2, 2015 at 10:05 PM, Tom Stellard t...@stellard.net wrote:
 On Mon, Mar 02, 2015 at 12:54:16PM +0100, Marek Olšák wrote:
 From: Marek Olšák marek.ol...@amd.com
 
 ---
 src/gallium/drivers/radeonsi/si_shader.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)
 
 diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
 b/src/gallium/drivers/radeonsi/si_shader.c
 index b0417ed..f125483 100644
 --- a/src/gallium/drivers/radeonsi/si_shader.c
 +++ b/src/gallium/drivers/radeonsi/si_shader.c
 @@ -192,6 +192,20 @@ static int get_param_index(unsigned semantic_name, 
 unsigned index,
 }
 
 /**
 + * BitField Extract: ((value  rshift)  ((1  bitwidth) - 1))
 + */
 
 Ideally, we would just add a pattern for this in the backend and emit 
 generic
 LLVM IR here.  This would also make it possible to share the code with 
 llvmpipe.
 
 I think the best place to do this would be in 
 AMDGPUTargetLowering::performDAGCombine().
 
 Why not SIInstructions.td?
 
 
 Because for patterns like this, I think it is important to match them as
 early as possible, because there may be another optimization which reduces
 the sequence from 5 to 4 instructions which would cause the pattern not to 
 match.
 
 -Tom


I think the opposite in this case. The basic bit operations have a lot of 
existing combines on them, and the computeKnownBits implementations are more 
complete. The BFE nodes are not as well understood, and trickier to deal with. 
AArch64 and NVPTX both have essentially the same instruction, and they have a 
large bit of code to match them in their ISelDAGToDAGs. I’ve wanted to add a 
generic BFE node to be matched after legalization, but I haven’t had time to do 
it.

-Matt
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] clover: Enable cl_khr_fp64 for devices that support doubles v2

2015-02-26 Thread Matt Arsenault

 On Feb 26, 2015, at 5:06 PM, Tom Stellard thomas.stell...@amd.com wrote:
 
 v2:
  - Report correct values for CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE and
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE.
  - Only define cl_khr_fp64 if the extension is supported.
  - Remove trailing space from extension string.
  - Rename device query function from cl_khr_fp64() to has_doubles().
 ---
 src/gallium/state_trackers/clover/api/device.cpp  | 6 +++---
 src/gallium/state_trackers/clover/core/device.cpp | 6 ++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 3 files changed, 10 insertions(+), 3 deletions(-)

There are two more changes that I don’t see here that should accompany this 
one. The correct CL_DEVICE_DOUBLE_FP_CONFIG values should be reported 
(CL_DEVICE_SINGLE_FP_CONFIG is currently wrong as well). 
Plus, the FP config register needs to be set from the program binary so that 
double denormals are not flushed.

 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index e825468..217d2c3 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -145,7 +145,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
 -  buf.as_scalarcl_uint() = 2;
 +  buf.as_scalarcl_uint() = dev.has_doubles() ? 2 : 0;
   break;
 
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
 @@ -283,7 +283,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_EXTENSIONS:
 -  buf.as_string() = ;
 +  buf.as_string() = dev.has_doubles() ? cl_khr_fp64 : ;
   break;
 
case CL_DEVICE_PLATFORM:
 @@ -315,7 +315,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
 -  buf.as_scalarcl_uint() = 2;
 +  buf.as_scalarcl_uint() = dev.has_doubles() ? 2 : 0;
   break;
 
case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
 diff --git a/src/gallium/state_trackers/clover/core/device.cpp 
 b/src/gallium/state_trackers/clover/core/device.cpp
 index 688a7dd..c3f3b4e 100644
 --- a/src/gallium/state_trackers/clover/core/device.cpp
 +++ b/src/gallium/state_trackers/clover/core/device.cpp
 @@ -173,6 +173,12 @@ device::image_support() const {
   PIPE_COMPUTE_CAP_IMAGES_SUPPORTED)[0];
 }
 
 +bool
 +device::has_doubles() const {
 +   return pipe-get_shader_param(pipe, PIPE_SHADER_COMPUTE,
 + PIPE_SHADER_CAP_DOUBLES);
 +}
 +
 std::vectorsize_t
 device::max_block_size() const {
auto v = get_compute_paramuint64_t(pipe, 
 PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
 diff --git a/src/gallium/state_trackers/clover/core/device.hpp 
 b/src/gallium/state_trackers/clover/core/device.hpp
 index 2201700..de5fc6b 100644
 --- a/src/gallium/state_trackers/clover/core/device.hpp
 +++ b/src/gallium/state_trackers/clover/core/device.hpp
 @@ -64,6 +64,7 @@ namespace clover {
   cl_uint max_clock_frequency() const;
   cl_uint max_compute_units() const;
   bool image_support() const;
 +  bool has_doubles() const;
 
   std::vectorsize_t max_block_size() const;
   std::string device_name() const;
 -- 
 2.0.4
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] Revert radeon/llvm: enable unsafe math for graphics shaders

2015-02-18 Thread Matt Arsenault

 On Feb 17, 2015, at 11:52 PM, Grigori Goronzy g...@chown.ath.cx wrote:
 
 Hi,
 
 AFAIR not enabling this makes LLVM generate really slow code in some
 common cases. Maybe this is just a bug in LLVM/R600 triggered by unsafe
 FP math optimization or some optimization is too eager. Other drivers do
 fine with these types of optimization.
 
 What's the impact on performance with unsafe FP math disabled at this time?
 
 Best regards
 Grigori


The exact reason should be found, since this should be OK.

Also, this should be setting no-nans-fp-math / no-infs-fp-math and whatever 
others there are; unsafe-fp-math should just be controlling other algebraic 
kinds of optimizations.


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] Revert radeon/llvm: enable unsafe math for graphics shaders

2015-02-18 Thread Matt Arsenault

 On Feb 18, 2015, at 1:15 AM, Michel Dänzer mic...@daenzer.net wrote:
 
 On 18.02.2015 17:13, Michel Dänzer wrote:
 On 18.02.2015 16:52, Grigori Goronzy wrote:
 
 What's the impact on performance with unsafe FP math disabled at this time?
 
 I don't know. Correctness trumps performance.
 
 FWIW, I couldn't seem to measure any significant difference with Unigine
 Valley, which recently got a ~10% boost from enabling the LLVM machine
 scheduler.

There are a couple of problems right now that mean the fast math option isn’t 
really doing anything. I just realized this patch is only setting the 
attribute, which will only affect codegen. The IR emission should also be 
annotating the individual instructions with the fast math flags, which would 
enable the better math optimizations in the parts of the optimizer that are 
supposed to handle this.

Even with the fast math flags, the current graphics pass pipeline is 
problematic. Much of what the fast math flags accomplish is done in 
instcombine, which isn’t run now. It really should be; it’s the basic 
canonicalization and peephole pass, and pretty much everything else expects it 
to have cleaned up the IR. The standard pass pipeline runs it several times.

-Matt

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: force NaNs to 0

2014-12-10 Thread Matt Arsenault

 On Dec 10, 2014, at 5:08 PM, Marek Olšák mar...@gmail.com wrote:
 
 From: Marek Olšák marek.ol...@amd.com
 
 This fixes incorrect rendering in Unreal Engine demos.
 I don't know why it's called dx10 clamp mode. MSDN doesn't mention it.
 
 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=83510 
 https://bugs.freedesktop.org/show_bug.cgi?id=83510

This should be set from the program binary, and not hardcoded this way. The 
compiler should probably change the default based on the shader type.


 ---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)
 
 diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
 b/src/gallium/drivers/radeonsi/si_state_shaders.c
 index 3a5b0ae..355f8aa 100644
 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
 +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
 @@ -64,7 +64,8 @@ static void si_shader_es(struct si_shader *shader)
   si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
  S_00B328_VGPRS((shader-num_vgprs - 1) / 4) |
  S_00B328_SGPRS((num_sgprs - 1) / 8) |
 -S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt));
 +S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 +S_00B328_DX10_CLAMP(1));
   si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
  S_00B32C_USER_SGPR(num_user_sgprs));
 }
 @@ -132,7 +133,8 @@ static void si_shader_gs(struct si_shader *shader)
 
   si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
  S_00B228_VGPRS((shader-num_vgprs - 1) / 4) |
 -S_00B228_SGPRS((num_sgprs - 1) / 8));
 +S_00B228_SGPRS((num_sgprs - 1) / 8) |
 +S_00B228_DX10_CLAMP(1));
   si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
  S_00B22C_USER_SGPR(num_user_sgprs));
 }
 @@ -206,7 +208,8 @@ static void si_shader_vs(struct si_shader *shader)
   si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
  S_00B128_VGPRS((shader-num_vgprs - 1) / 4) |
  S_00B128_SGPRS((num_sgprs - 1) / 8) |
 -S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt));
 +S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
 +S_00B128_DX10_CLAMP(1));
   si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
  S_00B12C_USER_SGPR(num_user_sgprs) |
  S_00B12C_SO_BASE0_EN(!!shader-selector-so.stride[0]) |
 @@ -304,7 +307,8 @@ static void si_shader_ps(struct si_shader *shader)
 
   si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
  S_00B028_VGPRS((shader-num_vgprs - 1) / 4) |
 -S_00B028_SGPRS((num_sgprs - 1) / 8));
 +S_00B028_SGPRS((num_sgprs - 1) / 8) |
 +S_00B028_DX10_CLAMP(1));
   si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
  S_00B02C_EXTRA_LDS_SIZE(shader-lds_size) |
  S_00B02C_USER_SGPR(num_user_sgprs));
 -- 
 2.1.0
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: use minnum and maxnum LLVM intrinsics for MIN and MAX opcodes

2014-11-22 Thread Matt Arsenault

 On Nov 22, 2014, at 7:35 AM, Marek Olšák mar...@gmail.com wrote:
 
 AFAICS, the R600 backend doesn't implement the intrinsics for R600.
 
 Marek


Should it? It’s trivial to switch to these for it, but I wasn’t sure what the 
actual semantics of its instructions were. There’s MAX and MAX_DX10, where I 
think MAX_DX10 corresponds to maxnum and MAX is the legacy version, but I’m not 
sure.


 
 On Sat, Nov 22, 2014 at 3:53 AM, Michel Dänzer mic...@daenzer.net wrote:
 On 21.11.2014 06:21, Marek Olšák wrote:
 
 From: Marek Olšák marek.ol...@amd.com
 
 So far it has been compiled into pretty ugly code (8 instructions or so
 for either opcode).
 ---
  src/gallium/drivers/radeonsi/si_shader.c | 7 +++
  1 file changed, 7 insertions(+)
 
 diff --git a/src/gallium/drivers/radeonsi/si_shader.c
 b/src/gallium/drivers/radeonsi/si_shader.c
 index ee08d1a..973bac2 100644
 --- a/src/gallium/drivers/radeonsi/si_shader.c
 +++ b/src/gallium/drivers/radeonsi/si_shader.c
 @@ -2792,6 +2792,13 @@ int si_shader_create(struct si_screen *sscreen,
 struct si_shader *shader)
bld_base-op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
bld_base-op_actions[TGSI_OPCODE_ENDPRIM].emit =
 si_llvm_emit_primitive;
 
 +   if (HAVE_LLVM = 0x0306) {
 +   bld_base-op_actions[TGSI_OPCODE_MAX].emit =
 build_tgsi_intrinsic_nomem;
 +   bld_base-op_actions[TGSI_OPCODE_MAX].intr_name =
 llvm.maxnum.f32;
 +   bld_base-op_actions[TGSI_OPCODE_MIN].emit =
 build_tgsi_intrinsic_nomem;
 +   bld_base-op_actions[TGSI_OPCODE_MIN].intr_name =
 llvm.minnum.f32;
 +   }
 +
si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
si_shader_ctx.tokens = sel-tokens;
tgsi_parse_init(si_shader_ctx.parse, si_shader_ctx.tokens);
 
 
 Shouldn't this be done in
 src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c, so it benefits r600g as
 well?
 
 
 --
 Earthling Michel Dänzer   |   http://www.amd.com
 Libre software enthusiast | Mesa and X developer
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 05/10] clover: Add environment variables for dumping kernel code

2014-10-08 Thread Matt Arsenault

On Oct 6, 2014, at 12:44 PM, Tom Stellard thomas.stell...@amd.com wrote:

 ---
 .../state_trackers/clover/llvm/invocation.cpp  | 74 ++
 1 file changed, 63 insertions(+), 11 deletions(-)
 
 diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
 b/src/gallium/state_trackers/clover/llvm/invocation.cpp
 index a1a54e0..3e6a186 100644
 --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
 +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
 @@ -61,6 +61,8 @@
 #include llvm/Support/TargetRegistry.h
 #include llvm/Transforms/IPO.h
 #include llvm/Transforms/IPO/PassManagerBuilder.h
 +#include llvm/Transforms/Utils/Cloning.h
 +
 
 #if HAVE_LLVM  0x0302
 #include llvm/Target/TargetData.h
 @@ -433,19 +435,39 @@ namespace {
   return m;
}
 
 +   static void emit_code(LLVMTargetMachineRef tm, LLVMModuleRef mod,
 + LLVMCodeGenFileType file_type,
 + LLVMMemoryBufferRef *out_buffer,
 + compat::string r_log) {
 +  LLVMBool err;
 +  char *err_message = NULL;
 +
 +  err = LLVMTargetMachineEmitToMemoryBuffer(tm, mod, file_type,
 +err_message, out_buffer);
 +
 +  if (err) {
 + r_log = std::string(err_message);
 +  }
 +
 +  LLVMDisposeMessage(err_message);
 +
 +  if (err) {
 + throw build_error();
 +  }
 +   }
 +
module
build_module_native(llvm::Module *mod,
  const std::vectorllvm::Function * kernels,
  clang::LangAS::Map address_spaces,
  std::string triple, std::string processor,
 - compat::string r_log) {
 + bool dump_asm, compat::string r_log) {
   std::string log;
   LLVMTargetRef target;
   char *error_message;
   LLVMMemoryBufferRef out_buffer;
   unsigned buffer_size;
   const char *buffer_data;
 -  LLVMBool err;
   LLVMModuleRef mod_ref = wrap(mod);
 
   if (LLVMGetTargetFromTriple(triple.c_str(), target, error_message)) {
 @@ -463,16 +485,27 @@ namespace {
  throw build_error();
   }
 
 -  err = LLVMTargetMachineEmitToMemoryBuffer(tm, mod_ref, LLVMObjectFile,
 -error_message, out_buffer);
 +  if (dump_asm) {
 + LLVMSetTargetMachineAsmVerbosity(tm, true);
 +#if HAVE_LLVM = 0x0306
 + LLVMSetTargetMachineShowMCEncoding(tm, true);
 +#endif
 + LLVMModuleRef debug_mod = wrap(llvm::CloneModule(mod));
 + emit_code(tm, debug_mod, LLVMAssemblyFile, out_buffer, r_log);
 + buffer_size = LLVMGetBufferSize(out_buffer);
 + buffer_data = LLVMGetBufferStart(out_buffer);
 + fprintf(stderr, %.*s\n, buffer_size, buffer_data);

It would be much better to emit each of these to a separate file with a 
consistent naming scheme than to just dump everything to stderr. It’s easier to 
just have a separate file for the source, IR and ISA to look at and compare 
rather than having to split it out of the rest of the debug output, which also 
gets harder as programs get bigger and add multiple kernels. There’s also the 
problem with stderr not being flushed if the machine hangs, so you only get 
partial output. 

What we have is an environment variable that specifies the prefix to use for 
the file name (defaulting to _temp_), so the output looks like
_temp_0_Tahiti_foo.isa, _temp_0_Tahiti_foo.bc, _temp_1_Tahiti_bar.cl. The 
number is the compile index for the program, which is important for programs 
with multiple compiles, especially for ones which compile kernels with the same 
name multiple times (which, strangely, seems to be not uncommon). The 
user-specified prefix is useful for saving sets of slightly different output, 
e.g. after a code change or with different compile options.



 
 -  if (err) {
 - LLVMDisposeTargetMachine(tm);
 - r_log = std::string(error_message);
 - LLVMDisposeMessage(error_message);
 - throw build_error();
 + LLVMSetTargetMachineAsmVerbosity(tm, false);
 +#if HAVE_LLVM = 0x0306
 + LLVMSetTargetMachineShowMCEncoding(tm, false);
 +#endif
 + LLVMDisposeMemoryBuffer(out_buffer);
 + LLVMDisposeModule(debug_mod);
   }
 
 +  emit_code(tm, mod_ref, LLVMObjectFile, out_buffer, r_log);
 +
   buffer_size = LLVMGetBufferSize(out_buffer);
   buffer_data = LLVMGetBufferStart(out_buffer);
 
 @@ -569,6 +602,18 @@ static void diagnostic_handler(const 
 llvm::DiagnosticInfo di, void *err_string)
 
 #endif
 
 +#define DBG_CLC  (1  0)
 +#define DBG_LLVM (1  1)
 +#define DBG_ASM  (1  2)
 +
 +static const struct debug_named_value debug_options[] = {
 +   {clc, DBG_CLC, Dump the OpenCL C code for all kernels.},
 +   {llvm, DBG_LLVM, Dump the generated LLVM IR for all kernels.},
 +   {asm, DBG_ASM, Dump kernel assembly code for targets specifying 
 +

Re: [Mesa-dev] [PATCH 5/5] clover: Enable cl_khr_fp64 for devices that support doubles v2

2014-08-13 Thread Matt Arsenault
On Jun 26, 2014, at 7:15 AM, Francisco Jerez curroje...@riseup.net wrote:

 Tom Stellard thomas.stell...@amd.com writes:
 
 v2:
  - Report correct values for CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE and
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE.
  - Only define cl_khr_fp64 if the extension is supported.
  - Remove trailing space from extension string.
  - Rename device query function from cl_khr_fp86() to has_doubles().
 ---
 src/gallium/state_trackers/clover/api/device.cpp  | 6 +++---
 src/gallium/state_trackers/clover/core/device.cpp | 6 ++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 src/gallium/state_trackers/clover/core/program.cpp| 5 -
 src/gallium/state_trackers/clover/llvm/invocation.cpp | 1 -
 5 files changed, 14 insertions(+), 5 deletions(-)
 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index 7006702..1176668 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -145,7 +145,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
 -  buf.as_scalarcl_uint() = 2;
 +  buf.as_scalarcl_uint() = dev.has_doubles() ? 2 : 0;
   break;
 
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
 @@ -290,7 +290,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_EXTENSIONS:
 -  buf.as_string() = ;
 +  buf.as_string() = dev.has_doubles() ? cl_khr_fp64 : ;
   break;
 
case CL_DEVICE_PLATFORM:
 @@ -322,7 +322,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
 -  buf.as_scalarcl_uint() = 2;
 +  buf.as_scalarcl_uint() = dev.has_doubles() ? 2 : 0;
   break;
 
case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
 diff --git a/src/gallium/state_trackers/clover/core/device.cpp 
 b/src/gallium/state_trackers/clover/core/device.cpp
 index bc6b761..6bf33e0 100644
 --- a/src/gallium/state_trackers/clover/core/device.cpp
 +++ b/src/gallium/state_trackers/clover/core/device.cpp
 @@ -193,6 +193,12 @@ device::half_fp_config() const {
return CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 }
 
 +bool
 +device::has_doubles() const {
 +   return pipe-get_shader_param(pipe, PIPE_SHADER_COMPUTE,
 + PIPE_SHADER_CAP_DOUBLES);
 +}
 +
 std::vectorsize_t
 device::max_block_size() const {
auto v = get_compute_paramuint64_t(pipe, 
 PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
 diff --git a/src/gallium/state_trackers/clover/core/device.hpp 
 b/src/gallium/state_trackers/clover/core/device.hpp
 index 16831ab..025c648 100644
 --- a/src/gallium/state_trackers/clover/core/device.hpp
 +++ b/src/gallium/state_trackers/clover/core/device.hpp
 @@ -66,6 +66,7 @@ namespace clover {
   cl_device_fp_config single_fp_config() const;
   cl_device_fp_config double_fp_config() const;
   cl_device_fp_config half_fp_config() const;
 +  bool has_doubles() const;
 
   std::vectorsize_t max_block_size() const;
   std::string device_name() const;
 diff --git a/src/gallium/state_trackers/clover/core/program.cpp 
 b/src/gallium/state_trackers/clover/core/program.cpp
 index e09c3aa..f65f321 100644
 --- a/src/gallium/state_trackers/clover/core/program.cpp
 +++ b/src/gallium/state_trackers/clover/core/program.cpp
 @@ -95,7 +95,10 @@ program::build_status(const device dev) const {
 
 std::string
 program::build_opts(const device dev) const {
 -   return _opts.count(dev) ? _opts.find(dev)-second : ;
 +   std::string opts = _opts.count(dev) ? _opts.find(dev)-second : ;
 +   if (dev.has_doubles())
 +  opts.append( -Dcl_khr_fp64);
 +   return opts;
 
 This define belongs in the target-specific part of libclc.  With this
 hunk removed this patch is:
 
 Reviewed-by: Francisco Jerez curroje...@riseup.net
 
 }
 
 std::string
 diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
 b/src/gallium/state_trackers/clover/llvm/invocation.cpp
 index 5d2efc4..f2b4fd9 100644
 --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
 +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
 @@ -183,7 +183,6 @@ namespace {
 
   // clc.h requires that this macro be defined:
   
 c.getPreprocessorOpts().addMacroDef(cl_clang_storage_class_specifiers);
 -  c.getPreprocessorOpts().addMacroDef(cl_khr_fp64);
 
   c.getLangOpts().NoBuiltin = true;
   c.getTargetOpts().Triple = triple;
 -- 
 1.8.1.5
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev


ping
___
mesa-dev mailing list

Re: [Mesa-dev] [PATCH 0/2] clover: add clCompileProgram

2014-08-04 Thread Matt Arsenault

On Aug 4, 2014, at 8:03 AM, EdB edb+m...@sigluy.net wrote:

 Hello
 
 I'm done with the clCompile part of OpenCL 1.2.
 
 As you can see I use char* data to transfer data from core to llvm.
 
 At first I was thinking of using std classes, but we need to be binary safe
 when data is transferred between c++98/c++11 compiled code.
 
 Then I tried to use the compat classes, but they add unexpected behavior.
 It looks like they act like std, but they don't.
 For example, vector.reserve actually makes the vector size grow,
 unlike std::vector. Also, std::string.c_str is always null terminated;
 compat::string is not.

I’ve run into many memory corruption problems with the compat:: stuff that are 
fixed by replacing it all with the std:: versions. The fact that the compat 
versions behave differently while still looking similar is also pretty bad. Can 
we just get rid of compat?

 
 At the end I decide to use const char* as it also avoid some memory copy,
 but I can rewrite the patch if it's needed.
 And I think compat need to be fixed.
 
 EdB (2):
  clover: std::pair is not c++98/c++11 safe
  clover: add clCompileProgram
 
 src/gallium/state_trackers/clover/api/dispatch.cpp |  2 +-
 src/gallium/state_trackers/clover/api/program.cpp  | 41 --
 .../state_trackers/clover/core/compiler.hpp| 11 +++---
 src/gallium/state_trackers/clover/core/program.cpp | 14 ++--
 src/gallium/state_trackers/clover/core/program.hpp |  5 ++-
 .../state_trackers/clover/llvm/invocation.cpp  | 33 +
 src/gallium/state_trackers/clover/util/compat.hpp  |  6 
 7 files changed, 95 insertions(+), 17 deletions(-)
 
 -- 
 2.0.4
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] r600g: Pass dimension parameter to compute shader.

2014-07-31 Thread Matt Arsenault

On 07/31/2014 03:58 PM, Jan Vesely wrote:

Would that work with things like one kernel calling another kernel? If
we had a function called from two kernels how would it know where to
look?
I don't think this case can be handled as 2 separate kernels with the 
same calling convention. If a kernel is calling another kernel function, 
I think it would be best to create stub kernels which call the 
(presumably inlined) kernel body functions.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/1] r600: Use llvm intrinsic to read work dimension information

2014-07-30 Thread Matt Arsenault

On Jul 30, 2014, at 4:11 PM, Jan Vesely jan.ves...@rutgers.edu wrote:

 +define i32 @get_work_dim() nounwind readnone alwaysinline {
 +  %x = call i32 @llvm.r600.read.workdim() nounwind readnone
 +  ret i32 %x
 +}
 -- 

Maybe this should have range metadata attached now that it applies to calls?
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/1] R600: Add new intrinsic to read work dimensions

2014-07-30 Thread Matt Arsenault

On 07/30/2014 04:11 PM, Jan Vesely wrote:

CC: Tom Stellard t...@stellard.net
CC: Matt Arsenault matthew.arsena...@amd.com

Signed-off-by: Jan Vesely jan.ves...@rutgers.edu
---
  include/llvm/IR/IntrinsicsR600.td| 2 ++
  lib/Target/R600/R600ISelLowering.cpp | 6 --
  2 files changed, 6 insertions(+), 2 deletions(-)

Needs a test for the intrinsic


diff --git a/include/llvm/IR/IntrinsicsR600.td 
b/include/llvm/IR/IntrinsicsR600.td
index ba69eaa..37a9771 100644
--- a/include/llvm/IR/IntrinsicsR600.td
+++ b/include/llvm/IR/IntrinsicsR600.td
@@ -33,6 +33,8 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz 

 __builtin_r600_read_tgid;
  defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz 
 __builtin_r600_read_tidig;
+def int_r600_read_workdim : R600ReadPreloadRegisterIntrinsic 
+   __builtin_r600_read_workdim;
  
  } // End TargetPrefix = r600
We're trying to move the intrinsics to use the amdgpu name instead, 
although all the others use r600 now so it might be best to change them 
all at once.


  
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp

index 4c603f8..1c59684 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -805,6 +805,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const
return LowerImplicitParameter(DAG, VT, DL, 7);
  case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
+case Intrinsic::r600_read_workdim:
+  return LowerImplicitParameter(DAG, VT, DL, 9);
  
  case Intrinsic::r600_read_tgid_x:

return CreateLiveInRegister(DAG, AMDGPU::R600_TReg32RegClass,
@@ -1722,7 +1724,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
  // being invalid. Somehow this seems to work with i64 arguments, but 
breaks
  // for 1 x i64.
  
-// The first 36 bytes of the input buffer contains information about

+// The first 40 bytes of the input buffer contains information about
  // thread group and global sizes.
I think we should probably round this up to some larger number since 
it's likely more will need to be added in the future


  
  // FIXME: This should really check the extload type, but the handling of

@@ -1730,7 +1732,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
  //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : 
ISD::ZEXTLOAD;
  ISD::LoadExtType Ext = ISD::SEXTLOAD;
  SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
- DAG.getConstant(36 + VA.getLocMemOffset(), 
MVT::i32),
+ DAG.getConstant(40 + VA.getLocMemOffset(), 
MVT::i32),
   MachinePointerInfo(UndefValue::get(PtrTy)),
   MemVT, false, false, 4);
  


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] R600/SI: Use i32 vectors for resources and samplers

2014-07-07 Thread Matt Arsenault

On Jul 7, 2014, at 8:28 AM, Marek Olšák mar...@gmail.com wrote:

 From: Marek Olšák marek.ol...@amd.com
 
 This affects new intrinsics only.
 
 What surprises me is that v32i8 still works.
 ---
 lib/Target/R600/SIInstructions.td   |  4 +-
 lib/Target/R600/SIIntrinsics.td |  6 +--
 test/CodeGen/R600/llvm.SI.image.ll  | 12 ++---
 test/CodeGen/R600/llvm.SI.image.sample.ll   | 80 ++---
 test/CodeGen/R600/llvm.SI.image.sample.o.ll | 80 ++---
 5 files changed, 91 insertions(+), 91 deletions(-)

LGTM



 
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 25a9362..8aecbb9 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -1851,7 +1851,7 @@ def : Pat 
 
 // Image + sampler
 class SampleRawPatternSDPatternOperator name, MIMG opcode, ValueType vt : 
 Pat 
 -  (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm,
 +  (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
 i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
   (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm 
 $da),
   (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
 @@ -1868,7 +1868,7 @@ multiclass SampleRawPatternsSDPatternOperator name, 
 string opcode {
 
 // Image only
 class ImagePatternSDPatternOperator name, MIMG opcode, ValueType vt : Pat 
 -  (name vt:$addr, v32i8:$rsrc, i32:$dmask, i32:$unorm,
 +  (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm,
 i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
   (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm 
 $da),
   (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
 diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
 index fc0c850..027a0a2 100644
 --- a/lib/Target/R600/SIIntrinsics.td
 +++ b/lib/Target/R600/SIIntrinsics.td
 @@ -58,8 +58,8 @@ let TargetPrefix = SI, isTarget = 1 in {
   class SampleRaw : Intrinsic 
 [llvm_v4f32_ty],// vdata(VGPR)
 [llvm_anyint_ty,// vaddr(VGPR)
 - llvm_v32i8_ty, // rsrc(SGPR)
 - llvm_v16i8_ty, // sampler(SGPR)
 + llvm_v8i32_ty, // rsrc(SGPR)
 + llvm_v4i32_ty, // sampler(SGPR)
  llvm_i32_ty,   // dmask(imm)
  llvm_i32_ty,   // unorm(imm)
  llvm_i32_ty,   // r128(imm)
 @@ -74,7 +74,7 @@ let TargetPrefix = SI, isTarget = 1 in {
   class Image : Intrinsic 
 [llvm_v4f32_ty],// vdata(VGPR)
 [llvm_anyint_ty,// vaddr(VGPR)
 - llvm_v32i8_ty, // rsrc(SGPR)
 + llvm_v8i32_ty, // rsrc(SGPR)
  llvm_i32_ty,   // dmask(imm)
  llvm_i32_ty,   // unorm(imm)
  llvm_i32_ty,   // r128(imm)
 diff --git a/test/CodeGen/R600/llvm.SI.image.ll 
 b/test/CodeGen/R600/llvm.SI.image.ll
 index debe7c7..eac0b8e 100644
 --- a/test/CodeGen/R600/llvm.SI.image.ll
 +++ b/test/CodeGen/R600/llvm.SI.image.ll
 @@ -4,7 +4,7 @@
 ;CHECK: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, 
 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @image_load() #0 {
 main_body:
 -  %r = call 4 x float @llvm.SI.image.load.v4i32(4 x i32 undef, 32 x i8 
 undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 +  %r = call 4 x float @llvm.SI.image.load.v4i32(4 x i32 undef, 8 x i32 
 undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %r0 = extractelement 4 x float %r, i32 0
   %r1 = extractelement 4 x float %r, i32 1
   %r2 = extractelement 4 x float %r, i32 2
 @@ -17,7 +17,7 @@ main_body:
 ;CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, 
 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @image_load_mip() #0 {
 main_body:
 -  %r = call 4 x float @llvm.SI.image.load.mip.v4i32(4 x i32 undef, 32 x 
 i8 undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 +  %r = call 4 x float @llvm.SI.image.load.mip.v4i32(4 x i32 undef, 8 x 
 i32 undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %r0 = extractelement 4 x float %r, i32 0
   %r1 = extractelement 4 x float %r, i32 1
   %r2 = extractelement 4 x float %r, i32 2
 @@ -30,7 +30,7 @@ main_body:
 ;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, 
 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getresinfo() #0 {
 main_body:
 -  %r = call 4 x float @llvm.SI.getresinfo.i32(i32 undef, 32 x i8 undef, 
 i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 +  %r = call 4 x float @llvm.SI.getresinfo.i32(i32 undef, 8 x i32 undef, 
 i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %r0 = extractelement 4 x float %r, i32 0
   %r1 = extractelement 4 x float %r, i32 1
   %r2 = extractelement 4 x float %r, i32 2
 @@ -39,9 +39,9 @@ main_body:
   ret void
 }
 
 -declare 4 x float @llvm.SI.image.load.v4i32(4 x i32, 32 x i8, i32, 
 i32, i32, i32, i32, i32, i32, i32) #1
 

Re: [Mesa-dev] [PATCH 1/2] clover: Report a default value for CL_DEVICE_SINGLE_FP_CONFIG

2014-07-02 Thread Matt Arsenault

On Jul 2, 2014, at 12:48 PM, Tom Stellard thomas.stell...@amd.com wrote:

 ---
 src/gallium/state_trackers/clover/api/device.cpp  | 3 +--
 src/gallium/state_trackers/clover/core/device.cpp | 6 ++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)
 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index 97b2cf9..3b91e9e 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -201,8 +201,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_SINGLE_FP_CONFIG:
 -  buf.as_scalarcl_device_fp_config() =
 - CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 +  buf.as_scalarcl_device_fp_config() = dev.single_fp_config();
   break;
 
case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
 diff --git a/src/gallium/state_trackers/clover/core/device.cpp 
 b/src/gallium/state_trackers/clover/core/device.cpp
 index b6078db..498e7d9 100644
 --- a/src/gallium/state_trackers/clover/core/device.cpp
 +++ b/src/gallium/state_trackers/clover/core/device.cpp
 @@ -169,6 +169,12 @@ device::max_compute_units() const {
   PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS)[0];
 }
 
 +cl_device_fp_config
 +device::single_fp_config() const {
 +   // TODO: Get these from somewhere.
 +   return CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 +}

Probably shouldn’t include CL_FP_DENORM as the default. It should be false on 
R600/SI, and probably many other devices


 +
 std::vectorsize_t
 device::max_block_size() const {
auto v = get_compute_paramuint64_t(pipe, 
 PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
 diff --git a/src/gallium/state_trackers/clover/core/device.hpp 
 b/src/gallium/state_trackers/clover/core/device.hpp
 index 731c31e..93f9091 100644
 --- a/src/gallium/state_trackers/clover/core/device.hpp
 +++ b/src/gallium/state_trackers/clover/core/device.hpp
 @@ -63,6 +63,7 @@ namespace clover {
   cl_ulong max_mem_alloc_size() const;
   cl_uint max_clock_frequency() const;
   cl_uint max_compute_units() const;
 +  cl_device_fp_config single_fp_config() const;
 
   std::vectorsize_t max_block_size() const;
   std::string device_name() const;
 -- 
 1.8.1.5
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] clover: Report a default value for CL_DEVICE_SINGLE_FP_CONFIG

2014-07-02 Thread Matt Arsenault

On Jul 2, 2014, at 12:52 PM, Matt Arsenault arse...@gmail.com wrote:

 
 On Jul 2, 2014, at 12:48 PM, Tom Stellard thomas.stell...@amd.com wrote:
 
 ---
 src/gallium/state_trackers/clover/api/device.cpp  | 3 +--
 src/gallium/state_trackers/clover/core/device.cpp | 6 ++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)
 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index 97b2cf9..3b91e9e 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -201,8 +201,7 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
  break;
 
   case CL_DEVICE_SINGLE_FP_CONFIG:
 -  buf.as_scalarcl_device_fp_config() =
 - CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 +  buf.as_scalarcl_device_fp_config() = dev.single_fp_config();
  break;
 
   case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
 diff --git a/src/gallium/state_trackers/clover/core/device.cpp 
 b/src/gallium/state_trackers/clover/core/device.cpp
 index b6078db..498e7d9 100644
 --- a/src/gallium/state_trackers/clover/core/device.cpp
 +++ b/src/gallium/state_trackers/clover/core/device.cpp
 @@ -169,6 +169,12 @@ device::max_compute_units() const {
  PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS)[0];
 }
 
 +cl_device_fp_config
 +device::single_fp_config() const {
 +   // TODO: Get these from somewhere.
 +   return CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 +}
 
 Probably shouldn’t include CL_FP_DENORM as the default. It should be false on 
 R600/SI, and probably many other devices
 
 

The spec says the mandated minimum for single precision on non-custom devices 
is CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN, so without a device query that 
should be the default


 +
 std::vectorsize_t
 device::max_block_size() const {
   auto v = get_compute_paramuint64_t(pipe, 
 PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
 diff --git a/src/gallium/state_trackers/clover/core/device.hpp 
 b/src/gallium/state_trackers/clover/core/device.hpp
 index 731c31e..93f9091 100644
 --- a/src/gallium/state_trackers/clover/core/device.hpp
 +++ b/src/gallium/state_trackers/clover/core/device.hpp
 @@ -63,6 +63,7 @@ namespace clover {
  cl_ulong max_mem_alloc_size() const;
  cl_uint max_clock_frequency() const;
  cl_uint max_compute_units() const;
 +  cl_device_fp_config single_fp_config() const;
 
  std::vectorsize_t max_block_size() const;
  std::string device_name() const;
 -- 
 1.8.1.5
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] clover: Enable cl_khr_fp64 for devices that support doubles

2014-06-21 Thread Matt Arsenault

On Jun 21, 2014, at 9:37 AM, Francisco Jerez curroje...@riseup.net wrote:

 Tom Stellard thomas.stell...@amd.com writes:
 
 ---
 src/gallium/state_trackers/clover/api/device.cpp  | 4 +++-
 src/gallium/state_trackers/clover/core/device.cpp | 6 ++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)
 
 diff --git a/src/gallium/state_trackers/clover/api/device.cpp 
 b/src/gallium/state_trackers/clover/api/device.cpp
 index dc8e22c..275542d 100644
 --- a/src/gallium/state_trackers/clover/api/device.cpp
 +++ b/src/gallium/state_trackers/clover/api/device.cpp
 @@ -290,7 +290,9 @@ clGetDeviceInfo(cl_device_id d_dev, cl_device_info param,
   break;
 
case CL_DEVICE_EXTENSIONS:
 -  buf.as_string() = ;
 +  // The trailing space is intentional. It is a spec-ism that there is a
 +  // trailing space at the end of the list of extensions.
 
 Hm...  Really?  Where does the spec say that?
 

I’m not really sure. I remember reporting a bug before about a trailing space 
in these outputs, and getting the response that it is required by the spec, but 
I never really found where. I know it’s required for the versions (which also 
currently fail the conformance test with “A space must appear after the minor 
version! (returned: OpenCL C 1.1)”).


 +  buf.as_string() = dev.cl_khr_fp64() ? cl_khr_fp64  : ;
   break;
 
case CL_DEVICE_PLATFORM:
 diff --git a/src/gallium/state_trackers/clover/core/device.cpp 
 b/src/gallium/state_trackers/clover/core/device.cpp
 index 6d52dd4..51b54fa 100644
 --- a/src/gallium/state_trackers/clover/core/device.cpp
 +++ b/src/gallium/state_trackers/clover/core/device.cpp
 @@ -187,6 +187,12 @@ device::half_fp_config() const {
return CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST;
 }
 
 +bool
 +device::cl_khr_fp64() const {
 +   return pipe-get_shader_param(pipe, PIPE_SHADER_COMPUTE,
 + PIPE_SHADER_CAP_DOUBLES);
 +}
 +
 
 Can we call this function has_doubles() or something similar?  The
 extension name is somewhat cryptic.  Or, I don't know, maybe use
 'device::double_fp_config()' to check if doubles are supported?  As
 Bruno said, it should return 0 if doubles are not supported by the
 implementation.

The extension is also “removed” in 1.2, where doubles become an “optional core 
extension”, whatever that means; the extension name is just required to still 
be reported for compatibility.

 
 std::vectorsize_t
 device::max_block_size() const {
auto v = get_compute_paramuint64_t(pipe, 
 PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE);
 diff --git a/src/gallium/state_trackers/clover/core/device.hpp 
 b/src/gallium/state_trackers/clover/core/device.hpp
 index 380029e..38bea54 100644
 --- a/src/gallium/state_trackers/clover/core/device.hpp
 +++ b/src/gallium/state_trackers/clover/core/device.hpp
 @@ -65,6 +65,7 @@ namespace clover {
   cl_device_fp_config single_fp_config() const;
   cl_device_fp_config double_fp_config() const;
   cl_device_fp_config half_fp_config() const;
 +  bool cl_khr_fp64() const;
 
   std::vectorsize_t max_block_size() const;
   std::string device_name() const;
 -- 
 1.8.1.4
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] clover: Enable cl_khr_fp64 for devices that support doubles

2014-06-21 Thread Matt Arsenault

On Jun 21, 2014, at 12:32 PM, Francisco Jerez curroje...@riseup.net wrote:

 Matt Arsenault arse...@gmail.com writes:
 
 On Jun 21, 2014, at 9:37 AM, Francisco Jerez curroje...@riseup.net wrote:
 
 Tom Stellard thomas.stell...@amd.com writes:
 [...]
   case CL_DEVICE_EXTENSIONS:
 -  buf.as_string() = ;
 +  // The trailing space is intentional. It is a spec-ism that there 
 is a
 +  // trailing space at the end of the list of extensions.
 
 Hm...  Really?  Where does the spec say that?
 
 
 I’m not really sure. I remember reporting a bug before about a trailing 
 space in these outputs, and got the response that it is required by the 
 spec, but never really finding where. I know it’s required for the versions 
 (which also currently fails the conformance test with “A space must appear 
 after the minor version! (returned: OpenCL C 1.1)”
 
 
 That's because of the space-separated vendor-specific information that
 CL_DEVICE_VERSION is supposed to include after the minor version, it
 doesn't mean that the version string should necessarily be
 space-terminated.
 
 I don't think that the trailing space is required in the extension
 string, let's get rid of it unless you can find evidence from the spec
 that it should be otherwise.
 
 Thanks.
 

Fine with me
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeon/llvm: Adapt to AMDGPU.rsq intrinsic change in LLVM 3.5

2014-06-19 Thread Matt Arsenault

On Jun 18, 2014, at 11:53 PM, Michel Dänzer mic...@daenzer.net wrote:

 From: Michel Dänzer michel.daen...@amd.com
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
 src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 4 
 1 file changed, 4 insertions(+)
 
 diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
 b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
 index f8be0df..217fa32 100644
 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
 +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
 @@ -1384,7 +1384,11 @@ void radeon_llvm_context_init(struct 
 radeon_llvm_context * ctx)
   bld_base-op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;
 
   bld_base-rsq_action.emit = build_tgsi_intrinsic_nomem;
 +#if HAVE_LLVM = 0x0305
 + bld_base-rsq_action.intr_name = llvm.AMDGPU.rsq.;
 +#else
   bld_base-rsq_action.intr_name = llvm.AMDGPU.rsq;
 +#endif
 }
 
 void radeon_llvm_create_func(struct radeon_llvm_context * ctx,
 -- 
 2.0.0
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

While you’re at it, could you fix the old AMDIL intrinsics too?

I don’t think these have been updated yet (although the old names still 
technically work)

llvm.AMDIL.abs -> llvm.AMDGPU.abs
llvm.AMDIL.clamp -> llvm.AMDGPU.clamp
llvm.AMDIL.fraction -> llvm.AMDGPU.fract
llvm.AMDIL.exp -> llvm.exp2
llvm.AMDIL.round.nearest -> llvm.rint
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] clover: Enable cl_khr_fp64 for devices that support doubles

2014-06-17 Thread Matt Arsenault

On Jun 17, 2014, at 3:11 PM, Bruno Jimenez brunoji...@gmail.com wrote:

 Hi,
 
 I have a couple of questions about this patch:
 
 1) Could you please also change how the results of the
 'CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE' and
 'CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE' queries are generated?
 According to the spec, if 'cl_khr_fp64' is not supported, they should be
 0.
 



 2) When llvm is invoked for compiling a module, a macro definition is
 added with:
 'c.getPreprocessorOpts().addMacroDef(cl_khr_fp64);'
 Is it ok to use this macro always or should it only be used when
 'cl_khr_fp64' is supported?

It should only be added when supported. The most common way of testing for 
double support is #ifdef cl_khr_fp64.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] R600/SI: add Gather4 intrinsics (v2)

2014-06-16 Thread Matt Arsenault

On 06/16/2014 08:45 AM, Tom Stellard wrote:

You don't need to add new SDNodes for all these instructions, you can just use
the intrinsic directly in the pattern.

The only reason to add SDNodes, is if there are optimizations / special lowering
we can do for these instructions.

I kind of like having everything consistently use an SDNode
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] R600/SI: add Gather4 intrinsics

2014-06-08 Thread Matt Arsenault

On 06/06/2014 02:57 PM, Marek Olšák wrote:


DMASK was repurposed for GATHER4, so all passes which modify DMASK are
disabled by setting MIMG=0 and hasPostISelHook=0. See my Mesa patches
for how DMASK works with GATHER4, because this is not documented anywhere.

Can you add a comment explaining this to the source here?

Needs tests, other than that LGTM
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [cfe-dev] 3 element vectors in opencl 1.1+

2014-04-22 Thread Matt Arsenault

On 04/22/2014 02:35 PM, Tom Stellard wrote:

On Mon, Apr 21, 2014 at 10:02:27PM -0400, Jan Vesely wrote:

Hi,

I ran into a problem caused by this part of the OCL specs (6.1.5
Alignment of Types):
For 3-component vector data types, the size of the data type is 4 *
sizeof(component).

and the corresponding part of Khronos cl_platform.h (with all types, not
just float):
/* cl_float3 is identical in size, alignment and behavior to cl_float4.
See section 6.1.5. */
typedef  cl_float4  cl_float3;

So when I try to run kernel that takes 3 element vectors as arguments I
get 'invalid arg size' error.

Not sure whether this is best solved in clang, libclc or clover. I tried
changing float3 to have 4 elements in libclc, it caused clang to
complain in thousand places. I don't think this can be handled cleanly
in clang, unless we add something like __attribute__((padding)).

I have attached a workaround that I use now.


You may want to ask this question on the pocl mailing list as they
have likely solved this issue already.  Ideally, TD.getTypeStoreSize
would return the correct value.  Also, maybe look at the DataLayout
description for R600 and see if there is a way to specify the
correct type size.

-Tom

I think this is what v96:128 is for





any advice welcome,
Jan

--
Jan Vesely jan.ves...@rutgers.edu
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index a81bdf8..71ee01f 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -62,6 +62,8 @@
  #include llvm/IR/DataLayout.h
  #endif
  
+#include llvm/Support/Debug.h

+
  #include pipe/p_state.h
  #include util/u_memory.h
  
@@ -309,6 +311,13 @@ namespace {
  
  llvm::Type *arg_type = arg.getType();

  unsigned arg_size = TD.getTypeStoreSize(arg_type);
+if (arg_type->isVectorTy() &&
+   ::llvm::cast<::llvm::VectorType>(arg_type)->getNumElements() == 
3) {
+   ::llvm::dbgs() << "Fixing argument type: " << *arg_type
+<< " size: " << arg_size << " -> "
+<< (arg_size / 3) * 4 << "\n";
+   arg_size = (arg_size / 3) * 4;
+}
  
  llvm::Type *target_type = arg_type-isIntegerTy() ?

 TD.getSmallestLegalIntType(mod-getContext(), arg_size * 8) :





___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
cfe-dev mailing list
cfe-...@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [cfe-dev] 3 element vectors in opencl 1.1+

2014-04-22 Thread Matt Arsenault

On 04/22/2014 05:22 PM, Jan Vesely wrote:

On Tue, 2014-04-22 at 14:40 -0700, Matt Arsenault wrote:

On 04/22/2014 02:35 PM, Tom Stellard wrote:

On Mon, Apr 21, 2014 at 10:02:27PM -0400, Jan Vesely wrote:

Hi,

I ran into a problem caused by this part of the OCL specs (6.1.5
Alignment of Types):
For 3-component vector data types, the size of the data type is 4 *
sizeof(component).

and the corresponding part of Khronos cl_platform.h (with all types, not
just float):
/* cl_float3 is identical in size, alignment and behavior to cl_float4.
See section 6.1.5. */
typedef  cl_float4  cl_float3;

So when I try to run a kernel that takes 3-element vectors as arguments, I
get an 'invalid arg size' error.

Not sure whether this is best solved in clang, libclc or clover. I tried
changing float3 to have 4 elements in libclc, it caused clang to
complain in a thousand places. I don't think this can be handled cleanly
in clang, unless we add something like __attribute__((padding)).

I have attached a workaround that I use now.


You may want to ask this question on the pocl mailing list as they
have likely solved this issue already.  Ideally, TD.getTypeStoreSize
would return the correct value.  Also, maybe look at the DataLayout
description for R600 and see if there is a way to specify the
correct type size.

-Tom

I think this is what v96:128 is for

according to [0], it specifies only alignment, not size. I could not
find an __attribute__ that would change size either.

It should be possible to have an AMDGPUDataLayout: public DataLayout class
that would intercept the call and fix the reported value, but I think it
would only move the hack to different place.

I have added pocl-devel list as suggested.

regards,
Jan

[0]http://llvm.org/docs/LangRef.html#data-layout



Only the size in memory matters, which is what the required alignment 
specifies. DataLayout::getTypeAllocSize accounts for the alignment, but 
getTypeStoreSize does not. I actually thought this was half of what 
getTypeStoreSize was for, but it turns out it isn't.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] R600: Verify all instructions in the AsmPrinter on debug builds

2014-02-25 Thread Matt Arsenault

On 02/25/2014 02:01 PM, Eric Christopher wrote:

Also it probably shouldn't reference an external bug reporter.

-eric


There's a bug report URL set at build configure time that you could use (which is where I 
assume the llvm.org/bugs link in the crash output comes from).


On Tue, Feb 25, 2014 at 1:54 PM, Tom Stellard t...@stellard.net wrote:

On Tue, Feb 25, 2014 at 01:47:17PM -0800, Matt Arsenault wrote:

On 02/25/2014 01:42 PM, Tom Stellard wrote:

+errs() << "Please file a bug a bugs.freedesktop.org\n";

Typo, s/a/at/


Thanks, I will fix this before I commit.

-Tom


___
llvm-commits mailing list
llvm-comm...@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

___
llvm-commits mailing list
llvm-comm...@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] R600: Verify all instructions in the AsmPrinter on debug builds

2014-02-25 Thread Matt Arsenault

On 02/25/2014 01:42 PM, Tom Stellard wrote:

+errs() << "Please file a bug a bugs.freedesktop.org\n";

Typo, s/a/at/

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD

2014-02-13 Thread Matt Arsenault

On Feb 7, 2014, at 7:46 AM, Tom Stellard t...@stellard.net wrote:

 From: Tom Stellard thomas.stell...@amd.com
 
 ---
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++
 lib/Target/R600/SIISelLowering.cpp | 29 
 lib/Target/R600/SIISelLowering.h   |  1 -
 test/CodeGen/R600/add.ll   | 10 +++
 test/CodeGen/R600/add_i64.ll   | 23 +++-
 5 files changed, 75 insertions(+), 36 deletions(-)
 
 diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp 
 b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
 index a989135..fea875c 100644
 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
 +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
 @@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   switch (Opc) {
   default: break;
 +  // We are selecting i64 ADD here instead of custom lower it during
 +  // DAG legalization, so we can fold some i64 ADDs used for address
 +  // calculation into the LOAD and STORE instructions.
 +  case ISD::ADD: {
 +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget();
 +if (N-getValueType(0) != MVT::i64 ||
 +ST.getGeneration()  AMDGPUSubtarget::SOUTHERN_ISLANDS)
 +  break;
 +
 +SDLoc DL(N);
 +SDValue LHS = N-getOperand(0);
 +SDValue RHS = N-getOperand(1);
 +
 +SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32);
 +SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32);
 +
 +SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
 + DL, MVT::i32, LHS, Sub0);
 +SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
 + DL, MVT::i32, LHS, Sub1);
 +
 +SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
 + DL, MVT::i32, RHS, Sub0);
 +SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
 + DL, MVT::i32, RHS, Sub1);
 +
 +SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue);
 +
 +SmallVectorSDValue, 8 AddLoArgs;
 +AddLoArgs.push_back(SDValue(Lo0, 0));
 +AddLoArgs.push_back(SDValue(Lo1, 0));
 +
 +SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL,
 +   VTList, AddLoArgs);
 +SDValue Carry = SDValue(AddLo, 1);
 +SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL,
 +   MVT::i32, SDValue(Hi0, 0),
 +   SDValue(Hi1, 0), Carry);
 +
 +SDValue Args[5] = {
 +  CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
 +  SDValue(AddLo,0),
 +  Sub0,
 +  SDValue(AddHi,0),
 +  Sub1,
 +};
 +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
 +  }
   case ISD::BUILD_VECTOR: {
 unsigned RegClassID;
 const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget();
 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index 0a22d16..4d2f370 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
 
 -  setOperationAction(ISD::ADD, MVT::i64, Legal);

Would it be better to mark this as custom lowered, and then just return 
SDValue() for it? That way it won’t be incorrectly reported as legal for 
anything that might be checking.

   setOperationAction(ISD::ADD, MVT::i32, Legal);
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
 @@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
 SelectionDAG DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 -  case ISD::ADD: return LowerADD(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
 LoadSDNode *Load = dyn_castLoadSDNode(Op);
 @@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
 SelectionDAG DAG) const {
   return SDValue();
 }
 
 -SDValue SITargetLowering::LowerADD(SDValue Op,
 -   SelectionDAG DAG) const {
 -  if (Op.getValueType() != MVT::i64)
 -return SDValue();
 -
 -  SDLoc DL(Op);
 -  SDValue LHS = Op.getOperand(0);
 -  SDValue RHS = Op.getOperand(1);
 -
 -  SDValue Zero = DAG.getConstant(0, MVT::i32);
 -  SDValue One = DAG.getConstant(1, MVT::i32);
 -
 -  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
 -  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
 -
 -  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
 -  SDValue Hi1 = 

Re: [Mesa-dev] [PATCH] R600/SI: Split global vector loads with more than 4 elements

2014-02-10 Thread Matt Arsenault
Why would you want to do this for the small types? You should be able to 
load those in fewer loads and then promote them.


On 02/10/2014 01:32 PM, Tom Stellard wrote:

From: Tom Stellard thomas.stell...@amd.com

---
  lib/Target/R600/SIISelLowering.cpp |   8 +-
  test/CodeGen/R600/load.ll  | 178 +++--
  2 files changed, 98 insertions(+), 88 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 9537405..eb08a13 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -478,9 +478,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
  LoadSDNode *Load = dyn_castLoadSDNode(Op);
-if ((Load-getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Load-getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) 
-Op.getValueType().isVector()) {
+if (Op.getValueType().isVector() 
+(Load-getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Load-getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+ (Load-getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS 
+  Op.getValueType().getVectorNumElements()  4))) {
SDValue MergedValues[2] = {
  SplitVectorLoad(Op, DAG),
  Load-getChain()
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index 0153524..1486c4d 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,16 +1,15 @@
-; RUN: llc  %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK %s
-; RUN: llc  %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
%s
-; RUN: llc  %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK  %s
+; RUN: llc  %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK --check-prefix=FUNC %s
+; RUN: llc  %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
--check-prefix=FUNC %s
+; RUN: llc  %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK --check-prefix=FUNC %s
  
  ;======;

  ; GLOBAL ADDRESS SPACE
  
;======;
  
  ; Load an i8 value from the global address space.

-; R600-CHECK-LABEL: @load_i8
+; FUNC-LABEL: @load_i8
  ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
  
-; SI-CHECK-LABEL: @load_i8

  ; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
  define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
@@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 
addrspace(1)* %in) {
ret void
  }
  
-; R600-CHECK-LABEL: @load_i8_sext

+; FUNC-LABEL: @load_i8_sext
  ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
  ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
  ; R600-CHECK: 24
  ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
  ; R600-CHECK: 24
-; SI-CHECK-LABEL: @load_i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  entry:
@@ -35,10 +33,9 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v2i8

+; FUNC-LABEL: @load_v2i8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v2i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  define void @load_v2i8(2 x i32 addrspace(1)* %out, 2 x i8 addrspace(1)* 
%in) {
@@ -49,7 +46,7 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v2i8_sext

+; FUNC-LABEL: @load_v2i8_sext
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
  ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -60,7 +57,6 @@ entry:
  ; R600-CHECK-DAG: 24
  ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
  ; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v2i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_v2i8_sext(2 x i32 addrspace(1)* %out, 2 x i8 
addrspace(1)* %in) {
@@ -71,12 +67,11 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v4i8

+; FUNC-LABEL: @load_v4i8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v4i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
@@ -89,7 +84,7 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v4i8_sext

+; FUNC-LABEL: @load_v4i8_sext
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -110,7 +105,6 @@ entry:
  ; R600-CHECK-DAG: 24
  ; R600-CHECK-DAG: ASHR {{[* 

Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD

2014-02-08 Thread Matt Arsenault

I didn't think to try this. Where is the address folding happening?

On 02/07/2014 07:46 AM, Tom Stellard wrote:

From: Tom Stellard thomas.stell...@amd.com

---
  lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++
  lib/Target/R600/SIISelLowering.cpp | 29 
  lib/Target/R600/SIISelLowering.h   |  1 -
  test/CodeGen/R600/add.ll   | 10 +++
  test/CodeGen/R600/add_i64.ll   | 23 +++-
  5 files changed, 75 insertions(+), 36 deletions(-)

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp 
b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index a989135..fea875c 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
switch (Opc) {
default: break;
+  // We are selecting i64 ADD here instead of custom lower it during
+  // DAG legalization, so we can fold some i64 ADDs used for address
+  // calculation into the LOAD and STORE instructions.
+  case ISD::ADD: {
+const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget();
+if (N-getValueType(0) != MVT::i64 ||
+ST.getGeneration()  AMDGPUSubtarget::SOUTHERN_ISLANDS)
+  break;
+
+SDLoc DL(N);
+SDValue LHS = N-getOperand(0);
+SDValue RHS = N-getOperand(1);
+
+SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32);
+SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub0);
+SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub1);
+
+SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub0);
+SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub1);
+
+SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue);
+
+SmallVectorSDValue, 8 AddLoArgs;
+AddLoArgs.push_back(SDValue(Lo0, 0));
+AddLoArgs.push_back(SDValue(Lo1, 0));
+
+SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL,
+   VTList, AddLoArgs);
+SDValue Carry = SDValue(AddLo, 1);
+SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL,
+   MVT::i32, SDValue(Hi0, 0),
+   SDValue(Hi1, 0), Carry);
+
+SDValue Args[5] = {
+  CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+  SDValue(AddLo,0),
+  Sub0,
+  SDValue(AddHi,0),
+  Sub1,
+};
+return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
+  }
case ISD::BUILD_VECTOR: {
  unsigned RegClassID;
  const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget();
diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 0a22d16..4d2f370 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
  
-  setOperationAction(ISD::ADD, MVT::i64, Legal);

setOperationAction(ISD::ADD, MVT::i32, Legal);
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
@@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::ADD: return LowerADD(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
  LoadSDNode *Load = dyn_castLoadSDNode(Op);
@@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const {
return SDValue();
  }
  
-SDValue SITargetLowering::LowerADD(SDValue Op,

-   SelectionDAG DAG) const {
-  if (Op.getValueType() != MVT::i64)
-return SDValue();
-
-  SDLoc DL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-
-  SDValue Zero = DAG.getConstant(0, MVT::i32);
-  SDValue One = DAG.getConstant(1, MVT::i32);
-
-  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
-  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
-
-  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
-  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
-
-  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
-
-  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
-  SDValue Carry = 

Re: [Mesa-dev] PATCH: R600 + SI Private memory fixes; Use more SALU instructions on SI

2013-10-10 Thread Matt Arsenault

On 10/10/2013 10:55 AM, Tom Stellard wrote:

Hi,

The attached patches simplify the handling of OpenCL private memory
space for VLIW4/VLIW5 GPUs and should fix a crash with pyrit on r600g.
Also included in the series is private memory support on SI as well as an
optimization to prefer selecting SALU instructions over VALU instructions.

Please test/review.

-Tom


___
llvm-commits mailing list
llvm-comm...@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

The last one doesn't apply for me:

$ git am *.patch
Applying: R600: Remove unused InstrInfo::getMovImmInstr() function
Applying: R600: Simplify handling of private address space
Applying: R600/SI: Prefer SALU instructions for bit shift operations
Applying: R600/SI: Add support for i64 bitwise or
Applying: R600/SI: Add support for private address space load/store
error: patch failed: lib/Target/R600/SIRegisterInfo.cpp:26
error: lib/Target/R600/SIRegisterInfo.cpp: patch does not apply
Patch failed at 0005 R600/SI: Add support for private address space 
load/store

The copy of the patch that failed is found in:
   /home/marsenau/src/llvm/.git/rebase-apply/patch
When you have resolved this problem, run git am --resolved.
If you prefer to skip this patch, run git am --skip instead.
To restore the original branch and stop patching, run git am --abort.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] R600/SI: FMA is faster than fmul and fadd for f64

2013-08-09 Thread Matt Arsenault

On 08/09/2013 05:59 AM, Niels Ole Salscheider wrote:

+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+return false; /* There is V_MAD_F32 for f32 */

I don't think the f32 case is false for all devices.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patches: Add support for the local address space

2013-06-12 Thread Matt Arsenault

On 06/12/2013 05:42 PM, Tom Stellard wrote:

Hi,

The attached patches add support for local address space on
Evergreen / Northern Islands GPUs.

Please Review.

-Tom

 +  def int_AMDGPU_barrier_local  : Intrinsic[], [], [];
You probably want to mark this as IntrReadMem to try to avoid reordering 
stores around the barrier.


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev