Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

2017-05-18 Thread Nicolai Hähnle

On 18.05.2017 12:43, Marek Olšák wrote:

On Thu, May 18, 2017 at 12:41 PM, Marek Olšák  wrote:

On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle  wrote:

On 17.05.2017 21:38, Marek Olšák wrote:


From: Marek Olšák 

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c   | 28 ++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85
-
 src/gallium/drivers/radeonsi/si_state.h | 18 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c
b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
return NULL;
}
}

return program;
 }

 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
struct si_context *sctx = (struct si_context*)ctx;
-   sctx->cs_shader_state.program = (struct si_compute*)state;
+   struct si_compute *program = (struct si_compute*)state;
+
+   sctx->cs_shader_state.program = program;
+   if (!program)
+   return;
+
+   /* Wait because we need active slot usage masks. */
+   if (program->ir_type == PIPE_SHADER_IR_TGSI)
+   util_queue_fence_wait(&program->ready);
+
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+
SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+
program->active_const_and_shader_buffers);
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ program->active_samplers_and_images);
 }

 static void si_set_global_binding(
struct pipe_context *ctx, unsigned first, unsigned n,
struct pipe_resource **resources,
uint32_t **handles)
 {
unsigned i;
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
bool cs_regalloc_hang =
(sctx->b.chip_class == SI ||
 sctx->b.family == CHIP_BONAIRE ||
 sctx->b.family == CHIP_KABINI) &&
info->block[0] * info->block[1] * info->block[2] > 256;

if (cs_regalloc_hang)
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 SI_CONTEXT_CS_PARTIAL_FLUSH;

-   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-   util_queue_fence_wait(&program->ready);
-
-   if (program->shader.compilation_failed)
-   return;
-   }
+   if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+   program->shader.compilation_failed)
+   return;

si_decompress_compute_textures(sctx);

/* Add buffer sizes for memory checking in need_cs_space. */
r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
/* TODO: add the scratch buffer */

if (info->indirect) {
r600_context_add_resource_size(ctx, info->indirect);

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct
si_descriptors *desc,
}
 }

 static void si_release_descriptors(struct si_descriptors *desc)
 {
r600_resource_reference(&desc->buffer, NULL);
FREE(desc->list);
 }

 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
unsigned size,
-unsigned *out_offset, struct r600_resource
**out_buf) {
+unsigned *out_offset, struct r600_resource
**out_buf)
+{
uint64_t va;

u_suballocator_alloc(sctx->ce_suballocator, size,
-sctx->screen->b.info.tcc_cache_line_size,
-out_offset, (struct pipe_resource**)out_buf);
+si_optimal_tcc_alignment(sctx, size),
+(unsigned*)out_offset,



The extra cast of out_offset is unnecessary.



+(struct pipe_resource**)out_buf);
if (!out_buf)
return false;

va = (*out_buf)->gpu_address + *out_offset;

radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
radeon_emit(sctx->ce_ib, ce_offset);
radeon_emit(sctx->ce_ib, size / 4);
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 

Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

2017-05-18 Thread Marek Olšák
On Thu, May 18, 2017 at 12:41 PM, Marek Olšák  wrote:
> On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle  wrote:
>> On 17.05.2017 21:38, Marek Olšák wrote:
>>>
>>> From: Marek Olšák 
>>>
>>> This decreases the size of CE RAM dumps to L2, or the size of descriptor
>>> uploads without CE.
>>> ---
>>>  src/gallium/drivers/radeonsi/si_compute.c   | 28 ++--
>>>  src/gallium/drivers/radeonsi/si_descriptors.c   | 85
>>> -
>>>  src/gallium/drivers/radeonsi/si_state.h | 18 +-
>>>  src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
>>>  4 files changed, 113 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 22ef111..4c98066 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>>> return NULL;
>>> }
>>> }
>>>
>>> return program;
>>>  }
>>>
>>>  static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>>>  {
>>> struct si_context *sctx = (struct si_context*)ctx;
>>> -   sctx->cs_shader_state.program = (struct si_compute*)state;
>>> +   struct si_compute *program = (struct si_compute*)state;
>>> +
>>> +   sctx->cs_shader_state.program = program;
>>> +   if (!program)
>>> +   return;
>>> +
>>> +   /* Wait because we need active slot usage masks. */
>>> +   if (program->ir_type == PIPE_SHADER_IR_TGSI)
>>> +   util_queue_fence_wait(>ready);
>>> +
>>> +   si_set_active_descriptors(sctx,
>>> + SI_DESCS_FIRST_COMPUTE +
>>> +
>>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
>>> +
>>> program->active_const_and_shader_buffers);
>>> +   si_set_active_descriptors(sctx,
>>> + SI_DESCS_FIRST_COMPUTE +
>>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
>>> + program->active_samplers_and_images);
>>>  }
>>>
>>>  static void si_set_global_binding(
>>> struct pipe_context *ctx, unsigned first, unsigned n,
>>> struct pipe_resource **resources,
>>> uint32_t **handles)
>>>  {
>>> unsigned i;
>>> struct si_context *sctx = (struct si_context*)ctx;
>>> struct si_compute *program = sctx->cs_shader_state.program;
>>> @@ -749,26 +766,23 @@ static void si_launch_grid(
>>> bool cs_regalloc_hang =
>>> (sctx->b.chip_class == SI ||
>>>  sctx->b.family == CHIP_BONAIRE ||
>>>  sctx->b.family == CHIP_KABINI) &&
>>> info->block[0] * info->block[1] * info->block[2] > 256;
>>>
>>> if (cs_regalloc_hang)
>>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>>  SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>> -   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
>>> -   util_queue_fence_wait(>ready);
>>> -
>>> -   if (program->shader.compilation_failed)
>>> -   return;
>>> -   }
>>> +   if (program->ir_type == PIPE_SHADER_IR_TGSI &&
>>> +   program->shader.compilation_failed)
>>> +   return;
>>>
>>> si_decompress_compute_textures(sctx);
>>>
>>> /* Add buffer sizes for memory checking in need_cs_space. */
>>> r600_context_add_resource_size(ctx, >shader.bo->b.b);
>>> /* TODO: add the scratch buffer */
>>>
>>> if (info->indirect) {
>>> r600_context_add_resource_size(ctx, info->indirect);
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 38e4ae1..a2f40a8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct
>>> si_descriptors *desc,
>>> }
>>>  }
>>>
>>>  static void si_release_descriptors(struct si_descriptors *desc)
>>>  {
>>> r600_resource_reference(>buffer, NULL);
>>> FREE(desc->list);
>>>  }
>>>
>>>  static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
>>> unsigned size,
>>> -unsigned *out_offset, struct r600_resource
>>> **out_buf) {
>>> +unsigned *out_offset, struct r600_resource
>>> **out_buf)
>>> +{
>>> uint64_t va;
>>>
>>> u_suballocator_alloc(sctx->ce_suballocator, size,
>>> -sctx->screen->b.info.tcc_cache_line_size,
>>> -out_offset, (struct pipe_resource**)out_buf);
>>> +si_optimal_tcc_alignment(sctx, size),
>>> +(unsigned*)out_offset,
>>
>>
>> The extra cast of out_offset 

Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

2017-05-18 Thread Marek Olšák
On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle  wrote:
> On 17.05.2017 21:38, Marek Olšák wrote:
>>
>> From: Marek Olšák 
>>
>> This decreases the size of CE RAM dumps to L2, or the size of descriptor
>> uploads without CE.
>> ---
>>  src/gallium/drivers/radeonsi/si_compute.c   | 28 ++--
>>  src/gallium/drivers/radeonsi/si_descriptors.c   | 85
>> -
>>  src/gallium/drivers/radeonsi/si_state.h | 18 +-
>>  src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
>>  4 files changed, 113 insertions(+), 24 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 22ef111..4c98066 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>> return NULL;
>> }
>> }
>>
>> return program;
>>  }
>>
>>  static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>>  {
>> struct si_context *sctx = (struct si_context*)ctx;
>> -   sctx->cs_shader_state.program = (struct si_compute*)state;
>> +   struct si_compute *program = (struct si_compute*)state;
>> +
>> +   sctx->cs_shader_state.program = program;
>> +   if (!program)
>> +   return;
>> +
>> +   /* Wait because we need active slot usage masks. */
>> +   if (program->ir_type == PIPE_SHADER_IR_TGSI)
>> +   util_queue_fence_wait(>ready);
>> +
>> +   si_set_active_descriptors(sctx,
>> + SI_DESCS_FIRST_COMPUTE +
>> +
>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
>> +
>> program->active_const_and_shader_buffers);
>> +   si_set_active_descriptors(sctx,
>> + SI_DESCS_FIRST_COMPUTE +
>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
>> + program->active_samplers_and_images);
>>  }
>>
>>  static void si_set_global_binding(
>> struct pipe_context *ctx, unsigned first, unsigned n,
>> struct pipe_resource **resources,
>> uint32_t **handles)
>>  {
>> unsigned i;
>> struct si_context *sctx = (struct si_context*)ctx;
>> struct si_compute *program = sctx->cs_shader_state.program;
>> @@ -749,26 +766,23 @@ static void si_launch_grid(
>> bool cs_regalloc_hang =
>> (sctx->b.chip_class == SI ||
>>  sctx->b.family == CHIP_BONAIRE ||
>>  sctx->b.family == CHIP_KABINI) &&
>> info->block[0] * info->block[1] * info->block[2] > 256;
>>
>> if (cs_regalloc_hang)
>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>  SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>> -   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
>> -   util_queue_fence_wait(>ready);
>> -
>> -   if (program->shader.compilation_failed)
>> -   return;
>> -   }
>> +   if (program->ir_type == PIPE_SHADER_IR_TGSI &&
>> +   program->shader.compilation_failed)
>> +   return;
>>
>> si_decompress_compute_textures(sctx);
>>
>> /* Add buffer sizes for memory checking in need_cs_space. */
>> r600_context_add_resource_size(ctx, >shader.bo->b.b);
>> /* TODO: add the scratch buffer */
>>
>> if (info->indirect) {
>> r600_context_add_resource_size(ctx, info->indirect);
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 38e4ae1..a2f40a8 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct
>> si_descriptors *desc,
>> }
>>  }
>>
>>  static void si_release_descriptors(struct si_descriptors *desc)
>>  {
>> r600_resource_reference(>buffer, NULL);
>> FREE(desc->list);
>>  }
>>
>>  static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
>> unsigned size,
>> -unsigned *out_offset, struct r600_resource
>> **out_buf) {
>> +unsigned *out_offset, struct r600_resource
>> **out_buf)
>> +{
>> uint64_t va;
>>
>> u_suballocator_alloc(sctx->ce_suballocator, size,
>> -sctx->screen->b.info.tcc_cache_line_size,
>> -out_offset, (struct pipe_resource**)out_buf);
>> +si_optimal_tcc_alignment(sctx, size),
>> +(unsigned*)out_offset,
>
>
> The extra cast of out_offset is unnecessary.
>
>
>> +(struct pipe_resource**)out_buf);
>> if (!out_buf)
>> return false;
>>
>> va = (*out_buf)->gpu_address + 

Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

2017-05-18 Thread Nicolai Hähnle

On 17.05.2017 21:38, Marek Olšák wrote:

From: Marek Olšák 

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c   | 28 ++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85 -
 src/gallium/drivers/radeonsi/si_state.h | 18 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
return NULL;
}
}

return program;
 }

 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
struct si_context *sctx = (struct si_context*)ctx;
-   sctx->cs_shader_state.program = (struct si_compute*)state;
+   struct si_compute *program = (struct si_compute*)state;
+
+   sctx->cs_shader_state.program = program;
+   if (!program)
+   return;
+
+   /* Wait because we need active slot usage masks. */
+   if (program->ir_type == PIPE_SHADER_IR_TGSI)
+   util_queue_fence_wait(&program->ready);
+
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+ program->active_const_and_shader_buffers);
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ program->active_samplers_and_images);
 }

 static void si_set_global_binding(
struct pipe_context *ctx, unsigned first, unsigned n,
struct pipe_resource **resources,
uint32_t **handles)
 {
unsigned i;
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
bool cs_regalloc_hang =
(sctx->b.chip_class == SI ||
 sctx->b.family == CHIP_BONAIRE ||
 sctx->b.family == CHIP_KABINI) &&
info->block[0] * info->block[1] * info->block[2] > 256;

if (cs_regalloc_hang)
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 SI_CONTEXT_CS_PARTIAL_FLUSH;

-   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-   util_queue_fence_wait(&program->ready);
-
-   if (program->shader.compilation_failed)
-   return;
-   }
+   if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+   program->shader.compilation_failed)
+   return;

si_decompress_compute_textures(sctx);

/* Add buffer sizes for memory checking in need_cs_space. */
r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
/* TODO: add the scratch buffer */

if (info->indirect) {
r600_context_add_resource_size(ctx, info->indirect);

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
}
 }

 static void si_release_descriptors(struct si_descriptors *desc)
 {
r600_resource_reference(&desc->buffer, NULL);
FREE(desc->list);
 }

 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned 
size,
-unsigned *out_offset, struct r600_resource **out_buf) {
+unsigned *out_offset, struct r600_resource **out_buf)
+{
uint64_t va;

u_suballocator_alloc(sctx->ce_suballocator, size,
-sctx->screen->b.info.tcc_cache_line_size,
-out_offset, (struct pipe_resource**)out_buf);
+si_optimal_tcc_alignment(sctx, size),
+(unsigned*)out_offset,


The extra cast of out_offset is unnecessary.



+(struct pipe_resource**)out_buf);
if (!out_buf)
return false;

va = (*out_buf)->gpu_address + *out_offset;

radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
radeon_emit(sctx->ce_ib, ce_offset);
radeon_emit(sctx->ce_ib, size / 4);
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
radeon_emit(ib, 

[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

2017-05-17 Thread Marek Olšák
From: Marek Olšák 

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c   | 28 ++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85 -
 src/gallium/drivers/radeonsi/si_state.h | 18 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
return NULL;
}
}
 
return program;
 }
 
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
struct si_context *sctx = (struct si_context*)ctx;
-   sctx->cs_shader_state.program = (struct si_compute*)state;
+   struct si_compute *program = (struct si_compute*)state;
+
+   sctx->cs_shader_state.program = program;
+   if (!program)
+   return;
+
+   /* Wait because we need active slot usage masks. */
+   if (program->ir_type == PIPE_SHADER_IR_TGSI)
+   util_queue_fence_wait(&program->ready);
+
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+ program->active_const_and_shader_buffers);
+   si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ program->active_samplers_and_images);
 }
 
 static void si_set_global_binding(
struct pipe_context *ctx, unsigned first, unsigned n,
struct pipe_resource **resources,
uint32_t **handles)
 {
unsigned i;
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
bool cs_regalloc_hang =
(sctx->b.chip_class == SI ||
 sctx->b.family == CHIP_BONAIRE ||
 sctx->b.family == CHIP_KABINI) &&
info->block[0] * info->block[1] * info->block[2] > 256;
 
if (cs_regalloc_hang)
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
-   if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-   util_queue_fence_wait(&program->ready);
-
-   if (program->shader.compilation_failed)
-   return;
-   }
+   if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+   program->shader.compilation_failed)
+   return;
 
si_decompress_compute_textures(sctx);
 
/* Add buffer sizes for memory checking in need_cs_space. */
r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
/* TODO: add the scratch buffer */
 
if (info->indirect) {
r600_context_add_resource_size(ctx, info->indirect);
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
r600_resource_reference(&desc->buffer, NULL);
FREE(desc->list);
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned 
size,
-unsigned *out_offset, struct r600_resource **out_buf) {
+unsigned *out_offset, struct r600_resource **out_buf)
+{
uint64_t va;
 
u_suballocator_alloc(sctx->ce_suballocator, size,
-sctx->screen->b.info.tcc_cache_line_size,
-out_offset, (struct pipe_resource**)out_buf);
+si_optimal_tcc_alignment(sctx, size),
+(unsigned*)out_offset,
+(struct pipe_resource**)out_buf);
if (!out_buf)
return false;
 
va = (*out_buf)->gpu_address + *out_offset;
 
radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
radeon_emit(sctx->ce_ib, ce_offset);
radeon_emit(sctx->ce_ib, size / 4);
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |