Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
On 18.05.2017 12:43, Marek Olšák wrote: On Thu, May 18, 2017 at 12:41 PM, Marek Olšák wrote: On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle wrote: On 17.05.2017 21:38, Marek Olšák wrote: From: Marek Olšák This decreases the size of CE RAM dumps to L2, or the size of descriptor uploads without CE. --- src/gallium/drivers/radeonsi/si_compute.c | 28 ++-- src/gallium/drivers/radeonsi/si_descriptors.c | 85 - src/gallium/drivers/radeonsi/si_state.h | 18 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ 4 files changed, 113 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22ef111..4c98066 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -201,21 +201,38 @@ static void *si_create_compute_state( return NULL; } } return program; } static void si_bind_compute_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context*)ctx; - sctx->cs_shader_state.program = (struct si_compute*)state; + struct si_compute *program = (struct si_compute*)state; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. 
*/ + if (program->ir_type == PIPE_SHADER_IR_TGSI) + util_queue_fence_wait(&program->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + program->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + program->active_samplers_and_images); } static void si_set_global_binding( struct pipe_context *ctx, unsigned first, unsigned n, struct pipe_resource **resources, uint32_t **handles) { unsigned i; struct si_context *sctx = (struct si_context*)ctx; struct si_compute *program = sctx->cs_shader_state.program; @@ -749,26 +766,23 @@ static void si_launch_grid( bool cs_regalloc_hang = (sctx->b.chip_class == SI || sctx->b.family == CHIP_BONAIRE || sctx->b.family == CHIP_KABINI) && info->block[0] * info->block[1] * info->block[2] > 256; if (cs_regalloc_hang) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - util_queue_fence_wait(&program->ready); - - if (program->shader.compilation_failed) - return; - } + if (program->ir_type == PIPE_SHADER_IR_TGSI && + program->shader.compilation_failed) + return; si_decompress_compute_textures(sctx); /* Add buffer sizes for memory checking in need_cs_space. 
*/ r600_context_add_resource_size(ctx, &program->shader.bo->b.b); /* TODO: add the scratch buffer */ if (info->indirect) { r600_context_add_resource_size(ctx, info->indirect); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 38e4ae1..a2f40a8 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc, } } static void si_release_descriptors(struct si_descriptors *desc) { r600_resource_reference(&desc->buffer, NULL); FREE(desc->list); } static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, -unsigned *out_offset, struct r600_resource **out_buf) { +unsigned *out_offset, struct r600_resource **out_buf) +{ uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, -sctx->screen->b.info.tcc_cache_line_size, -out_offset, (struct pipe_resource**)out_buf); +si_optimal_tcc_alignment(sctx, size), +(unsigned*)out_offset, The extra cast of out_offset is unnecessary. +(struct pipe_resource**)out_buf); if (!out_buf) return false; va = (*out_buf)->gpu_address + *out_offset; radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); radeon_emit(sctx->ce_ib, ce_offset); radeon_emit(sctx->ce_ib, size / 4); radeon_emit(sctx->ce_ib, va); radeon_emit(sctx->ce_ib, va >>
Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
On Thu, May 18, 2017 at 12:41 PM, Marek Olšák wrote: > On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle wrote: >> On 17.05.2017 21:38, Marek Olšák wrote: >>> >>> From: Marek Olšák >>> >>> This decreases the size of CE RAM dumps to L2, or the size of descriptor >>> uploads without CE. >>> --- >>> src/gallium/drivers/radeonsi/si_compute.c | 28 ++-- >>> src/gallium/drivers/radeonsi/si_descriptors.c | 85 >>> - >>> src/gallium/drivers/radeonsi/si_state.h | 18 +- >>> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ >>> 4 files changed, 113 insertions(+), 24 deletions(-) >>> >>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >>> b/src/gallium/drivers/radeonsi/si_compute.c >>> index 22ef111..4c98066 100644 >>> --- a/src/gallium/drivers/radeonsi/si_compute.c >>> +++ b/src/gallium/drivers/radeonsi/si_compute.c >>> @@ -201,21 +201,38 @@ static void *si_create_compute_state( >>> return NULL; >>> } >>> } >>> >>> return program; >>> } >>> >>> static void si_bind_compute_state(struct pipe_context *ctx, void *state) >>> { >>> struct si_context *sctx = (struct si_context*)ctx; >>> - sctx->cs_shader_state.program = (struct si_compute*)state; >>> + struct si_compute *program = (struct si_compute*)state; >>> + >>> + sctx->cs_shader_state.program = program; >>> + if (!program) >>> + return; >>> + >>> + /* Wait because we need active slot usage masks. 
*/ >>> + if (program->ir_type == PIPE_SHADER_IR_TGSI) >>> + util_queue_fence_wait(&program->ready); >>> + >>> + si_set_active_descriptors(sctx, >>> + SI_DESCS_FIRST_COMPUTE + >>> + >>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, >>> + >>> program->active_const_and_shader_buffers); >>> + si_set_active_descriptors(sctx, >>> + SI_DESCS_FIRST_COMPUTE + >>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, >>> + program->active_samplers_and_images); >>> } >>> >>> static void si_set_global_binding( >>> struct pipe_context *ctx, unsigned first, unsigned n, >>> struct pipe_resource **resources, >>> uint32_t **handles) >>> { >>> unsigned i; >>> struct si_context *sctx = (struct si_context*)ctx; >>> struct si_compute *program = sctx->cs_shader_state.program; >>> @@ -749,26 +766,23 @@ static void si_launch_grid( >>> bool cs_regalloc_hang = >>> (sctx->b.chip_class == SI || >>> sctx->b.family == CHIP_BONAIRE || >>> sctx->b.family == CHIP_KABINI) && >>> info->block[0] * info->block[1] * info->block[2] > 256; >>> >>> if (cs_regalloc_hang) >>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >>> SI_CONTEXT_CS_PARTIAL_FLUSH; >>> >>> - if (program->ir_type == PIPE_SHADER_IR_TGSI) { >>> - util_queue_fence_wait(&program->ready); >>> - >>> - if (program->shader.compilation_failed) >>> - return; >>> - } >>> + if (program->ir_type == PIPE_SHADER_IR_TGSI && >>> + program->shader.compilation_failed) >>> + return; >>> >>> si_decompress_compute_textures(sctx); >>> >>> /* Add buffer sizes for memory checking in need_cs_space. 
*/ >>> r600_context_add_resource_size(ctx, &program->shader.bo->b.b); >>> /* TODO: add the scratch buffer */ >>> >>> if (info->indirect) { >>> r600_context_add_resource_size(ctx, info->indirect); >>> >>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c >>> b/src/gallium/drivers/radeonsi/si_descriptors.c >>> index 38e4ae1..a2f40a8 100644 >>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c >>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c >>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct >>> si_descriptors *desc, >>> } >>> } >>> >>> static void si_release_descriptors(struct si_descriptors *desc) >>> { >>> r600_resource_reference(&desc->buffer, NULL); >>> FREE(desc->list); >>> } >>> >>> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, >>> unsigned size, >>> -unsigned *out_offset, struct r600_resource >>> **out_buf) { >>> +unsigned *out_offset, struct r600_resource >>> **out_buf) >>> +{ >>> uint64_t va; >>> >>> u_suballocator_alloc(sctx->ce_suballocator, size, >>> -sctx->screen->b.info.tcc_cache_line_size, >>> -out_offset, (struct pipe_resource**)out_buf); >>> +si_optimal_tcc_alignment(sctx, size), >>> +(unsigned*)out_offset, >> >> >> The extra cast of out_offset
Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle wrote: > On 17.05.2017 21:38, Marek Olšák wrote: >> >> From: Marek Olšák >> >> This decreases the size of CE RAM dumps to L2, or the size of descriptor >> uploads without CE. >> --- >> src/gallium/drivers/radeonsi/si_compute.c | 28 ++-- >> src/gallium/drivers/radeonsi/si_descriptors.c | 85 >> - >> src/gallium/drivers/radeonsi/si_state.h | 18 +- >> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ >> 4 files changed, 113 insertions(+), 24 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >> b/src/gallium/drivers/radeonsi/si_compute.c >> index 22ef111..4c98066 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute.c >> +++ b/src/gallium/drivers/radeonsi/si_compute.c >> @@ -201,21 +201,38 @@ static void *si_create_compute_state( >> return NULL; >> } >> } >> >> return program; >> } >> >> static void si_bind_compute_state(struct pipe_context *ctx, void *state) >> { >> struct si_context *sctx = (struct si_context*)ctx; >> - sctx->cs_shader_state.program = (struct si_compute*)state; >> + struct si_compute *program = (struct si_compute*)state; >> + >> + sctx->cs_shader_state.program = program; >> + if (!program) >> + return; >> + >> + /* Wait because we need active slot usage masks. 
*/ >> + if (program->ir_type == PIPE_SHADER_IR_TGSI) >> + util_queue_fence_wait(&program->ready); >> + >> + si_set_active_descriptors(sctx, >> + SI_DESCS_FIRST_COMPUTE + >> + >> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, >> + >> program->active_const_and_shader_buffers); >> + si_set_active_descriptors(sctx, >> + SI_DESCS_FIRST_COMPUTE + >> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, >> + program->active_samplers_and_images); >> } >> >> static void si_set_global_binding( >> struct pipe_context *ctx, unsigned first, unsigned n, >> struct pipe_resource **resources, >> uint32_t **handles) >> { >> unsigned i; >> struct si_context *sctx = (struct si_context*)ctx; >> struct si_compute *program = sctx->cs_shader_state.program; >> @@ -749,26 +766,23 @@ static void si_launch_grid( >> bool cs_regalloc_hang = >> (sctx->b.chip_class == SI || >> sctx->b.family == CHIP_BONAIRE || >> sctx->b.family == CHIP_KABINI) && >> info->block[0] * info->block[1] * info->block[2] > 256; >> >> if (cs_regalloc_hang) >> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >> SI_CONTEXT_CS_PARTIAL_FLUSH; >> >> - if (program->ir_type == PIPE_SHADER_IR_TGSI) { >> - util_queue_fence_wait(&program->ready); >> - >> - if (program->shader.compilation_failed) >> - return; >> - } >> + if (program->ir_type == PIPE_SHADER_IR_TGSI && >> + program->shader.compilation_failed) >> + return; >> >> si_decompress_compute_textures(sctx); >> >> /* Add buffer sizes for memory checking in need_cs_space. 
*/ >> r600_context_add_resource_size(ctx, &program->shader.bo->b.b); >> /* TODO: add the scratch buffer */ >> >> if (info->indirect) { >> r600_context_add_resource_size(ctx, info->indirect); >> >> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c >> b/src/gallium/drivers/radeonsi/si_descriptors.c >> index 38e4ae1..a2f40a8 100644 >> --- a/src/gallium/drivers/radeonsi/si_descriptors.c >> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c >> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct >> si_descriptors *desc, >> } >> } >> >> static void si_release_descriptors(struct si_descriptors *desc) >> { >> r600_resource_reference(&desc->buffer, NULL); >> FREE(desc->list); >> } >> >> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, >> unsigned size, >> -unsigned *out_offset, struct r600_resource >> **out_buf) { >> +unsigned *out_offset, struct r600_resource >> **out_buf) >> +{ >> uint64_t va; >> >> u_suballocator_alloc(sctx->ce_suballocator, size, >> -sctx->screen->b.info.tcc_cache_line_size, >> -out_offset, (struct pipe_resource**)out_buf); >> +si_optimal_tcc_alignment(sctx, size), >> +(unsigned*)out_offset, > > > The extra cast of out_offset is unnecessary. > > >> +(struct pipe_resource**)out_buf); >> if (!out_buf) >> return false; >> >> va = (*out_buf)->gpu_address +
Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
On 17.05.2017 21:38, Marek Olšák wrote: From: Marek Olšák This decreases the size of CE RAM dumps to L2, or the size of descriptor uploads without CE. --- src/gallium/drivers/radeonsi/si_compute.c | 28 ++-- src/gallium/drivers/radeonsi/si_descriptors.c | 85 - src/gallium/drivers/radeonsi/si_state.h | 18 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ 4 files changed, 113 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22ef111..4c98066 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -201,21 +201,38 @@ static void *si_create_compute_state( return NULL; } } return program; } static void si_bind_compute_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context*)ctx; - sctx->cs_shader_state.program = (struct si_compute*)state; + struct si_compute *program = (struct si_compute*)state; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. 
*/ + if (program->ir_type == PIPE_SHADER_IR_TGSI) + util_queue_fence_wait(&program->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + program->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + program->active_samplers_and_images); } static void si_set_global_binding( struct pipe_context *ctx, unsigned first, unsigned n, struct pipe_resource **resources, uint32_t **handles) { unsigned i; struct si_context *sctx = (struct si_context*)ctx; struct si_compute *program = sctx->cs_shader_state.program; @@ -749,26 +766,23 @@ static void si_launch_grid( bool cs_regalloc_hang = (sctx->b.chip_class == SI || sctx->b.family == CHIP_BONAIRE || sctx->b.family == CHIP_KABINI) && info->block[0] * info->block[1] * info->block[2] > 256; if (cs_regalloc_hang) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - util_queue_fence_wait(&program->ready); - - if (program->shader.compilation_failed) - return; - } + if (program->ir_type == PIPE_SHADER_IR_TGSI && + program->shader.compilation_failed) + return; si_decompress_compute_textures(sctx); /* Add buffer sizes for memory checking in need_cs_space. 
*/ r600_context_add_resource_size(ctx, &program->shader.bo->b.b); /* TODO: add the scratch buffer */ if (info->indirect) { r600_context_add_resource_size(ctx, info->indirect); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 38e4ae1..a2f40a8 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc, } } static void si_release_descriptors(struct si_descriptors *desc) { r600_resource_reference(&desc->buffer, NULL); FREE(desc->list); } static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, -unsigned *out_offset, struct r600_resource **out_buf) { +unsigned *out_offset, struct r600_resource **out_buf) +{ uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, -sctx->screen->b.info.tcc_cache_line_size, -out_offset, (struct pipe_resource**)out_buf); +si_optimal_tcc_alignment(sctx, size), +(unsigned*)out_offset, The extra cast of out_offset is unnecessary. +(struct pipe_resource**)out_buf); if (!out_buf) return false; va = (*out_buf)->gpu_address + *out_offset; radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); radeon_emit(sctx->ce_ib, ce_offset); radeon_emit(sctx->ce_ib, size / 4); radeon_emit(sctx->ce_ib, va); radeon_emit(sctx->ce_ib, va >> 32); @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib) radeon_emit(ib,
[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
From: Marek Olšák This decreases the size of CE RAM dumps to L2, or the size of descriptor uploads without CE. --- src/gallium/drivers/radeonsi/si_compute.c | 28 ++-- src/gallium/drivers/radeonsi/si_descriptors.c | 85 - src/gallium/drivers/radeonsi/si_state.h | 18 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ 4 files changed, 113 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22ef111..4c98066 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -201,21 +201,38 @@ static void *si_create_compute_state( return NULL; } } return program; } static void si_bind_compute_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context*)ctx; - sctx->cs_shader_state.program = (struct si_compute*)state; + struct si_compute *program = (struct si_compute*)state; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. 
*/ + if (program->ir_type == PIPE_SHADER_IR_TGSI) + util_queue_fence_wait(&program->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + program->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + program->active_samplers_and_images); } static void si_set_global_binding( struct pipe_context *ctx, unsigned first, unsigned n, struct pipe_resource **resources, uint32_t **handles) { unsigned i; struct si_context *sctx = (struct si_context*)ctx; struct si_compute *program = sctx->cs_shader_state.program; @@ -749,26 +766,23 @@ static void si_launch_grid( bool cs_regalloc_hang = (sctx->b.chip_class == SI || sctx->b.family == CHIP_BONAIRE || sctx->b.family == CHIP_KABINI) && info->block[0] * info->block[1] * info->block[2] > 256; if (cs_regalloc_hang) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - util_queue_fence_wait(&program->ready); - - if (program->shader.compilation_failed) - return; - } + if (program->ir_type == PIPE_SHADER_IR_TGSI && + program->shader.compilation_failed) + return; si_decompress_compute_textures(sctx); /* Add buffer sizes for memory checking in need_cs_space. 
*/ r600_context_add_resource_size(ctx, &program->shader.bo->b.b); /* TODO: add the scratch buffer */ if (info->indirect) { r600_context_add_resource_size(ctx, info->indirect); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 38e4ae1..a2f40a8 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc, } } static void si_release_descriptors(struct si_descriptors *desc) { r600_resource_reference(&desc->buffer, NULL); FREE(desc->list); } static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, -unsigned *out_offset, struct r600_resource **out_buf) { +unsigned *out_offset, struct r600_resource **out_buf) +{ uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, -sctx->screen->b.info.tcc_cache_line_size, -out_offset, (struct pipe_resource**)out_buf); +si_optimal_tcc_alignment(sctx, size), +(unsigned*)out_offset, +(struct pipe_resource**)out_buf); if (!out_buf) return false; va = (*out_buf)->gpu_address + *out_offset; radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); radeon_emit(sctx->ce_ib, ce_offset); radeon_emit(sctx->ce_ib, size / 4); radeon_emit(sctx->ce_ib, va); radeon_emit(sctx->ce_ib, va >> 32); @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib) radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |