No, the correct backport is attached. Marek
On Tue, Apr 23, 2019 at 2:51 PM Dylan Baker <dy...@pnwbakers.com> wrote: > Hi Marek and Samuel, > > I've staged this for 19.0, but I had to fix some very minor rebase > conflicts. > I'm getting ready to make a release, could one of you take a peak at the > tip of > the staging/19.0 branch and let me know if what I did looks okay? > > Thanks, > Dylan > > Quoting Samuel Pitoiset (2019-04-16 08:24:01) > > I don't have much context for that issue, so: > > > > Acked-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> > > > > On 4/12/19 10:15 PM, Marek Ol\u0161 k wrote: > > > > Done locally. > > > > Marek > > > > On Fri, Apr 12, 2019 at 12:20 PM Samuel Pitoiset < > samuel.pitoi...@gmail.com > > > wrote: > > > > I would suggest to document that workaround somewhere in the > code. > > > > On 4/12/19 5:17 PM, Marek Ol\u0161 k wrote: > > > From: Marek Ol\u0161 k <marek.ol...@amd.com> > > > > > > This is a workaround for a thread deadlock that I have no idea > > > why it occurs. > > > > > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879 > > > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e > > > --- > > > src/gallium/drivers/radeonsi/si_clear.c | 6 +++--- > > > src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++--- > > > src/gallium/drivers/radeonsi/si_pipe.c | 2 +- > > > src/gallium/drivers/radeonsi/si_pipe.h | 3 ++- > > > src/gallium/drivers/radeonsi/si_test_dma.c | 2 +- > > > 5 files changed, 12 insertions(+), 9 deletions(-) > > > > > > diff --git a/src/gallium/drivers/radeonsi/si_clear.c > b/src/gallium/ > > drivers/radeonsi/si_clear.c > > > index e1805f2a1c9..ead680b857b 100644 > > > --- a/src/gallium/drivers/radeonsi/si_clear.c > > > +++ b/src/gallium/drivers/radeonsi/si_clear.c > > > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context > > *sctx, > > > * would be more efficient than separate > per-layer > > clear operations. > > > */ > > > assert(tex->buffer.b.b.nr_storage_samples <= 2 || > > num_layers == 1); > > > > > > dcc_offset += tex->surface.u.legacy.level > > [level].dcc_offset; > > > clear_size = tex->surface.u.legacy.level > > [level].dcc_fast_clear_size * > > > num_layers; > > > } > > > > > > si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, > > > - &clear_value, 4, SI_COHERENCY_CB_META); > > > + &clear_value, 4, SI_COHERENCY_CB_META, > false); > > > } > > > > > > /* Set the same micro tile mode as the destination of the > last MSAA > > resolve. > > > * This allows hitting the MSAA resolve fast path, which > requires > > that both > > > * src and dst micro tile modes match. > > > */ > > > static void si_set_optimal_micro_tile_mode(struct si_screen > > *sscreen, > > > struct si_texture > *tex) > > > { > > > if (tex->buffer.b.is_shared || > > > @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct > > si_context *sctx, > > > > > > /* DCC fast clear with MSAA should clear > CMASK > > to 0xC. */ > > > if (tex->buffer.b.b.nr_samples >= 2 && > tex-> > > cmask_buffer) { > > > /* TODO: This doesn't work with > MSAA. * > > / > > > if (eliminate_needed) > > > continue; > > > > > > uint32_t clear_value = > 0xCCCCCCCC; > > > si_clear_buffer(sctx, &tex-> > > cmask_buffer->b.b, > > > > tex->cmask_offset, > > tex->surface.cmask_size, > > > - &clear_value, 4, > > SI_COHERENCY_CB_META); > > > + &clear_value, 4, > > SI_COHERENCY_CB_META, false); > > > fmask_decompress_needed = true; > > > } > > > > > > vi_dcc_clear_level(sctx, tex, 0, > reset_value); > > > tex->separate_dcc_dirty = true; > > > } else { > > > if (too_small) > > > continue; > > > > > > /* 128-bit formats are unusupported */ > > > @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct > > si_context *sctx, > > > > > > /* ensure CMASK is enabled */ > > > si_alloc_separate_cmask(sctx->screen, > tex); > > > if (!tex->cmask_buffer) > > > continue; > > > > > > /* Do the fast clear. */ > > > uint32_t clear_value = 0; > > > si_clear_buffer(sctx, > &tex->cmask_buffer->b.b, > > > tex->cmask_offset, tex-> > > surface.cmask_size, > > > - &clear_value, 4, > > SI_COHERENCY_CB_META); > > > + &clear_value, 4, > > SI_COHERENCY_CB_META, false); > > > eliminate_needed = true; > > > } > > > > > > if ((eliminate_needed || > fmask_decompress_needed) && > > > !(tex->dirty_level_mask & (1 << level))) { > > > tex->dirty_level_mask |= 1 << level; > > > p_atomic_inc(&sctx->screen-> > > compressed_colortex_counter); > > > } > > > > > > /* We can change the micro tile mode before a > full > > clear. */ > > > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c > b/src/ > > gallium/drivers/radeonsi/si_compute_blit.c > > > index 1abeac6adb0..fb0d8d2f1b6 100644 > > > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c > > > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c > > > @@ -179,21 +179,22 @@ static void > si_compute_do_clear_or_copy(struct > > si_context *sctx, > > > > > > /* Restore states. */ > > > ctx->bind_compute_state(ctx, saved_cs); > > > ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src > ? 2 : > > 1, saved_sb, > > > saved_writable_mask); > > > si_compute_internal_end(sctx); > > > } > > > > > > void si_clear_buffer(struct si_context *sctx, struct > pipe_resource > > *dst, > > > uint64_t offset, uint64_t size, uint32_t > > *clear_value, > > > - uint32_t clear_value_size, enum si_coherency > > coher) > > > + uint32_t clear_value_size, enum si_coherency > > coher, > > > + bool force_cpdma) > > > { > > > if (!size) > > > return; > > > > > > unsigned clear_alignment = MIN2(clear_value_size, 4); > > > > > > assert(clear_value_size != 3 && clear_value_size != 6); > /* 12 > > is allowed. */ > > > assert(offset % clear_alignment == 0); > > > assert(size % clear_alignment == 0); > > > assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit > sizes in > > all codepaths */ > > > @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context > *sctx, > > struct pipe_resource *dst, > > > return; > > > } > > > > > > uint64_t aligned_size = size & ~3ull; > > > if (aligned_size >= 4) { > > > /* Before GFX9, CP DMA was very slow when > clearing GTT, > > so never > > > * use CP DMA clears on those chips, because we > can't > > be certain > > > * about buffer placements. > > > */ > > > if (clear_value_size > 4 || > > > - (clear_value_size == 4 && > > > + (!force_cpdma && > > > + clear_value_size == 4 && > > > offset % 4 == 0 && > > > (size > 32*1024 || sctx->chip_class <= > VI))) { > > > si_compute_do_clear_or_copy(sctx, dst, > offset, > > NULL, 0, > > > aligned_size, > > clear_value, > > > > clear_value_size, > > coher); > > > } else { > > > assert(clear_value_size == 4); > > > si_cp_dma_clear_buffer(sctx, > sctx->gfx_cs, dst, > > offset, > > > aligned_size, > > *clear_value, 0, coher, > > > > get_cache_policy(sctx, > > coher, size)); > > > @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context > *sctx, > > struct pipe_resource *dst, > > > } > > > } > > > > > > static void si_pipe_clear_buffer(struct pipe_context *ctx, > > > struct pipe_resource *dst, > > > unsigned offset, unsigned size, > > > const void *clear_value, > > > int clear_value_size) > > > { > > > si_clear_buffer((struct si_context*)ctx, dst, offset, > size, > > (uint32_t*)clear_value, > > > - clear_value_size, SI_COHERENCY_SHADER); > > > + clear_value_size, SI_COHERENCY_SHADER, > false); > > > } > > > > > > void si_copy_buffer(struct si_context *sctx, > > > struct pipe_resource *dst, struct > pipe_resource > > *src, > > > uint64_t dst_offset, uint64_t src_offset, > unsigned > > size) > > > { > > > if (!size) > > > return; > > > > > > enum si_coherency coher = SI_COHERENCY_SHADER; > > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/ > > drivers/radeonsi/si_pipe.c > > > index 5caeb575623..5d376e6181a 100644 > > > --- a/src/gallium/drivers/radeonsi/si_pipe.c > > > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > > > @@ -634,21 +634,21 @@ static struct pipe_context > *si_create_context > > (struct pipe_screen *screen, > > > sizeof(sctx->sample_positions), &sctx-> > > sample_positions); > > > > > > /* this must be last */ > > > si_begin_new_gfx_cs(sctx); > > > > > > if (sctx->chip_class == CIK) { > > > /* Clear the NULL constant buffer, because loads > should > > return zeros. */ > > > uint32_t clear_value = 0; > > > si_clear_buffer(sctx, > sctx->null_const_buf.buffer, 0, > > > > sctx->null_const_buf.buffer->width0, > > > - &clear_value, 4, > SI_COHERENCY_SHADER); > > > + &clear_value, 4, > SI_COHERENCY_SHADER, > > true); > > > } > > > return &sctx->b; > > > fail: > > > fprintf(stderr, "radeonsi: Failed to create a > context.\n"); > > > si_destroy_context(&sctx->b); > > > return NULL; > > > } > > > > > > static struct pipe_context *si_pipe_create_context(struct > > pipe_screen *screen, > > > void *priv, > unsigned > > flags) > > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/ > > drivers/radeonsi/si_pipe.h > > > index 301d38649bf..aaa95f32d20 100644 > > > --- a/src/gallium/drivers/radeonsi/si_pipe.h > > > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > > > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum > pipe_format > > format); > > > void vi_dcc_clear_level(struct si_context *sctx, > > > struct si_texture *tex, > > > unsigned level, unsigned clear_value); > > > void si_init_clear_functions(struct si_context *sctx); > > > > > > /* si_compute_blit.c */ > > > unsigned si_get_flush_flags(struct si_context *sctx, enum > > si_coherency coher, > > > enum si_cache_policy cache_policy); > > > void si_clear_buffer(struct si_context *sctx, struct > pipe_resource > > *dst, > > > uint64_t offset, uint64_t size, uint32_t > > *clear_value, > > > - uint32_t clear_value_size, enum si_coherency > > coher); > > > + uint32_t clear_value_size, enum si_coherency > > coher, > > > + bool force_cpdma); > > > void si_copy_buffer(struct si_context *sctx, > > > struct pipe_resource *dst, struct > pipe_resource > > *src, > > > uint64_t dst_offset, uint64_t src_offset, > unsigned > > size); > > > void si_compute_copy_image(struct si_context *sctx, > > > struct pipe_resource *dst, > > > unsigned dst_level, > > > struct pipe_resource *src, > > > unsigned src_level, > > > unsigned dstx, unsigned dsty, > unsigned dstz, > > > const struct pipe_box *src_box); > > > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c > b/src/gallium > > /drivers/radeonsi/si_test_dma.c > > > index 90a2032cd80..7e396e671be 100644 > > > --- a/src/gallium/drivers/radeonsi/si_test_dma.c > > > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c > > > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen > *sscreen) > > > tsrc.width0, tsrc.height0, > tsrc.array_size, > > > array_mode_to_string(sscreen, > &ssrc->surface), > > bpp); > > > fflush(stdout); > > > > > > /* set src pixels */ > > > set_random_pixels(ctx, src, &src_cpu); > > > > > > /* clear dst pixels */ > > > uint32_t zero = 0; > > > si_clear_buffer(sctx, dst, 0, > sdst->surface.surf_size, > > &zero, 4, > > > - SI_COHERENCY_SHADER); > > > + SI_COHERENCY_SHADER, false); > > > memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * > > tdst.array_size); > > > > > > /* preparation */ > > > max_width = MIN2(tsrc.width0, tdst.width0); > > > max_height = MIN2(tsrc.height0, tdst.height0); > > > max_depth = MIN2(tsrc.array_size, > tdst.array_size); > > > > > > num = do_partial_copies ? num_partial_copies : 1; > > > for (j = 0; j < num; j++) { > > > int width, height, depth; > > >
From 579072d03ab79bd46a19e0ae10f02211df76ccff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.ol...@amd.com> Date: Fri, 12 Apr 2019 11:12:34 -0400 Subject: [PATCH] radeonsi: use CP DMA for the null const buffer clear on CIK This is a workaround for a thread deadlock that I have no idea why it occurs. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879 Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e Acked-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> (cherry picked from commit b58e5fb6f317be771326f98d498483e45942beaf) --- src/gallium/drivers/radeonsi/si_clear.c | 6 +++--- src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++--- src/gallium/drivers/radeonsi/si_pipe.c | 7 +++++-- src/gallium/drivers/radeonsi/si_pipe.h | 3 ++- src/gallium/drivers/radeonsi/si_test_dma.c | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 9026f61dc0a..ff98452a5bc 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -272,7 +272,7 @@ void vi_dcc_clear_level(struct si_context *sctx, } si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); } /* Set the same micro tile mode as the destination of the last MSAA resolve. @@ -505,7 +505,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0xCCCCCCCC; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); fmask_decompress_needed = true; } @@ -533,7 +533,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); eliminate_needed = true; } diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 38c48c30be9..304296c4a52 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -177,7 +177,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher) + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma) { if (!size) return; @@ -241,7 +242,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, * about buffer placements. */ if (clear_value_size > 4 || - (clear_value_size == 4 && + (!force_cpdma && + clear_value_size == 4 && offset % 4 == 0 && (size > 32*1024 || sctx->chip_class <= VI))) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, @@ -282,7 +284,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, coher = SI_COHERENCY_SHADER; si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value, - clear_value_size, coher); + clear_value_size, coher, false); } void si_copy_buffer(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 2656bdc2068..5312ff99e62 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -609,11 +609,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_begin_new_gfx_cs(sctx); if (sctx->chip_class == CIK) { - /* Clear the NULL constant buffer, because loads should return zeros. */ + /* Clear the NULL constant buffer, because loads should return zeros. + * Note that this forces CP DMA to be used, because clover deadlocks + * for some reason when the compute codepath is used. + */ uint32_t clear_value = 0; si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, - &clear_value, 4, SI_COHERENCY_SHADER); + &clear_value, 4, SI_COHERENCY_SHADER, true); } return &sctx->b; fail: diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index eb3ba951dae..d63943c19ed 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1168,7 +1168,8 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher); + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size); diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index 90a2032cd80..7e396e671be 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -309,7 +309,7 @@ void si_test_dma(struct si_screen *sscreen) /* clear dst pixels */ uint32_t zero = 0; si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, - SI_COHERENCY_SHADER); + SI_COHERENCY_SHADER, false); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ -- 2.17.1
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev