Reviewed-by: Marek Olšák <marek.ol...@amd.com> Marek
On Thu, Nov 22, 2018 at 6:32 AM Nicolai Hähnle <nhaeh...@gmail.com> wrote: > From: Nicolai Hähnle <nicolai.haeh...@amd.com> > > Introduce a new driver-private transfer flag RADEON_TRANSFER_TEMPORARY > that specifies whether the caller will use buffer_unmap or not. The > default behavior is set to permanent maps, because that's what drivers > do for Gallium buffer maps. > > This should eliminate the need for hacks in libdrm. Assertions are added > to catch when the buffer_unmap calls don't match the (temporary) > buffer_map calls. > > I did my best to update r600 for consistency (r300 needs no changes > because it never calls buffer_unmap), even though the radeon winsys > ignores the new flag. > > As an added bonus, this should actually improve the performance of > the normal fast path, because we no longer call into libdrm at all > after the first map, and there's one less atomic in the winsys itself > (there are now no atomics left in the UNSYNCHRONIZED fast path). > > Cc: Leo Liu <leo....@amd.com> > v2: > - remove comment about visible VRAM (Marek) > - don't rely on amdgpu_bo_cpu_map doing an atomic write > --- > src/gallium/drivers/r600/evergreen_compute.c | 4 +- > src/gallium/drivers/r600/r600_asm.c | 4 +- > src/gallium/drivers/r600/r600_shader.c | 4 +- > src/gallium/drivers/r600/radeon_uvd.c | 8 +- > src/gallium/drivers/r600/radeon_vce.c | 4 +- > src/gallium/drivers/r600/radeon_video.c | 6 +- > src/gallium/drivers/radeon/radeon_uvd.c | 10 +- > src/gallium/drivers/radeon/radeon_uvd_enc.c | 6 +- > src/gallium/drivers/radeon/radeon_vce.c | 4 +- > src/gallium/drivers/radeon/radeon_vcn_dec.c | 18 ++-- > src/gallium/drivers/radeon/radeon_vcn_enc.c | 4 +- > src/gallium/drivers/radeon/radeon_video.c | 6 +- > src/gallium/drivers/radeon/radeon_winsys.h | 14 ++- > src/gallium/drivers/radeonsi/si_shader.c | 3 +- > src/gallium/include/pipe/p_defines.h | 8 +- > src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 96 +++++++++++++------- > src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 3 +- > 17 files changed, 140 insertions(+), 62 deletions(-) > > diff --git a/src/gallium/drivers/r600/evergreen_compute.c > b/src/gallium/drivers/r600/evergreen_compute.c > index a77f58242e3..9085be4e2f3 100644 > --- a/src/gallium/drivers/r600/evergreen_compute.c > +++ b/src/gallium/drivers/r600/evergreen_compute.c > @@ -431,21 +431,23 @@ static void *evergreen_create_compute_state(struct > pipe_context *ctx, > COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n"); > header = cso->prog; > code = cso->prog + sizeof(struct pipe_llvm_program_header); > radeon_shader_binary_init(&shader->binary); > r600_elf_read(code, header->num_bytes, &shader->binary); > r600_create_shader(&shader->bc, &shader->binary, &use_kill); > > /* Upload code + ROdata */ > shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen, > shader->bc.ndw * > 4); > - p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, > PIPE_TRANSFER_WRITE); > + p = r600_buffer_map_sync_with_rings( > + &rctx->b, shader->code_bo, > + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); > //TODO: use util_memcpy_cpu_to_le32 ? > memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4); > rctx->b.ws->buffer_unmap(shader->code_bo->buf); > #endif > > return shader; > } > > static void evergreen_delete_compute_state(struct pipe_context *ctx, void > *state) > { > diff --git a/src/gallium/drivers/r600/r600_asm.c > b/src/gallium/drivers/r600/r600_asm.c > index 7029be24f4b..4ba77c535f9 100644 > --- a/src/gallium/drivers/r600/r600_asm.c > +++ b/src/gallium/drivers/r600/r600_asm.c > @@ -2765,21 +2765,23 @@ void *r600_create_vertex_fetch_shader(struct > pipe_context *ctx, > > u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256, > &shader->offset, > (struct pipe_resource**)&shader->buffer); > if (!shader->buffer) { > r600_bytecode_clear(&bc); > FREE(shader); > return NULL; > } > > - bytecode = r600_buffer_map_sync_with_rings(&rctx->b, > shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); > + bytecode = r600_buffer_map_sync_with_rings > + (&rctx->b, shader->buffer, > + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | > RADEON_TRANSFER_TEMPORARY); > bytecode += shader->offset / 4; > > if (R600_BIG_ENDIAN) { > for (i = 0; i < fs_size / 4; ++i) { > bytecode[i] = util_cpu_to_le32(bc.bytecode[i]); > } > } else { > memcpy(bytecode, bc.bytecode, fs_size); > } > rctx->b.ws->buffer_unmap(shader->buffer->buf); > diff --git a/src/gallium/drivers/r600/r600_shader.c > b/src/gallium/drivers/r600/r600_shader.c > index 408939d1105..fc826470d69 100644 > --- a/src/gallium/drivers/r600/r600_shader.c > +++ b/src/gallium/drivers/r600/r600_shader.c > @@ -134,21 +134,23 @@ static int store_shader(struct pipe_context *ctx, > { > struct r600_context *rctx = (struct r600_context *)ctx; > uint32_t *ptr, i; > > if (shader->bo == NULL) { > shader->bo = (struct r600_resource*) > pipe_buffer_create(ctx->screen, 0, > PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); > if (shader->bo == NULL) { > return -ENOMEM; > } > - ptr = r600_buffer_map_sync_with_rings(&rctx->b, > shader->bo, PIPE_TRANSFER_WRITE); > + ptr = r600_buffer_map_sync_with_rings( > + &rctx->b, shader->bo, > + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); > if (R600_BIG_ENDIAN) { > for (i = 0; i < shader->shader.bc.ndw; ++i) { > ptr[i] = > util_cpu_to_le32(shader->shader.bc.bytecode[i]); > } > } else { > memcpy(ptr, shader->shader.bc.bytecode, > shader->shader.bc.ndw * sizeof(*ptr)); > } > rctx->b.ws->buffer_unmap(shader->bo->buf); > } > > diff --git a/src/gallium/drivers/r600/radeon_uvd.c > b/src/gallium/drivers/r600/radeon_uvd.c > index 495a93dc55a..5568f2138e4 100644 > --- a/src/gallium/drivers/r600/radeon_uvd.c > +++ b/src/gallium/drivers/r600/radeon_uvd.c > @@ -145,21 +145,22 @@ static bool have_it(struct ruvd_decoder *dec) > /* map the next available message/feedback/itscaling buffer */ > static void map_msg_fb_it_buf(struct ruvd_decoder *dec) > { > struct rvid_buffer* buf; > uint8_t *ptr; > > /* grab the current message/feedback buffer */ > buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; > > /* and map it for CPU access */ > - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > PIPE_TRANSFER_WRITE); > + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > > /* calc buffer offsets */ > dec->msg = (struct ruvd_msg *)ptr; > memset(dec->msg, 0, sizeof(*dec->msg)); > > dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); > if (have_it(dec)) > dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + > dec->fb_size); > } > > @@ -1061,21 +1062,21 @@ static void ruvd_begin_frame(struct > pipe_video_codec *decoder, > > assert(decoder); > > frame = ++dec->frame_number; > vl_video_buffer_set_associated_data(target, decoder, (void *)frame, > &ruvd_destroy_associated_data); > > dec->bs_size = 0; > dec->bs_ptr = dec->ws->buffer_map( > dec->bs_buffers[dec->cur_buffer].res->buf, > - dec->cs, PIPE_TRANSFER_WRITE); > + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); > } > > /** > * decode a macroblock > */ > static void ruvd_decode_macroblock(struct pipe_video_codec *decoder, > struct pipe_video_buffer *target, > struct pipe_picture_desc *picture, > const struct pipe_macroblock > *macroblocks, > unsigned num_macroblocks) > @@ -1114,21 +1115,22 @@ static void ruvd_decode_bitstream(struct > pipe_video_codec *decoder, > new_size += 2; /* save for EOI */ > > if (new_size > buf->res->buf->size) { > dec->ws->buffer_unmap(buf->res->buf); > if (!rvid_resize_buffer(dec->screen, dec->cs, buf, > new_size)) { > RVID_ERR("Can't resize bitstream buffer!"); > return; > } > > dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, > dec->cs, > - > PIPE_TRANSFER_WRITE); > + > PIPE_TRANSFER_WRITE | > + > RADEON_TRANSFER_TEMPORARY); > if (!dec->bs_ptr) > return; > > dec->bs_ptr += dec->bs_size; > } > > memcpy(dec->bs_ptr, buffers[i], sizes[i]); > dec->bs_size += sizes[i]; > dec->bs_ptr += sizes[i]; > } > diff --git a/src/gallium/drivers/r600/radeon_vce.c > b/src/gallium/drivers/r600/radeon_vce.c > index 60ba12a593a..e38b927b1d4 100644 > --- a/src/gallium/drivers/r600/radeon_vce.c > +++ b/src/gallium/drivers/r600/radeon_vce.c > @@ -346,21 +346,23 @@ static void rvce_end_frame(struct pipe_video_codec > *encoder, > } > } > > static void rvce_get_feedback(struct pipe_video_codec *encoder, > void *feedback, unsigned *size) > { > struct rvce_encoder *enc = (struct rvce_encoder*)encoder; > struct rvid_buffer *fb = feedback; > > if (size) { > - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, > PIPE_TRANSFER_READ_WRITE); > + uint32_t *ptr = enc->ws->buffer_map( > + fb->res->buf, enc->cs, > + PIPE_TRANSFER_READ_WRITE | > RADEON_TRANSFER_TEMPORARY); > > if (ptr[1]) { > *size = ptr[4] - ptr[9]; > } else { > *size = 0; > } > > enc->ws->buffer_unmap(fb->res->buf); > } > //dump_feedback(enc, fb); > diff --git a/src/gallium/drivers/r600/radeon_video.c > b/src/gallium/drivers/r600/radeon_video.c > index 02fcf77d4ff..8e0af448be5 100644 > --- a/src/gallium/drivers/r600/radeon_video.c > +++ b/src/gallium/drivers/r600/radeon_video.c > @@ -90,25 +90,27 @@ bool rvid_resize_buffer(struct pipe_screen *screen, > struct radeon_cmdbuf *cs, > { > struct r600_common_screen *rscreen = (struct r600_common_screen > *)screen; > struct radeon_winsys* ws = rscreen->ws; > unsigned bytes = MIN2(new_buf->res->buf->size, new_size); > struct rvid_buffer old_buf = *new_buf; > void *src = NULL, *dst = NULL; > > if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) > goto error; > > - src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); > + src = ws->buffer_map(old_buf.res->buf, cs, > + PIPE_TRANSFER_READ | > RADEON_TRANSFER_TEMPORARY); > if (!src) > goto error; > > - dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); > + dst = ws->buffer_map(new_buf->res->buf, cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > if (!dst) > goto error; > > memcpy(dst, src, bytes); > if (new_size > bytes) { > new_size -= bytes; > dst += bytes; > memset(dst, 0, new_size); > } > ws->buffer_unmap(new_buf->res->buf); > diff --git a/src/gallium/drivers/radeon/radeon_uvd.c > b/src/gallium/drivers/radeon/radeon_uvd.c > index 62af1a311c2..ca066e89823 100644 > --- a/src/gallium/drivers/radeon/radeon_uvd.c > +++ b/src/gallium/drivers/radeon/radeon_uvd.c > @@ -141,21 +141,22 @@ static bool have_it(struct ruvd_decoder *dec) > /* map the next available message/feedback/itscaling buffer */ > static void map_msg_fb_it_buf(struct ruvd_decoder *dec) > { > struct rvid_buffer* buf; > uint8_t *ptr; > > /* grab the current message/feedback buffer */ > buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; > > /* and map it for CPU access */ > - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > PIPE_TRANSFER_WRITE); > + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > > /* calc buffer offsets */ > dec->msg = (struct ruvd_msg *)ptr; > memset(dec->msg, 0, sizeof(*dec->msg)); > > dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); > if (have_it(dec)) > dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + > dec->fb_size); > } > > @@ -1008,21 +1009,21 @@ static void ruvd_begin_frame(struct > pipe_video_codec *decoder, > > assert(decoder); > > frame = ++dec->frame_number; > vl_video_buffer_set_associated_data(target, decoder, (void *)frame, > &ruvd_destroy_associated_data); > > dec->bs_size = 0; > dec->bs_ptr = dec->ws->buffer_map( > dec->bs_buffers[dec->cur_buffer].res->buf, > - dec->cs, PIPE_TRANSFER_WRITE); > + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); > } > > /** > * decode a macroblock > */ > static void ruvd_decode_macroblock(struct pipe_video_codec *decoder, > struct pipe_video_buffer *target, > struct pipe_picture_desc *picture, > const struct pipe_macroblock > *macroblocks, > unsigned num_macroblocks) > @@ -1053,22 +1054,23 @@ static void ruvd_decode_bitstream(struct > pipe_video_codec *decoder, > struct rvid_buffer *buf = > &dec->bs_buffers[dec->cur_buffer]; > unsigned new_size = dec->bs_size + sizes[i]; > > if (new_size > buf->res->buf->size) { > dec->ws->buffer_unmap(buf->res->buf); > if (!si_vid_resize_buffer(dec->screen, dec->cs, > buf, new_size)) { > RVID_ERR("Can't resize bitstream buffer!"); > return; > } > > - dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, > dec->cs, > - > PIPE_TRANSFER_WRITE); > + dec->bs_ptr = dec->ws->buffer_map( > + buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > if (!dec->bs_ptr) > return; > > dec->bs_ptr += dec->bs_size; > } > > memcpy(dec->bs_ptr, buffers[i], sizes[i]); > dec->bs_size += sizes[i]; > dec->bs_ptr += sizes[i]; > } > diff --git a/src/gallium/drivers/radeon/radeon_uvd_enc.c > b/src/gallium/drivers/radeon/radeon_uvd_enc.c > index 4384e5e1646..3164dbb2c20 100644 > --- a/src/gallium/drivers/radeon/radeon_uvd_enc.c > +++ b/src/gallium/drivers/radeon/radeon_uvd_enc.c > @@ -256,23 +256,23 @@ radeon_uvd_enc_destroy(struct pipe_video_codec > *encoder) > > static void > radeon_uvd_enc_get_feedback(struct pipe_video_codec *encoder, > void *feedback, unsigned *size) > { > struct radeon_uvd_encoder *enc = (struct radeon_uvd_encoder *) encoder; > struct rvid_buffer *fb = feedback; > > if (NULL != size) { > radeon_uvd_enc_feedback_t *fb_data = > - (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map(fb->res->buf, > - enc->cs, > - > PIPE_TRANSFER_READ_WRITE); > + (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map( > + fb->res->buf, enc->cs, > + PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY); > > if (!fb_data->status) > *size = fb_data->bitstream_size; > else > *size = 0; > enc->ws->buffer_unmap(fb->res->buf); > } > > si_vid_destroy_buffer(fb); > FREE(fb); > diff --git a/src/gallium/drivers/radeon/radeon_vce.c > b/src/gallium/drivers/radeon/radeon_vce.c > index 310d1654b05..94df06e88c6 100644 > --- a/src/gallium/drivers/radeon/radeon_vce.c > +++ b/src/gallium/drivers/radeon/radeon_vce.c > @@ -345,21 +345,23 @@ static void rvce_end_frame(struct pipe_video_codec > *encoder, > } > } > > static void rvce_get_feedback(struct pipe_video_codec *encoder, > void *feedback, unsigned *size) > { > struct rvce_encoder *enc = (struct rvce_encoder*)encoder; > struct rvid_buffer *fb = feedback; > > if (size) { > - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, > PIPE_TRANSFER_READ_WRITE); > + uint32_t *ptr = enc->ws->buffer_map( > + fb->res->buf, enc->cs, > + PIPE_TRANSFER_READ_WRITE | > RADEON_TRANSFER_TEMPORARY); > > if (ptr[1]) { > *size = ptr[4] - ptr[9]; > } else { > *size = 0; > } > > enc->ws->buffer_unmap(fb->res->buf); > } > //dump_feedback(enc, fb); > diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c > b/src/gallium/drivers/radeon/radeon_vcn_dec.c > index 1ee85ae3d3f..e402af21a64 100644 > --- a/src/gallium/drivers/radeon/radeon_vcn_dec.c > +++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c > @@ -934,21 +934,23 @@ static struct pb_buffer > *rvcn_dec_message_decode(struct radeon_decoder *dec, > ctx_size += 8 * 2 * 4096; > > if (dec->base.profile == > PIPE_VIDEO_PROFILE_VP9_PROFILE2) > ctx_size += 8 * 2 * 4096; > > if (!si_vid_create_buffer(dec->screen, &dec->ctx, > ctx_size, PIPE_USAGE_DEFAULT)) > RVID_ERR("Can't allocated context > buffer.\n"); > si_vid_clear_buffer(dec->base.context, &dec->ctx); > > /* ctx needs probs table */ > - ptr = dec->ws->buffer_map(dec->ctx.res->buf, > dec->cs, PIPE_TRANSFER_WRITE); > + ptr = dec->ws->buffer_map( > + dec->ctx.res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > fill_probs_table(ptr); > dec->ws->buffer_unmap(dec->ctx.res->buf); > } > break; > } > default: > assert(0); > return NULL; > } > > @@ -1027,21 +1029,22 @@ static bool have_probs(struct radeon_decoder *dec) > /* map the next available message/feedback/itscaling buffer */ > static void map_msg_fb_it_probs_buf(struct radeon_decoder *dec) > { > struct rvid_buffer* buf; > uint8_t *ptr; > > /* grab the current message/feedback buffer */ > buf = &dec->msg_fb_it_probs_buffers[dec->cur_buffer]; > > /* and map it for CPU access */ > - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > PIPE_TRANSFER_WRITE); > + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > > /* calc buffer offsets */ > dec->msg = ptr; > > dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); > if (have_it(dec)) > dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + > FB_BUFFER_SIZE); > else if (have_probs(dec)) > dec->probs = (uint8_t *)(ptr + FB_BUFFER_OFFSET + > FB_BUFFER_SIZE); > } > @@ -1305,21 +1308,21 @@ static void radeon_dec_begin_frame(struct > pipe_video_codec *decoder, > assert(decoder); > > frame = ++dec->frame_number; > if (dec->stream_type != RDECODE_CODEC_VP9) > vl_video_buffer_set_associated_data(target, decoder, (void > *)frame, > > &radeon_dec_destroy_associated_data); > > dec->bs_size = 0; > dec->bs_ptr = dec->ws->buffer_map( > dec->bs_buffers[dec->cur_buffer].res->buf, > - dec->cs, PIPE_TRANSFER_WRITE); > + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); > } > > /** > * decode a macroblock > */ > static void radeon_dec_decode_macroblock(struct pipe_video_codec *decoder, > struct pipe_video_buffer *target, > struct pipe_picture_desc *picture, > const struct pipe_macroblock > *macroblocks, > unsigned num_macroblocks) > @@ -1350,22 +1353,23 @@ static void radeon_dec_decode_bitstream(struct > pipe_video_codec *decoder, > struct rvid_buffer *buf = > &dec->bs_buffers[dec->cur_buffer]; > unsigned new_size = dec->bs_size + sizes[i]; > > if (new_size > buf->res->buf->size) { > dec->ws->buffer_unmap(buf->res->buf); > if (!si_vid_resize_buffer(dec->screen, dec->cs, > buf, new_size)) { > RVID_ERR("Can't resize bitstream buffer!"); > return; > } > > - dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, > dec->cs, > - > PIPE_TRANSFER_WRITE); > + dec->bs_ptr = dec->ws->buffer_map( > + buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > if (!dec->bs_ptr) > return; > > dec->bs_ptr += dec->bs_size; > } > > memcpy(dec->bs_ptr, buffers[i], sizes[i]); > dec->bs_size += sizes[i]; > dec->bs_ptr += sizes[i]; > } > @@ -1536,21 +1540,23 @@ struct pipe_video_codec > *radeon_create_decoder(struct pipe_context *context, > } > > si_vid_clear_buffer(context, > &dec->msg_fb_it_probs_buffers[i]); > si_vid_clear_buffer(context, &dec->bs_buffers[i]); > > if (have_probs(dec)) { > struct rvid_buffer* buf; > void *ptr; > > buf = &dec->msg_fb_it_probs_buffers[i]; > - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, > PIPE_TRANSFER_WRITE); > + ptr = dec->ws->buffer_map( > + buf->res->buf, dec->cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > ptr += FB_BUFFER_OFFSET + FB_BUFFER_SIZE; > fill_probs_table(ptr); > dec->ws->buffer_unmap(buf->res->buf); > } > } > > dpb_size = calc_dpb_size(dec); > if (dpb_size) { > if (!si_vid_create_buffer(dec->screen, &dec->dpb, > dpb_size, PIPE_USAGE_DEFAULT)) { > RVID_ERR("Can't allocated dpb.\n"); > diff --git a/src/gallium/drivers/radeon/radeon_vcn_enc.c > b/src/gallium/drivers/radeon/radeon_vcn_enc.c > index e8676f6c721..7d64a28a405 100644 > --- a/src/gallium/drivers/radeon/radeon_vcn_enc.c > +++ b/src/gallium/drivers/radeon/radeon_vcn_enc.c > @@ -237,21 +237,23 @@ static void radeon_enc_destroy(struct > pipe_video_codec *encoder) > FREE(enc); > } > > static void radeon_enc_get_feedback(struct pipe_video_codec *encoder, > void *feedback, > unsigned *size) > { > struct radeon_encoder *enc = (struct radeon_encoder*)encoder; > struct rvid_buffer *fb = feedback; > > if (size) { > - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, > PIPE_TRANSFER_READ_WRITE); > + uint32_t *ptr = enc->ws->buffer_map( > + fb->res->buf, enc->cs, > + PIPE_TRANSFER_READ_WRITE | > RADEON_TRANSFER_TEMPORARY); > if (ptr[1]) > *size = ptr[6]; > else > *size = 0; > enc->ws->buffer_unmap(fb->res->buf); > } > > si_vid_destroy_buffer(fb); > FREE(fb); > } > diff --git a/src/gallium/drivers/radeon/radeon_video.c > b/src/gallium/drivers/radeon/radeon_video.c > index a39ce4cc73e..bb1173e8005 100644 > --- a/src/gallium/drivers/radeon/radeon_video.c > +++ b/src/gallium/drivers/radeon/radeon_video.c > @@ -81,25 +81,27 @@ bool si_vid_resize_buffer(struct pipe_screen *screen, > struct radeon_cmdbuf *cs, > { > struct si_screen *sscreen = (struct si_screen *)screen; > struct radeon_winsys* ws = sscreen->ws; > unsigned bytes = MIN2(new_buf->res->buf->size, new_size); > struct rvid_buffer old_buf = *new_buf; > void *src = NULL, *dst = NULL; > > if (!si_vid_create_buffer(screen, new_buf, new_size, > new_buf->usage)) > goto error; > > - src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); > + src = ws->buffer_map(old_buf.res->buf, cs, > + PIPE_TRANSFER_READ | > RADEON_TRANSFER_TEMPORARY); > if (!src) > goto error; > > - dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); > + dst = ws->buffer_map(new_buf->res->buf, cs, > + PIPE_TRANSFER_WRITE | > RADEON_TRANSFER_TEMPORARY); > if (!dst) > goto error; > > memcpy(dst, src, bytes); > if (new_size > bytes) { > new_size -= bytes; > dst += bytes; > memset(dst, 0, new_size); > } > ws->buffer_unmap(new_buf->res->buf); > diff --git a/src/gallium/drivers/radeon/radeon_winsys.h > b/src/gallium/drivers/radeon/radeon_winsys.h > index 49f8bb279e5..a56ff75ad24 100644 > --- a/src/gallium/drivers/radeon/radeon_winsys.h > +++ b/src/gallium/drivers/radeon/radeon_winsys.h > @@ -69,20 +69,29 @@ enum radeon_bo_usage { /* bitfield */ > RADEON_USAGE_READ = 2, > RADEON_USAGE_WRITE = 4, > RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE, > > /* The winsys ensures that the CS submission will be scheduled after > * previously flushed CSs referencing this BO in a conflicting way. > */ > RADEON_USAGE_SYNCHRONIZED = 8 > }; > > +enum radeon_transfer_flags { > + /* Indicates that the caller will unmap the buffer. > + * > + * Not unmapping buffers is an important performance optimization for > + * OpenGL (avoids kernel overhead for frequently mapped buffers). > + */ > + RADEON_TRANSFER_TEMPORARY = (PIPE_TRANSFER_DRV_PRV << 0), > +}; > + > #define RADEON_SPARSE_PAGE_SIZE (64 * 1024) > > enum ring_type { > RING_GFX = 0, > RING_COMPUTE, > RING_DMA, > RING_UVD, > RING_VCE, > RING_UVD_ENC, > RING_VCN_DEC, > @@ -287,23 +296,26 @@ struct radeon_winsys { > struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws, > uint64_t size, > unsigned alignment, > enum radeon_bo_domain domain, > enum radeon_bo_flag flags); > > /** > * Map the entire data store of a buffer object into the client's > address > * space. > * > + * Callers are expected to unmap buffers again if and only if the > + * RADEON_TRANSFER_TEMPORARY flag is set in \p usage. > + * > * \param buf A winsys buffer object to map. > * \param cs A command stream to flush if the buffer is > referenced by it. > - * \param usage A bitmask of the PIPE_TRANSFER_* flags. > + * \param usage A bitmask of the PIPE_TRANSFER_* and > RADEON_TRANSFER_* flags. > * \return The pointer at the beginning of the buffer. > */ > void *(*buffer_map)(struct pb_buffer *buf, > struct radeon_cmdbuf *cs, > enum pipe_transfer_usage usage); > > /** > * Unmap a buffer object from the client's address space. > * > * \param buf A winsys buffer object to unmap. > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index 19522cc97b1..d455fb5db6a 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -5286,21 +5286,22 @@ int si_shader_binary_upload(struct si_screen > *sscreen, struct si_shader *shader) > 0 : > SI_RESOURCE_FLAG_READ_ONLY, > PIPE_USAGE_IMMUTABLE, > align(bo_size, > SI_CPDMA_ALIGNMENT), > 256); > if (!shader->bo) > return -ENOMEM; > > /* Upload. */ > ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, > PIPE_TRANSFER_READ_WRITE | > - PIPE_TRANSFER_UNSYNCHRONIZED); > + PIPE_TRANSFER_UNSYNCHRONIZED | > + RADEON_TRANSFER_TEMPORARY); > > /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are > * endian-independent. */ > if (prolog) { > memcpy(ptr, prolog->code, prolog->code_size); > ptr += prolog->code_size; > } > if (previous_stage) { > memcpy(ptr, previous_stage->code, > previous_stage->code_size); > ptr += previous_stage->code_size; > diff --git a/src/gallium/include/pipe/p_defines.h > b/src/gallium/include/pipe/p_defines.h > index 693f041b1da..e99895d30d8 100644 > --- a/src/gallium/include/pipe/p_defines.h > +++ b/src/gallium/include/pipe/p_defines.h > @@ -334,21 +334,27 @@ enum pipe_transfer_usage > */ > PIPE_TRANSFER_PERSISTENT = (1 << 13), > > /** > * If PERSISTENT is set, this ensures any writes done by the device are > * immediately visible to the CPU and vice versa. > * > * PIPE_RESOURCE_FLAG_MAP_COHERENT must be set when creating > * the resource. > */ > - PIPE_TRANSFER_COHERENT = (1 << 14) > + PIPE_TRANSFER_COHERENT = (1 << 14), > + > + /** > + * This and higher bits are reserved for private use by drivers. > Drivers > + * should use this as (PIPE_TRANSFER_DRV_PRV << i). > + */ > + PIPE_TRANSFER_DRV_PRV = (1 << 24) > }; > > /** > * Flags for the flush function. > */ > enum pipe_flush_flags > { > PIPE_FLUSH_END_OF_FRAME = (1 << 0), > PIPE_FLUSH_DEFERRED = (1 << 1), > PIPE_FLUSH_FENCE_FD = (1 << 2), > diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c > b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c > index 9f0d4c12482..99cd8cb31a5 100644 > --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c > +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c > @@ -49,20 +49,21 @@ > struct amdgpu_sparse_backing_chunk { > uint32_t begin, end; > }; > > static struct pb_buffer * > amdgpu_bo_create(struct radeon_winsys *rws, > uint64_t size, > unsigned alignment, > enum radeon_bo_domain domain, > enum radeon_bo_flag flags); > +static void amdgpu_bo_unmap(struct pb_buffer *buf); > > static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, > enum radeon_bo_usage usage) > { > struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); > struct amdgpu_winsys *ws = bo->ws; > int64_t abs_timeout; > > if (timeout == 0) { > if (p_atomic_read(&bo->num_active_ioctls)) > @@ -166,20 +167,26 @@ static void amdgpu_bo_remove_fences(struct > amdgpu_winsys_bo *bo) > bo->max_fences = 0; > } > > void amdgpu_bo_destroy(struct pb_buffer *_buf) > { > struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); > struct amdgpu_winsys *ws = bo->ws; > > assert(bo->bo && "must not be called for slab entries"); > > + if (!bo->is_user_ptr && bo->cpu_ptr) { > + bo->cpu_ptr = NULL; > + amdgpu_bo_unmap(&bo->base); > + } > + assert(bo->is_user_ptr || bo->u.real.map_count == 0); > + > if (ws->debug_all_bos) { > simple_mtx_lock(&ws->global_bo_list_lock); > LIST_DEL(&bo->u.real.global_list_item); > ws->num_buffers--; > simple_mtx_unlock(&ws->global_bo_list_lock); > } > > simple_mtx_lock(&ws->bo_export_table_lock); > util_hash_table_remove(ws->bo_export_table, bo->bo); > simple_mtx_unlock(&ws->bo_export_table_lock); > @@ -188,54 +195,66 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) > amdgpu_va_range_free(bo->u.real.va_handle); > amdgpu_bo_free(bo->bo); > > amdgpu_bo_remove_fences(bo); > > if (bo->initial_domain & RADEON_DOMAIN_VRAM) > ws->allocated_vram -= align64(bo->base.size, > ws->info.gart_page_size); > else if (bo->initial_domain & RADEON_DOMAIN_GTT) > ws->allocated_gtt -= align64(bo->base.size, > ws->info.gart_page_size); > > - if (bo->u.real.map_count >= 1) { > - if (bo->initial_domain & RADEON_DOMAIN_VRAM) > - ws->mapped_vram -= bo->base.size; > - else if (bo->initial_domain & RADEON_DOMAIN_GTT) > - ws->mapped_gtt -= bo->base.size; > - ws->num_mapped_buffers--; > - } > - > simple_mtx_destroy(&bo->lock); > FREE(bo); > } > > static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf) > { > struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); > > assert(bo->bo); /* slab buffers have a separate vtbl */ > > if (bo->u.real.use_reusable_pool) > pb_cache_add_buffer(&bo->u.real.cache_entry); > else > amdgpu_bo_destroy(_buf); > } > > +static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu) > +{ > + assert(!bo->sparse && bo->bo && !bo->is_user_ptr); > + int r = amdgpu_bo_cpu_map(bo->bo, cpu); > + if (r) { > + /* Clear the cache and try again. */ > + pb_cache_release_all_buffers(&bo->ws->bo_cache); > + r = amdgpu_bo_cpu_map(bo->bo, cpu); > + if (r) > + return false; > + } > + > + if (p_atomic_inc_return(&bo->u.real.map_count) == 1) { > + if (bo->initial_domain & RADEON_DOMAIN_VRAM) > + bo->ws->mapped_vram += bo->base.size; > + else if (bo->initial_domain & RADEON_DOMAIN_GTT) > + bo->ws->mapped_gtt += bo->base.size; > + bo->ws->num_mapped_buffers++; > + } > + > + return true; > +} > + > static void *amdgpu_bo_map(struct pb_buffer *buf, > struct radeon_cmdbuf *rcs, > enum pipe_transfer_usage usage) > { > struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; > struct amdgpu_winsys_bo *real; > struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; > - int r; > - void *cpu = NULL; > - uint64_t offset = 0; > > assert(!bo->sparse); > > /* If it's not unsynchronized bo_map, flush CS if needed and then > wait. */ > if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { > /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */ > if (usage & PIPE_TRANSFER_DONTBLOCK) { > if (!(usage & PIPE_TRANSFER_WRITE)) { > /* Mapping for read. > * > @@ -306,63 +325,75 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, > } > > amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, > RADEON_USAGE_READWRITE); > } > > bo->ws->buffer_wait_time += os_time_get_nano() - time; > } > } > > - /* If the buffer is created from user memory, return the user pointer. > */ > - if (bo->user_ptr) > - return bo->user_ptr; > + /* Buffer synchronization has been checked, now actually map the > buffer. */ > + void *cpu = NULL; > + uint64_t offset = 0; > > if (bo->bo) { > real = bo; > } else { > real = bo->u.slab.real; > offset = bo->va - real->va; > } > > - r = amdgpu_bo_cpu_map(real->bo, &cpu); > - if (r) { > - /* Clear the cache and try again. */ > - pb_cache_release_all_buffers(&real->ws->bo_cache); > - r = amdgpu_bo_cpu_map(real->bo, &cpu); > - if (r) > - return NULL; > + if (usage & RADEON_TRANSFER_TEMPORARY) { > + if (real->is_user_ptr) { > + cpu = real->cpu_ptr; > + } else { > + if (!amdgpu_bo_do_map(real, &cpu)) > + return NULL; > + } > + } else { > + cpu = p_atomic_read(&real->cpu_ptr); > + if (!cpu) { > + simple_mtx_lock(&real->lock); > + /* Must re-check due to the possibility of a race. Re-check need > not > + * be atomic thanks to the lock. */ > + cpu = real->cpu_ptr; > + if (!cpu) { > + if (!amdgpu_bo_do_map(real, &cpu)) { > + simple_mtx_unlock(&real->lock); > + return NULL; > + } > + p_atomic_set(&real->cpu_ptr, cpu); > + } > + simple_mtx_unlock(&real->lock); > + } > } > > - if (p_atomic_inc_return(&real->u.real.map_count) == 1) { > - if (real->initial_domain & RADEON_DOMAIN_VRAM) > - real->ws->mapped_vram += real->base.size; > - else if (real->initial_domain & RADEON_DOMAIN_GTT) > - real->ws->mapped_gtt += real->base.size; > - real->ws->num_mapped_buffers++; > - } > return (uint8_t*)cpu + offset; > } > > static void amdgpu_bo_unmap(struct pb_buffer *buf) > { > struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; > struct amdgpu_winsys_bo *real; > > assert(!bo->sparse); > > - if (bo->user_ptr) > + if (bo->is_user_ptr) > return; > > real = bo->bo ? bo : bo->u.slab.real; > - > + assert(real->u.real.map_count != 0 && "too many unmaps"); > if (p_atomic_dec_zero(&real->u.real.map_count)) { > + assert(!real->cpu_ptr && > + "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag"); > + > if (real->initial_domain & RADEON_DOMAIN_VRAM) > real->ws->mapped_vram -= real->base.size; > else if (real->initial_domain & RADEON_DOMAIN_GTT) > real->ws->mapped_gtt -= real->base.size; > real->ws->num_mapped_buffers--; > } > > amdgpu_bo_cpu_unmap(real->bo); > } > > @@ -1446,28 +1477,29 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct > radeon_winsys *rws, > > if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, > aligned_size, 1 << 12, 0, &va, &va_handle, > AMDGPU_VA_RANGE_HIGH)) > goto error_va_alloc; > > if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, > AMDGPU_VA_OP_MAP)) > goto error_va_map; > > /* Initialize it. */ > + bo->is_user_ptr = true; > pipe_reference_init(&bo->base.reference, 1); > simple_mtx_init(&bo->lock, mtx_plain); > bo->bo = buf_handle; > bo->base.alignment = 0; > bo->base.size = size; > bo->base.vtbl = &amdgpu_winsys_bo_vtbl; > bo->ws = ws; > - bo->user_ptr = pointer; > + bo->cpu_ptr = pointer; > bo->va = va; > bo->u.real.va_handle = va_handle; > bo->initial_domain = RADEON_DOMAIN_GTT; > bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); > > ws->allocated_gtt += aligned_size; > > amdgpu_add_buffer_to_global_list(bo); > > amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, > &bo->u.real.kms_handle); > @@ -1480,21 +1512,21 @@ error_va_map: > error_va_alloc: > amdgpu_bo_free(buf_handle); > > error: > FREE(bo); > return NULL; > } > > static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf) > { > - return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL; > + return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr; > } > > static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf) > { > struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; > > return !bo->bo && !bo->sparse; > } > > static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) > diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h > b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h > index 58e6eed733d..88f4241327d 100644 > --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h > +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h > @@ -81,24 +81,25 @@ struct amdgpu_winsys_bo { > uint32_t num_backing_pages; > > struct list_head backing; > > /* Commitment information for each page of the virtual memory > area. */ > struct amdgpu_sparse_commitment *commitments; > } sparse; > } u; > > struct amdgpu_winsys *ws; > - void *user_ptr; /* from buffer_from_ptr */ > + void *cpu_ptr; /* for user_ptr and permanent maps */ > > amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */ > bool sparse; > + bool is_user_ptr; > bool is_local; > uint32_t unique_id; > uint64_t va; > enum radeon_bo_domain initial_domain; > > /* how many command streams is this bo referenced in? */ > int num_cs_references; > > /* how many command streams, which are being emitted in a separate > * thread, is this bo referenced in? */ > -- > 2.19.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev