From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_buffer.c | 30 ++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.h   |  8 +++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index a1e421b8b0d..1d4387252a0 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -443,32 +443,47 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
                         PIPE_TRANSFER_PERSISTENT))) ||
             (rbuffer->flags & RADEON_FLAG_SPARSE))) {
                assert(usage & PIPE_TRANSFER_WRITE);
 
                /* Check if mapping this buffer would cause waiting for the GPU.
                 */
                if (rbuffer->flags & RADEON_FLAG_SPARSE ||
                    force_discard_range ||
                    si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
                    !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+                       unsigned alloc_start = box->x % SI_MAP_BUFFER_ALIGNMENT;
+                       unsigned alloc_size = alloc_start + box->width;
+
+                       /* Use PKT3_WRITE_DATA for small uploads. */
+                       if (box->width <= SI_TRANSFER_WRITE_DATA_THRESHOLD &&
+                           box->x % 4 == 0 && box->width % 4 == 0) {
+                               void *cpu_map = u_cpu_suballoc(&sctx->cpu_suballoc, alloc_size,
+                                                              SI_MAP_BUFFER_ALIGNMENT);
+                               cpu_map = (char*)cpu_map + alloc_start;
+
+                               return si_buffer_get_transfer(ctx, resource, usage, box,
+                                                             ptransfer, cpu_map, cpu_map,
+                                                             SI_TRANSFER_SPECIAL_OFFSET_USE_CPU_ALLOC);
+                       }
+
                        /* Do a wait-free write-only transfer using a temporary buffer. */
                        unsigned offset;
                        struct r600_resource *staging = NULL;
 
                        u_upload_alloc(ctx->stream_uploader, 0,
-                                       box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
+                                       alloc_size,
                                       sctx->screen->info.tcc_cache_line_size,
                                       &offset, (struct pipe_resource**)&staging,
                                        (void**)&data);
 
                        if (staging) {
-                               data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+                               data += alloc_start;
                                return si_buffer_get_transfer(ctx, resource, usage, box,
                                                                ptransfer, data, staging, offset);
                        } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
                                return NULL;
                        }
                } else {
                        /* At this point, the buffer is always idle (we checked it above). */
                        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
                }
        }
@@ -530,26 +545,30 @@ static void si_buffer_write_data(struct si_context *sctx, struct r600_resource *
        si_cp_write_data(sctx, buf, offset, size, V_370_TC_L2, V_370_ME, data);
 
        radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
        radeon_emit(cs, 0);
 }
 
 static void si_buffer_do_flush_region(struct pipe_context *ctx,
                                      struct pipe_transfer *transfer,
                                      const struct pipe_box *box)
 {
+       struct si_context *sctx = (struct si_context*)ctx;
        struct si_transfer *stransfer = (struct si_transfer*)transfer;
        struct r600_resource *rbuffer = r600_resource(transfer->resource);
 
-       if (stransfer->u.staging) {
+       if (stransfer->offset == SI_TRANSFER_SPECIAL_OFFSET_USE_CPU_ALLOC) {
+               si_buffer_write_data(sctx, rbuffer, box->x, box->width,
+                                    stransfer->u.cpu);
+       } else if (stransfer->u.staging) {
                /* Copy the staging buffer into the original one. */
-               si_copy_buffer((struct si_context*)ctx, transfer->resource,
+               si_copy_buffer(sctx, transfer->resource,
                               &stransfer->u.staging->b.b, box->x,
                               stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
                               box->width);
        }
 
        util_range_add(&rbuffer->valid_buffer_range, box->x,
                       box->x + box->width);
 }
 
 static void si_buffer_flush_region(struct pipe_context *ctx,
@@ -570,21 +589,22 @@ static void si_buffer_flush_region(struct pipe_context *ctx,
 static void si_buffer_transfer_unmap(struct pipe_context *ctx,
                                     struct pipe_transfer *transfer)
 {
        struct si_context *sctx = (struct si_context*)ctx;
        struct si_transfer *stransfer = (struct si_transfer*)transfer;
 
        if (transfer->usage & PIPE_TRANSFER_WRITE &&
            !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
                si_buffer_do_flush_region(ctx, transfer, &transfer->box);
 
-       r600_resource_reference(&stransfer->u.staging, NULL);
+       if (stransfer->offset != SI_TRANSFER_SPECIAL_OFFSET_USE_CPU_ALLOC)
+               r600_resource_reference(&stransfer->u.staging, NULL);
        assert(stransfer->b.staging == NULL); /* for threaded context only */
        pipe_resource_reference(&transfer->resource, NULL);
 
        /* Don't use pool_transfers_unsync. We are always in the driver
         * thread. */
        slab_free(&sctx->pool_transfers, transfer);
 }
 
 static void si_buffer_subdata(struct pipe_context *ctx,
                              struct pipe_resource *buffer,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 3bb8e04e4ad..a17929c2d5f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -253,20 +253,21 @@ static void si_destroy_context(struct pipe_context *context)
        si_saved_cs_reference(&sctx->current_saved_cs, NULL);
 
        _mesa_hash_table_destroy(sctx->tex_handles, NULL);
        _mesa_hash_table_destroy(sctx->img_handles, NULL);
 
        util_dynarray_fini(&sctx->resident_tex_handles);
        util_dynarray_fini(&sctx->resident_img_handles);
        util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
        util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
        util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+       u_cpu_suballoc_deinit(&sctx->cpu_suballoc);
        FREE(sctx);
 }
 
 static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
        if (sctx->screen->info.has_gpu_reset_status_query)
                return sctx->ws->ctx_query_reset_status(sctx->ctx);
 
@@ -390,20 +391,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
        sctx->b.screen = screen; /* this must be set first */
        sctx->b.priv = NULL;
        sctx->b.destroy = si_destroy_context;
        sctx->b.emit_string_marker = si_emit_string_marker;
        sctx->b.set_debug_callback = si_set_debug_callback;
        sctx->b.set_log_context = si_set_log_context;
        sctx->b.set_context_param = si_set_context_param;
        sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
        sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
 
+       u_cpu_suballoc_init(&sctx->cpu_suballoc, 64 * 1024, SI_MAP_BUFFER_ALIGNMENT);
        slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
        slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
 
        sctx->ws = sscreen->ws;
        sctx->family = sscreen->info.family;
        sctx->chip_class = sscreen->info.chip_class;
 
        if (sscreen->info.has_gpu_reset_counter_query) {
                sctx->gpu_reset_counter =
                        sctx->ws->query_value(sctx->ws, RADEON_GPU_RESET_COUNTER);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f79828f3438..e2cca55a8e2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -21,20 +21,21 @@
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #ifndef SI_PIPE_H
 #define SI_PIPE_H
 
 #include "si_shader.h"
 #include "si_state.h"
 
+#include "util/u_cpu_suballoc.h"
 #include "util/u_dynarray.h"
 #include "util/u_idalloc.h"
 #include "util/u_threaded_context.h"
 
 #ifdef PIPE_ARCH_BIG_ENDIAN
 #define SI_BIG_ENDIAN 1
 #else
 #define SI_BIG_ENDIAN 0
 #endif
 
@@ -244,25 +245,31 @@ struct r600_resource {
        bool                            TC_L2_dirty;
 
        /* Whether this resource is referenced by bindless handles. */
        bool                            texture_handle_allocated;
        bool                            image_handle_allocated;
 
        /* Whether the resource has been exported via resource_get_handle. */
        unsigned                        external_usage; /* PIPE_HANDLE_USAGE_* */
 };
 
+#define SI_TRANSFER_SPECIAL_OFFSET_USE_CPU_ALLOC UINT_MAX
+
 struct si_transfer {
        struct threaded_transfer        b;
        union {
                struct r600_resource    *staging;
+               uint32_t                *cpu;
        } u;
+       /* If offset == SI_TRANSFER_SPECIAL_OFFSET_USE_CPU_ALLOC, use "cpu",
+        * else use "staging".
+        */
        unsigned                        offset;
 };
 
 struct si_texture {
        struct r600_resource            buffer;
 
        struct radeon_surf              surface;
        uint64_t                        size;
        struct si_texture               *flushed_depth_texture;
 
@@ -778,20 +785,21 @@ struct si_context {
        struct radeon_winsys            *ws;
        struct radeon_winsys_ctx        *ctx;
        struct radeon_cmdbuf            *gfx_cs;
        struct radeon_cmdbuf            *dma_cs;
        struct pipe_fence_handle        *last_gfx_fence;
        struct pipe_fence_handle        *last_sdma_fence;
        struct r600_resource            *eop_bug_scratch;
        struct u_upload_mgr             *cached_gtt_allocator;
        struct threaded_context         *tc;
        struct u_suballocator           *allocator_zeroed_memory;
+       struct u_cpu_suballoc           cpu_suballoc;
        struct slab_child_pool          pool_transfers;
        struct slab_child_pool          pool_transfers_unsync; /* for threaded_context */
        struct pipe_device_reset_callback device_reset_callback;
        struct u_log_context            *log;
        void                            *query_result_shader;
        struct blitter_context          *blitter;
        void                            *custom_dsa_flush;
        void                            *custom_blend_resolve;
        void                            *custom_blend_fmask_decompress;
        void                            *custom_blend_eliminate_fastclear;
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to