From: Marek Olšák <marek.ol...@amd.com>

v3: use PFP_SYNC_ME on EG-CM only when supported by the kernel,
    otherwise use MEM_WRITE + WAIT_REG_MEM to emulate that
---
 src/gallium/drivers/r600/evergreen_hw_context.c | 16 ++++--
 src/gallium/drivers/r600/evergreend.h           |  1 +
 src/gallium/drivers/r600/r600_blit.c            |  2 +-
 src/gallium/drivers/r600/r600_hw_context.c      | 69 ++++++++++++++++++++++++-
 src/gallium/drivers/r600/r600_pipe.h            |  5 +-
 src/gallium/drivers/r600/r600d.h                |  5 ++
 src/gallium/drivers/radeonsi/sid.h              |  2 +-
 7 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index f456696..2feb801 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                                   struct pipe_resource *dst, uint64_t offset,
-                                  unsigned size, uint32_t clear_value)
+                                  unsigned size, uint32_t clear_value,
+                                  enum r600_coherency coher)
 {
        struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
@@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
                unsigned reloc;
 
-               r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
+               r600_need_cs_space(rctx,
+                                  10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
+                                  R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
@@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                offset += byte_count;
        }
 
+       /* CP DMA is executed in ME, but index buffers are read by PFP.
+        * This ensures that ME (CP DMA) is idle before PFP starts fetching
+        * indices. If we wanted to execute CP DMA in PFP, this packet
+        * should precede it.
+        */
+       if (coher == R600_COHERENCY_SHADER)
+               r600_emit_pfp_sync_me(rctx);
+
        /* Invalidate the read caches. */
        rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                         R600_CONTEXT_INV_VERTEX_CACHE |
                         R600_CONTEXT_INV_TEX_CACHE;
 }
-
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index c1c6169..a81b6c5 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -88,6 +88,7 @@
 #define                WAIT_REG_MEM_EQUAL              3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME                      0x42
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 282645f..76c3364 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
        if (rctx->screen->b.has_cp_dma &&
            rctx->b.chip_class >= EVERGREEN &&
            offset % 4 == 0 && size % 4 == 0) {
-               evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
+               evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher);
        } else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
                union pipe_color_union clear_value;
                clear_value.ui[0] = value;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index bbfe620..1ae3f04 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -365,6 +365,66 @@ void r600_begin_new_cs(struct r600_context *ctx)
        ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 }
 
+void r600_emit_pfp_sync_me(struct r600_context *rctx) /* make PFP wait for ME */
+{
+       struct radeon_winsys_cs *cs = rctx->b.gfx.cs; /* all packets below go into the gfx CS */
+
+       if (rctx->b.chip_class >= EVERGREEN &&
+           rctx->b.screen->info.drm_minor >= 46) { /* native packet: EG+ and a new enough kernel */
+               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+               radeon_emit(cs, 0); /* the packet's single payload dword, always 0 here */
+       } else {
+               /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
+                * waiting for it in PFP. This path emits 16 dwords (5 + 2 + 7 + 2),
+                * which is what R600_MAX_PFP_SYNC_ME_DWORDS reserves. */
+               struct r600_resource *buf = NULL;
+               unsigned offset, reloc;
+               uint64_t va;
+
+               /* 16-byte address alignment is required by WAIT_REG_MEM. */
+               u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
+                                    &offset, (struct pipe_resource**)&buf); /* 4-byte slot; presumably starts as 0 - confirm */
+               if (!buf) { /* buf is still NULL, i.e. the suballocation failed */
+                       /* This is too heavyweight, but will work. */
+                       rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+                       return;
+               }
+
+               reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
+                                                 RADEON_USAGE_READWRITE,
+                                                 RADEON_PRIO_FENCE); /* value reused by both NOP packets below */
+
+               va = buf->gpu_address + offset; /* GPU address of the 4-byte slot */
+               assert(va % 16 == 0);
+
+               /* Write 1 to memory in ME. */
+               radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+               radeon_emit(cs, va); /* address, low dword (assumes radeon_emit takes 32 bits - confirm) */
+               radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); /* address bits 32..39 plus the 32-bit-write flag */
+               radeon_emit(cs, 1); /* the value PFP will wait for */
+               radeon_emit(cs, 0); /* presumably the unused upper data dword - confirm */
+
+               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* NOP carrying reloc for the MEM_WRITE above */
+               radeon_emit(cs, reloc);
+
+               /* Wait in PFP (PFP can only do GEQUAL against memory). */
+               radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+               radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
+                               WAIT_REG_MEM_MEMORY |
+                               WAIT_REG_MEM_PFP);
+               radeon_emit(cs, va); /* same slot that MEM_WRITE targets */
+               radeon_emit(cs, va >> 32);
+               radeon_emit(cs, 1); /* reference value */
+               radeon_emit(cs, 0xffffffff); /* mask */
+               radeon_emit(cs, 4); /* poll interval */
+
+               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* NOP carrying reloc for the WAIT_REG_MEM above */
+               radeon_emit(cs, reloc);
+
+               r600_resource_reference(&buf, NULL); /* drop the local reference to buf */
+       }
+}
+
 /* The max number of bytes to copy per packet. */
 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
 
@@ -408,7 +468,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 
                r600_need_cs_space(rctx,
                                   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
-                                  3, FALSE);
+                                  3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
@@ -448,6 +508,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
                                      S_008040_WAIT_CP_DMA_IDLE(1));
 
+       /* CP DMA is executed in ME, but index buffers are read by PFP.
+        * This ensures that ME (CP DMA) is idle before PFP starts fetching
+        * indices. If we wanted to execute CP DMA in PFP, this packet
+        * should precede it.
+        */
+       r600_emit_pfp_sync_me(rctx);
+
        /* Invalidate the read caches. */
        rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                         R600_CONTEXT_INV_VERTEX_CACHE |
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index cdb8e82..58ab14c 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -57,6 +57,7 @@
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS       18
 #define R600_MAX_DRAW_CS_DWORDS                58
+#define R600_MAX_PFP_SYNC_ME_DWORDS    16
 
 #define R600_MAX_USER_CONST_BUFFERS 13
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 void r600_begin_new_cs(struct r600_context *ctx);
 void r600_flush_emit(struct r600_context *ctx);
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_emit_pfp_sync_me(struct r600_context *rctx);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, uint64_t dst_offset,
                             struct pipe_resource *src, uint64_t src_offset,
                             unsigned size);
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                                   struct pipe_resource *dst, uint64_t offset,
-                                  unsigned size, uint32_t clear_value);
+                                  unsigned size, uint32_t clear_value,
+                                  enum r600_coherency coher);
 void r600_dma_copy_buffer(struct r600_context *rctx,
                          struct pipe_resource *dst,
                          struct pipe_resource *src,
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 24f599e..75d64c1 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -96,8 +96,13 @@
 #define                COPY_DW_DST_IS_MEM              (1 << 1)
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define                WAIT_REG_MEM_EQUAL              3
+#define                WAIT_REG_MEM_GEQUAL             5
+#define                WAIT_REG_MEM_MEMORY             (1 << 4)
+#define                WAIT_REG_MEM_PFP                (1 << 8)
 #define PKT3_MEM_WRITE                         0x3D
+#define                MEM_WRITE_32_BITS               (1 << 18)
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME                      0x42 /* EG+ */
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 25f8cf5..ddbfe00 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -135,7 +135,7 @@
 #define                COPY_DATA_DST_SEL(x)            (((unsigned)(x) & 0xf) << 8)
 #define                COPY_DATA_COUNT_SEL             (1 << 16)
 #define                COPY_DATA_WR_CONFIRM            (1 << 20)
-#define PKT3_PFP_SYNC_ME                      0x42 /* r7xx+ */
+#define PKT3_PFP_SYNC_ME                      0x42
 #define PKT3_SURFACE_SYNC                      0x43 /* deprecated on CIK, use ACQUIRE_MEM */
 #define PKT3_ME_INITIALIZE                     0x44 /* not on CIK */
 #define PKT3_COND_WRITE                        0x45
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to