From: Jerome Glisse <jgli...@redhat.com>

v2: Add virtual address to dma src/dst offset for cayman

Signed-off-by: Jerome Glisse <jgli...@redhat.com>
---
 src/gallium/drivers/r600/evergreen_hw_context.c |  46 ++++++
 src/gallium/drivers/r600/evergreen_state.c      | 201 ++++++++++++++++++++++++
 src/gallium/drivers/r600/evergreend.h           |  15 ++
 src/gallium/drivers/r600/r600.h                 |  27 ++++
 src/gallium/drivers/r600/r600_buffer.c          |  25 ++-
 src/gallium/drivers/r600/r600_hw_context.c      |  48 +++++-
 src/gallium/drivers/r600/r600_pipe.c            |   6 +-
 src/gallium/drivers/r600/r600_pipe.h            |   9 ++
 src/gallium/drivers/r600/r600_state.c           | 190 ++++++++++++++++++++++
 src/gallium/drivers/r600/r600_state_common.c    |   6 +-
 src/gallium/drivers/r600/r600_texture.c         |  24 ++-
 src/gallium/drivers/r600/r600d.h                |  15 ++
 12 files changed, 595 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c 
b/src/gallium/drivers/r600/evergreen_hw_context.c
index fa90c9a..ca4f4b3 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -26,6 +26,7 @@
 #include "r600_hw_context_priv.h"
 #include "evergreend.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 static const struct r600_reg cayman_config_reg_list[] = {
        {R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | 
REG_FLAG_FLUSH_CHANGE, 0},
@@ -238,3 +239,48 @@ void evergreen_set_streamout_enable(struct r600_context 
*ctx, unsigned buffer_en
                r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, 
S_028B94_STREAMOUT_0_EN(0));
        }
 }
+
+/* Copy size bytes from src (+src_offset) to dst (+dst_offset) by emitting
+ * DMA_PACKET_COPY packets on the dma ring. */
+void evergreen_dma_copy(struct r600_context *rctx,
+               struct pipe_resource *dst, struct pipe_resource *src,
+               unsigned long dst_offset, unsigned long src_offset,
+               unsigned long size)
+{
+       struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+       unsigned i, ncopy, csize, sub_cmd, shift;
+       struct r600_resource *rdst = (struct r600_resource*)dst;
+       struct r600_resource *rsrc = (struct r600_resource*)src;
+       uint64_t dst_va, src_va; /* "unsigned long" may be only 32 bits wide */
+
+       /* make sure that the dma ring is the only one active */
+       rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+       dst_va = r600_resource_va(&rctx->screen->screen, dst) + dst_offset;
+       src_va = r600_resource_va(&rctx->screen->screen, src) + src_offset;
+
+       /* see if we use dword or byte copy */
+       if (!(dst_va & 0x3) && !(src_va & 0x3) && !(size & 0x3)) {
+               size >>= 2;
+               sub_cmd = 0x00;
+               shift = 2;
+       } else {
+               sub_cmd = 0x40;
+               shift = 0;
+       }
+       ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
+       r600_need_dma_space(rctx, ncopy * 5);
+       for (i = 0; i < ncopy; i++) {
+               csize = size < 0x000fffff ? size : 0x000fffff;
+               /* emit relocs before writing the cs so that the cs is always in a consistent state */
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+               cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize);
+               cs->buf[cs->cdw++] = dst_va & 0xffffffff;
+               cs->buf[cs->cdw++] = src_va & 0xffffffff;
+               cs->buf[cs->cdw++] = (dst_va >> 32) & 0xff;
+               cs->buf[cs->cdw++] = (src_va >> 32) & 0xff;
+               dst_va += csize << shift;
+               src_va += csize << shift;
+               size -= csize;
+       }
+}
diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 86e2c81..5c22e24 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -30,6 +30,20 @@
 #include "util/u_framebuffer.h"
 #include "util/u_dual_blend.h"
 #include "evergreen_compute.h"
+#include "util/u_math.h"
+
+/* Map a RADEON_SURF_MODE_* value to the matching V_028C70_ARRAY_* value. */
+static INLINE unsigned evergreen_array_mode(unsigned mode)
+{
+       if (mode == RADEON_SURF_MODE_LINEAR_ALIGNED)
+               return V_028C70_ARRAY_LINEAR_ALIGNED;
+       if (mode == RADEON_SURF_MODE_1D)
+               return V_028C70_ARRAY_1D_TILED_THIN1;
+       if (mode == RADEON_SURF_MODE_2D)
+               return V_028C70_ARRAY_2D_TILED_THIN1;
+       /* RADEON_SURF_MODE_LINEAR and any value not listed above */
+       return V_028C70_ARRAY_LINEAR_GENERAL;
+}
 
 static uint32_t eg_num_banks(uint32_t nbanks)
 {
@@ -3445,3 +3459,190 @@ void evergreen_update_db_shader_control(struct 
r600_context * rctx)
                rctx->db_misc_state.atom.dirty = true;
        }
 }
+
+/* Emit DMA_PACKET_COPY packets (sub command 0x8) copying copy_height rows
+ * between a tiled and a linear surface level. A linear dst selects
+ * tiled-to-linear (T2L), anything else linear-to-tiled (L2T); src and dst
+ * must not have the same mode (asserted). pitch is used for both levels.
+ * NOTE(review): pitch is presumably in bytes and bpp in bytes per element
+ * (evergreen_dma_blit passes pitch_bytes and surface.bpe) - confirm. Each
+ * packet after the first starts rows_max rows further down - confirm that
+ * the hardware accepts a start row that is not a multiple of 8.
+ */
+static void evergreen_dma_copy_tile(struct r600_context *rctx,
+                               struct pipe_resource *dst, unsigned dst_level,
+                               unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                               struct pipe_resource *src, unsigned src_level,
+                               unsigned src_x, unsigned src_y, unsigned src_z,
+                               unsigned copy_height, unsigned pitch, unsigned bpp)
+{
+       struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+       struct r600_texture *rsrc = (struct r600_texture*)src;
+       struct r600_texture *rdst = (struct r600_texture*)dst;
+       unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
+       unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+       unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, rows_max;
+       uint64_t base, addr; /* "unsigned long" may be only 32 bits wide */
+
+       /* make sure that the dma ring is the only one active */
+       rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+       dst_mode = rdst->surface.level[dst_level].mode;
+       src_mode = rsrc->surface.level[src_level].mode;
+       /* downcast linear aligned to linear to simplify the tests */
+       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+       assert(dst_mode != src_mode);
+
+       y = 0;
+       sub_cmd = 0x8;
+       lbpp = util_logbase2(bpp);
+       pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+       nbanks = eg_num_banks(rctx->screen->tiling_info.num_banks);
+
+       if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+               /* T2L */
+               array_mode = evergreen_array_mode(src_mode);
+               slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+               /* the linear height must match the height used for slice_tile_max;
+                * this is fine even when the linear surface is shorter, because
+                * the size of the dma packet comes from copy_height, which is
+                * always smaller than or equal to the linear height
+                */
+               height = rsrc->surface.level[src_level].npix_y;
+               detile = 1;
+               x = src_x;
+               y = src_y;
+               z = src_z;
+               base = rsrc->surface.level[src_level].offset;
+               addr = rdst->surface.level[dst_level].offset;
+               addr += rdst->surface.level[dst_level].slice_size * dst_z;
+               addr += dst_y * pitch + dst_x * bpp;
+               bank_h = eg_bank_wh(rsrc->surface.bankh);
+               bank_w = eg_bank_wh(rsrc->surface.bankw);
+               mt_aspect = eg_macro_tile_aspect(rsrc->surface.mtilea);
+               tile_split = eg_tile_split(rsrc->surface.tile_split);
+               base += r600_resource_va(&rctx->screen->screen, src);
+               addr += r600_resource_va(&rctx->screen->screen, dst);
+       } else {
+               /* L2T */
+               array_mode = evergreen_array_mode(dst_mode);
+               slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+               /* the linear height must match the height used for slice_tile_max;
+                * this is fine even when the linear surface is shorter, because
+                * the size of the dma packet comes from copy_height, which is
+                * always smaller than or equal to the linear height
+                */
+               height = rdst->surface.level[dst_level].npix_y;
+               detile = 0;
+               x = dst_x;
+               y = dst_y;
+               z = dst_z;
+               base = rdst->surface.level[dst_level].offset;
+               addr = rsrc->surface.level[src_level].offset;
+               addr += rsrc->surface.level[src_level].slice_size * src_z;
+               addr += src_y * pitch + src_x * bpp;
+               bank_h = eg_bank_wh(rdst->surface.bankh);
+               bank_w = eg_bank_wh(rdst->surface.bankw);
+               mt_aspect = eg_macro_tile_aspect(rdst->surface.mtilea);
+               tile_split = eg_tile_split(rdst->surface.tile_split);
+               base += r600_resource_va(&rctx->screen->screen, dst);
+               addr += r600_resource_va(&rctx->screen->screen, src);
+       }
+
+       /* Split on whole rows; rows_max rows fit the 0xfffff dwords one packet can
+        * carry. No packet is emitted if pitch is 0 or if a single row cannot fit. */
+       rows_max = pitch ? (0x000fffff << 2) / pitch : 0;
+       ncopy = rows_max ? (copy_height + rows_max - 1) / rows_max : 0;
+       r600_need_dma_space(rctx, ncopy * 9);
+
+       for (i = 0; i < ncopy; i++) {
+               cheight = copy_height < rows_max ? copy_height : rows_max;
+               size = (cheight * pitch) >> 2;
+               /* emit relocs before writing the cs so that the cs is always in a consistent state */
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+               cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size);
+               cs->buf[cs->cdw++] = base >> 8;
+               cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+                                       (lbpp << 24) | (bank_h << 21) |
+                                       (bank_w << 18) | (mt_aspect << 16);
+               cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16);
+               cs->buf[cs->cdw++] = (slice_tile_max << 0);
+               cs->buf[cs->cdw++] = (x << 0) | (z << 18);
+               cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25);
+               cs->buf[cs->cdw++] = addr & 0xfffffffc;
+               cs->buf[cs->cdw++] = (addr >> 32) & 0xff;
+               copy_height -= cheight;
+               addr += cheight * pitch;
+               y += cheight;
+       }
+}
+
+/* Try to copy src_box of src/src_level to (dst_x, dst_y, dst_z) of
+ * dst/dst_level with the dma ring. Returns FALSE before emitting anything when
+ * the copy is not handled here, so the caller can use another copy path. */
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+                       struct pipe_resource *dst, unsigned dst_level,
+                       unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                       struct pipe_resource *src, unsigned src_level,
+                       const struct pipe_box *src_box)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_texture *rsrc = (struct r600_texture*)src;
+       struct r600_texture *rdst = (struct r600_texture*)dst;
+       unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+       unsigned src_w, dst_w;
+
+       /* no dma ring, a format conversion or more than one slice: not handled */
+       if (rctx->rings.dma.cs == NULL || src->format != dst->format ||
+           src_box->depth > 1) {
+               return FALSE;
+       }
+
+       bpp = rdst->surface.bpe;
+       dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+       src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+       src_w = rsrc->surface.level[src_level].npix_x;
+       dst_w = rdst->surface.level[dst_level].npix_x;
+       copy_height = src_box->height / rsrc->surface.blk_h;
+
+       dst_mode = rdst->surface.level[dst_level].mode;
+       src_mode = rsrc->surface.level[src_level].mode;
+       /* downcast linear aligned to linear to simplify the tests */
+       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+       if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+               /* FIXME evergreen can do partial blit */
+               return FALSE;
+       }
+       /* the x tests here are currently useless (because we don't support partial
+        * blits) but keep them around so we don't forget about them
+        */
+       if ((src_pitch & 0x7) || (src_box->x & 0x7) || (dst_x & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+               return FALSE;
+       }
+
+       if (src_mode == dst_mode) {
+               unsigned long dst_offset, src_offset;
+               /* same mode on both sides, a plain copy will do. NOTE the code
+                * here relies on the checks above:
+                *   src_box->x == 0 and dst_x == 0
+                *   dst_pitch == src_pitch
+                */
+               src_offset= rsrc->surface.level[src_level].offset;
+               src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+               src_offset += src_box->y * src_pitch + src_box->x * bpp;
+               dst_offset = rdst->surface.level[dst_level].offset;
+               dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+               dst_offset += dst_y * dst_pitch + dst_x * bpp;
+               evergreen_dma_copy(rctx, dst, src, dst_offset, src_offset,
+                                       src_box->height * src_pitch);
+       } else {
+               evergreen_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+                                       src, src_level, src_box->x, src_box->y, src_box->z,
+                                       copy_height, dst_pitch, bpp);
+       }
+       return TRUE;
+}
diff --git a/src/gallium/drivers/r600/evergreend.h 
b/src/gallium/drivers/r600/evergreend.h
index d9dba95..12c7ed1 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -2317,4 +2317,19 @@
 #define   G_028AA8_SWITCH_ON_EOP(x)                    (((x) >> 17) & 0x1)
 #define   C_028AA8_SWITCH_ON_EOP                       0xFFFDFFFF
 
+/* async DMA packet header: cmd in bits 31:28, sub_cmd in 27:20, n in 19:0 */
+#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xFu) << 28) |    \
+                                    (((sub_cmd) & 0xFFu) << 20) |\
+                                    (((n) & 0xFFFFFu) << 0))
+/* async DMA Packet types */
+#define    DMA_PACKET_WRITE                     0x2
+#define    DMA_PACKET_COPY                      0x3
+#define    DMA_PACKET_INDIRECT_BUFFER           0x4
+#define    DMA_PACKET_SEMAPHORE                 0x5
+#define    DMA_PACKET_FENCE                     0x6
+#define    DMA_PACKET_TRAP                      0x7
+#define    DMA_PACKET_SRBM_WRITE                0x9
+#define    DMA_PACKET_CONSTANT_FILL             0xd
+#define    DMA_PACKET_NOP                       0xf
+
 #endif
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 93604fb..3ee6b79 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -170,6 +170,33 @@ void r600_flush_emit(struct r600_context *ctx);
 void r600_context_streamout_begin(struct r600_context *ctx);
 void r600_context_streamout_end(struct r600_context *ctx);
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean 
count_draw_in);
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw);
+void r600_dma_copy(struct r600_context *rctx,
+               struct pipe_resource *dst,
+               struct pipe_resource *src,
+               unsigned long dst_offset,
+               unsigned long src_offset,
+               unsigned long size);
+boolean r600_dma_blit(struct pipe_context *ctx,
+                       struct pipe_resource *dst,
+                       unsigned dst_level,
+                       unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                       struct pipe_resource *src,
+                       unsigned src_level,
+                       const struct pipe_box *src_box);
+void evergreen_dma_copy(struct r600_context *rctx,
+               struct pipe_resource *dst,
+               struct pipe_resource *src,
+               unsigned long dst_offset,
+               unsigned long src_offset,
+               unsigned long size);
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+                       struct pipe_resource *dst,
+                       unsigned dst_level,
+                       unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                       struct pipe_resource *src,
+                       unsigned src_level,
+                       const struct pipe_box *src_box);
 void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block 
*block, unsigned pkt_flags);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, unsigned dst_offset,
diff --git a/src/gallium/drivers/r600/r600_buffer.c 
b/src/gallium/drivers/r600/r600_buffer.c
index be171f8..6df0d91 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -27,6 +27,7 @@
 #include "r600_pipe.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_memory.h"
+#include "util/u_surface.h"
 
 static void r600_buffer_destroy(struct pipe_screen *screen,
                                struct pipe_resource *buf)
@@ -179,13 +180,27 @@ static void r600_buffer_transfer_unmap(struct 
pipe_context *pipe,
        struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
 
        if (rtransfer->staging) {
-               struct pipe_box box;
-               u_box_1d(rtransfer->offset + transfer->box.x % 
R600_MAP_BUFFER_ALIGNMENT,
-                        transfer->box.width, &box);
+               struct pipe_resource *dst, *src;
+               unsigned soffset, doffset, size;
 
+               dst = transfer->resource;
+               src = &rtransfer->staging->b.b;
+               size = transfer->box.width;
+               doffset = transfer->box.x;
+               soffset = rtransfer->offset + transfer->box.x % 
R600_MAP_BUFFER_ALIGNMENT;
                /* Copy the staging buffer into the original one. */
-               r600_copy_buffer(pipe, transfer->resource, transfer->box.x,
-                                &rtransfer->staging->b.b, &box);
+               if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && 
!(soffset)) {
+                       if (rctx->screen->chip_class >= EVERGREEN) {
+                               evergreen_dma_copy(rctx, dst, src, doffset, 
soffset, size);
+                       } else {
+                               r600_dma_copy(rctx, dst, src, doffset, soffset, 
size);
+                       }
+               } else {
+                       struct pipe_box box;
+
+                       u_box_1d(soffset, size, &box);
+                       r600_copy_buffer(pipe, dst, doffset, src, &box);
+               }
                pipe_resource_reference((struct 
pipe_resource**)&rtransfer->staging, NULL);
        }
        util_slab_free(&rctx->pool_transfers, transfer);
diff --git a/src/gallium/drivers/r600/r600_hw_context.c 
b/src/gallium/drivers/r600/r600_hw_context.c
index 9cef87f..d7518a5 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -762,8 +762,6 @@ void r600_context_flush(struct r600_context *ctx, unsigned 
flags)
                }
        }
 #endif
-
-       r600_begin_new_cs(ctx);
 }
 
 void r600_begin_new_cs(struct r600_context *ctx)
@@ -1126,3 +1124,49 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                dst_offset += byte_count;
        }
 }
+
+/* Flush the dma ring first if num_dw more dwords would not fit in its cs. */
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
+{
+       unsigned dw_after = ctx->rings.dma.cs->cdw + num_dw;
+
+       if (dw_after > RADEON_MAX_CMDBUF_DWORDS) {
+               ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+       }
+}
+
+/* Copy size bytes from src to dst with dword DMA_PACKET_COPY packets on the
+ * dma ring; both offsets and size are expected to be dword aligned. */
+void r600_dma_copy(struct r600_context *rctx,
+               struct pipe_resource *dst, struct pipe_resource *src,
+               unsigned long dst_offset, unsigned long src_offset,
+               unsigned long size)
+{
+       struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+       unsigned i, ncopy, csize, shift;
+       struct r600_resource *rdst = (struct r600_resource*)dst;
+       struct r600_resource *rsrc = (struct r600_resource*)src;
+
+       /* make sure that the dma ring is the only one active */
+       rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+       size >>= 2;
+       shift = 2;
+       ncopy = (size / 0xffff) + !!(size % 0xffff);
+
+       r600_need_dma_space(rctx, ncopy * 5);
+       for (i = 0; i < ncopy; i++) {
+               csize = size < 0xffff ? size : 0xffff;
+               /* emit relocs before writing the cs so that the cs is always in a consistent state */
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+               cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
+               cs->buf[cs->cdw++] = dst_offset & 0xfffffffc;
+               cs->buf[cs->cdw++] = src_offset & 0xfffffffc;
+               cs->buf[cs->cdw++] = ((uint64_t)dst_offset >> 32) & 0xff;
+               cs->buf[cs->cdw++] = ((uint64_t)src_offset >> 32) & 0xff;
+               dst_offset += csize << shift;
+               src_offset += csize << shift;
+               size -= csize;
+       }
+}
diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index c72ee8f..6767412 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -30,6 +30,7 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_math.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "os/os_time.h"
@@ -128,12 +129,13 @@ static void r600_flush(struct pipe_context *ctx, unsigned 
flags)
        }
 
        r600_context_flush(rctx, flags);
+       rctx->rings.gfx.flushing = false;
+       r600_begin_new_cs(rctx);
 
        /* Re-enable render condition. */
        if (render_cond) {
                ctx->render_condition(ctx, render_cond, render_cond_mode);
        }
-       rctx->rings.gfx.flushing = false;
 }
 
 static void r600_flush_from_st(struct pipe_context *ctx,
@@ -1111,8 +1113,10 @@ struct pipe_screen *r600_screen_create(struct 
radeon_winsys *ws)
 
        if (rscreen->chip_class >= EVERGREEN) {
                rscreen->screen.is_format_supported = 
evergreen_is_format_supported;
+               rscreen->dma_blit = &evergreen_dma_blit;
        } else {
                rscreen->screen.is_format_supported = r600_is_format_supported;
+               rscreen->dma_blit = &r600_dma_blit;
        }
        rscreen->screen.is_video_format_supported = 
vl_video_buffer_is_format_supported;
        rscreen->screen.context_create = r600_create_context;
diff --git a/src/gallium/drivers/r600/r600_pipe.h 
b/src/gallium/drivers/r600/r600_pipe.h
index 5cb0805..31dcd05 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -220,6 +220,14 @@ enum r600_msaa_texture_mode {
        MSAA_TEXTURE_COMPRESSED
 };
 
+typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx,
+                               struct pipe_resource *dst,
+                               unsigned dst_level,
+                               unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                               struct pipe_resource *src,
+                               unsigned src_level,
+                               const struct pipe_box *src_box);
+
 struct r600_screen {
        struct pipe_screen              screen;
        struct radeon_winsys            *ws;
@@ -243,6 +251,7 @@ struct r600_screen {
        uint32_t                        *trace_ptr;
        unsigned                        cs_count;
 #endif
+       r600g_dma_blit_t                dma_blit;
 };
 
 struct r600_pipe_sampler_view {
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index e9fffaa..f5a914b 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2947,3 +2947,193 @@ void r600_update_db_shader_control(struct r600_context 
* rctx)
                rctx->db_misc_state.atom.dirty = true;
        }
 }
+
+/* Map a RADEON_SURF_MODE_* value to the matching V_0280A0_ARRAY_* value. */
+static INLINE unsigned r600_array_mode(unsigned mode)
+{
+       if (mode == RADEON_SURF_MODE_LINEAR_ALIGNED)
+               return V_0280A0_ARRAY_LINEAR_ALIGNED;
+       if (mode == RADEON_SURF_MODE_1D)
+               return V_0280A0_ARRAY_1D_TILED_THIN1;
+       if (mode == RADEON_SURF_MODE_2D)
+               return V_0280A0_ARRAY_2D_TILED_THIN1;
+       /* RADEON_SURF_MODE_LINEAR and any value not listed above */
+       return V_0280A0_ARRAY_LINEAR_GENERAL;
+}
+
+/* Emit DMA_PACKET_COPY packets copying copy_height rows between a tiled and
+ * a linear surface level. A linear dst selects tiled-to-linear (T2L),
+ * anything else linear-to-tiled (L2T); src and dst must not have the same
+ * mode (asserted). pitch is used for both levels. Returns FALSE, before any
+ * packet is emitted, when the alignment constraints are not met or when not
+ * even one row fits in a packet.
+ * NOTE(review): pitch is presumably in bytes and bpp in bytes per element
+ * (r600_dma_blit passes pitch_bytes and surface.bpe) - confirm. Packets after
+ * the first start at rows that may not be multiples of 8 - confirm that is ok.
+ */
+static boolean r600_dma_copy_tile(struct r600_context *rctx,
+                               struct pipe_resource *dst, unsigned dst_level,
+                               unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                               struct pipe_resource *src, unsigned src_level,
+                               unsigned src_x, unsigned src_y, unsigned src_z,
+                               unsigned copy_height, unsigned pitch, unsigned bpp)
+{
+       struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+       struct r600_texture *rsrc = (struct r600_texture*)src;
+       struct r600_texture *rdst = (struct r600_texture*)dst;
+       unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size, rows_max;
+       unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+       uint64_t base, addr; /* "unsigned long" may be only 32 bits wide */
+
+       /* make sure that the dma ring is the only one active */
+       rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+       dst_mode = rdst->surface.level[dst_level].mode;
+       src_mode = rsrc->surface.level[src_level].mode;
+       /* downcast linear aligned to linear to simplify the tests */
+       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+       assert(dst_mode != src_mode);
+
+       y = 0;
+       lbpp = util_logbase2(bpp);
+       pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+
+       if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+               /* T2L */
+               array_mode = r600_array_mode(src_mode);
+               slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+               /* the linear height must match the height used for slice_tile_max;
+                * this is fine even when the linear surface is shorter, because
+                * the size of the dma packet comes from copy_height, which is
+                * always smaller than or equal to the linear height
+                */
+               height = rsrc->surface.level[src_level].npix_y;
+               detile = 1;
+               x = src_x;
+               y = src_y;
+               z = src_z;
+               base = rsrc->surface.level[src_level].offset;
+               addr = rdst->surface.level[dst_level].offset;
+               addr += rdst->surface.level[dst_level].slice_size * dst_z;
+               addr += dst_y * pitch + dst_x * bpp;
+       } else {
+               /* L2T */
+               array_mode = r600_array_mode(dst_mode);
+               slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+               /* the linear height must match the height used for slice_tile_max;
+                * this is fine even when the linear surface is shorter, because
+                * the size of the dma packet comes from copy_height, which is
+                * always smaller than or equal to the linear height
+                */
+               height = rdst->surface.level[dst_level].npix_y;
+               detile = 0;
+               x = dst_x;
+               y = dst_y;
+               z = dst_z;
+               base = rdst->surface.level[dst_level].offset;
+               addr = rsrc->surface.level[src_level].offset;
+               addr += rsrc->surface.level[src_level].slice_size * src_z;
+               addr += src_y * pitch + src_x * bpp;
+       }
+       /* split on whole rows: rows_max rows fit the 0xffff dwords of one packet */
+       rows_max = pitch ? (0x0000ffff << 2) / pitch : 0;
+       /* check the dw/base alignment constraints and that at least one row fits */
+       if ((addr & 0x3) || (base & 0xff) || !rows_max) {
+               return FALSE;
+       }
+
+       ncopy = (copy_height + rows_max - 1) / rows_max;
+       r600_need_dma_space(rctx, ncopy * 7);
+       for (i = 0; i < ncopy; i++) {
+               cheight = copy_height < rows_max ? copy_height : rows_max;
+               size = (cheight * pitch) >> 2;
+               /* emit relocs before writing the cs so that the cs is always in a consistent state */
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+               r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+               cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
+               cs->buf[cs->cdw++] = base >> 8;
+               cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+                                       (lbpp << 24) | ((height - 1) << 10) |
+                                       pitch_tile_max;
+               cs->buf[cs->cdw++] = (slice_tile_max << 12) | (z << 0);
+               cs->buf[cs->cdw++] = (x << 3) | (y << 17);
+               cs->buf[cs->cdw++] = addr & 0xfffffffc;
+               cs->buf[cs->cdw++] = (addr >> 32) & 0xff;
+               copy_height -= cheight;
+               addr += cheight * pitch;
+               y += cheight;
+       }
+       return TRUE;
+}
+
+/* Try to copy src_box of src/src_level to (dst_x, dst_y, dst_z) of
+ * dst/dst_level with the dma ring. Returns FALSE when the copy is not handled
+ * here, so that the caller can use another copy path. */
+boolean r600_dma_blit(struct pipe_context *ctx,
+                       struct pipe_resource *dst, unsigned dst_level,
+                       unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                       struct pipe_resource *src, unsigned src_level,
+                       const struct pipe_box *src_box)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_texture *rsrc = (struct r600_texture*)src;
+       struct r600_texture *rdst = (struct r600_texture*)dst;
+       unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+       unsigned src_w, dst_w;
+
+       /* no dma ring, a format conversion or more than one slice: not handled */
+       if (rctx->rings.dma.cs == NULL || src->format != dst->format ||
+           src_box->depth > 1) {
+               return FALSE;
+       }
+
+       bpp = rdst->surface.bpe;
+       dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+       src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+       src_w = rsrc->surface.level[src_level].npix_x;
+       dst_w = rdst->surface.level[dst_level].npix_x;
+       copy_height = src_box->height / rsrc->surface.blk_h;
+
+       dst_mode = rdst->surface.level[dst_level].mode;
+       src_mode = rsrc->surface.level[src_level].mode;
+       /* downcast linear aligned to linear to simplify the tests */
+       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+       if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+               /* strict requirement on r6xx/r7xx */
+               return FALSE;
+       }
+       /* there are many alignment constraints; this should capture them all */
+       if ((src_pitch & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+               return FALSE;
+       }
+
+       if (src_mode == dst_mode) {
+               unsigned long dst_offset, src_offset, size;
+
+               /* same mode on both sides, a plain copy will do. NOTE the code
+                * here relies on the checks above:
+                *   src_box->x == 0 and dst_x == 0
+                *   dst_pitch == src_pitch
+                */
+               src_offset= rsrc->surface.level[src_level].offset;
+               src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+               src_offset += src_box->y * src_pitch + src_box->x * bpp;
+               dst_offset = rdst->surface.level[dst_level].offset;
+               dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+               dst_offset += dst_y * dst_pitch + dst_x * bpp;
+               size = src_box->height * src_pitch;
+               /* must be dw aligned */
+               if ((dst_offset & 0x3) || (src_offset & 0x3) || (size & 0x3)) {
+                       return FALSE;
+               }
+               r600_dma_copy(rctx, dst, src, dst_offset, src_offset, size);
+       } else {
+               return r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+                                       src, src_level, src_box->x, src_box->y, src_box->z,
+                                       copy_height, dst_pitch, bpp);
+       }
+       return TRUE;
+}
diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index c7f672f..b547d64 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1273,6 +1273,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const 
struct pipe_draw_info
                return;
        }
 
+       /* make sure that the gfx ring is the only one active */
+       rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
+
        if (!r600_update_derived_state(rctx)) {
                /* useless to render because current rendering command
                 * can't be achieved
@@ -1280,9 +1283,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const 
struct pipe_draw_info
                return;
        }
 
-       /* make sure that the gfx ring is only one active */
-       rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
-
        if (info.indexed) {
                /* Initialize the index buffer struct. */
                pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
diff --git a/src/gallium/drivers/r600/r600_texture.c 
b/src/gallium/drivers/r600/r600_texture.c
index 5b7873d..3403fec 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -35,13 +35,19 @@
 /* Copy from a full GPU texture to a transfer's staging one. */
 static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct 
r600_transfer *rtransfer)
 {
+       struct r600_context *rctx = (struct r600_context*)ctx;
        struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
        struct pipe_resource *dst = &rtransfer->staging->b.b;
        struct pipe_resource *src = transfer->resource;
 
        if (src->nr_samples <= 1) {
-               ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
-                                         src, transfer->level, &transfer->box);
+               if (!rctx->screen->dma_blit(ctx, dst, 0, 0, 0, 0,
+                                           src, transfer->level,
+                                           &transfer->box)) {
+                       /* async DMA could not be used */
+                       ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
+                                                 src, transfer->level, 
&transfer->box);
+               }
        } else {
                /* Resolve the resource. */
                struct pipe_blit_info blit;
@@ -66,16 +72,22 @@ static void r600_copy_to_staging_texture(struct 
pipe_context *ctx, struct r600_t
 /* Copy from a transfer's staging texture to a full GPU one. */
 static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct 
r600_transfer *rtransfer)
 {
+       struct r600_context *rctx = (struct r600_context*)ctx;
        struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
        struct pipe_resource *texture = transfer->resource;
        struct pipe_box sbox;
 
        u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, 
transfer->box.depth, &sbox);
 
-       ctx->resource_copy_region(ctx, texture, transfer->level,
-                                 transfer->box.x, transfer->box.y, 
transfer->box.z,
-                                 &rtransfer->staging->b.b,
-                                 0, &sbox);
+       if (!rctx->screen->dma_blit(ctx, texture, transfer->level,
+                                   transfer->box.x, transfer->box.y, 
transfer->box.z,
+                                   &rtransfer->staging->b.b, 0, &sbox)) {
+               /* async DMA could not be used */
+               ctx->resource_copy_region(ctx, texture, transfer->level,
+                                         transfer->box.x, transfer->box.y, 
transfer->box.z,
+                                         &rtransfer->staging->b.b,
+                                         0, &sbox);
+       }
 }
 
 unsigned r600_texture_get_offset(struct r600_texture *rtex,
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index dd64aca..621e7a1 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -3681,4 +3681,19 @@
 #define SQ_TEX_INST_SAMPLE_C_G_LB      0x1E
 #define SQ_TEX_INST_SAMPLE_C_G_LZ      0x1F
 
+/* async DMA packets */
+#define DMA_PACKET(cmd, t, s, n)       ((((cmd) & 0xF) << 28) |        \
+                                       (((t) & 0x1) << 23) |           \
+                                       (((s) & 0x1) << 22) |           \
+                                       (((n) & 0xFFFF) << 0))
+/* async DMA Packet types */
+#define DMA_PACKET_WRITE               0x2
+#define DMA_PACKET_COPY                        0x3
+#define DMA_PACKET_INDIRECT_BUFFER     0x4
+#define DMA_PACKET_SEMAPHORE           0x5
+#define DMA_PACKET_FENCE               0x6
+#define DMA_PACKET_TRAP                        0x7
+#define DMA_PACKET_CONSTANT_FILL       0xd /* 7xx only */
+#define DMA_PACKET_NOP                 0xf
+
 #endif
-- 
1.7.11.7

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to