Previously we always inserted 128 NOPs behind vm_flush, which
could push the DMA frame size above 256 DWs, causing it to be
automatically aligned up to 512 DWs.

Now we calculate how many DWs have already been inserted after
vm_flush and pad with the rest up to 128 DWs before emit_ib.
That way each submit only takes 256 DWs.
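
As a note on the TODO in the gfx_v8_ring_emit_cntxcntl hunk below,
here is a minimal sketch of a single-formula variant, assuming
ring->ptr_mask is the ring size in DWs minus one, so that a masked
subtraction covers the wrap-around case in one expression; the helper
name is hypothetical and not part of this patch:

static void gfx_v8_0_pad_vm_flush_gap(struct amdgpu_ring *ring)
{
	/* DWs emitted since the last vm_flush, modulo the ring size */
	u32 gap = (ring->wptr - ring->last_vm_flush_pos) & ring->ptr_mask;

	if (gap < 128)
		amdgpu_ring_insert_nop(ring, 128 - gap);
}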

v2:
drop insert_nops in vm_flush
re-calculate the estimated frame size for the gfx8 ring
v3:
calculate the gap between vm_flush and the IB in cntx_cntl,
based on a member field of the ring structure
v4:
set last_vm_flush_pos even for the case of no vm flush.
v5:
move the pipeline sync out of VM flush and always insert it
(see the sketch below).
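
For v5, the resulting ordering in amdgpu_ib_schedule() is roughly as
follows — a simplified sketch with error handling and unrelated steps
omitted; the actual change is in the first hunk below:

	if (ring->funcs->init_cond_exec)
		patch_offset = amdgpu_ring_init_cond_exec(ring);

	/* always emitted now, no longer conditional inside amdgpu_vm_flush() */
	if (ring->funcs->emit_pipeline_sync)
		amdgpu_ring_emit_pipeline_sync(ring);

	if (vm)
		r = amdgpu_vm_flush(ring, job); /* may emit vm_flush, which
						 * records last_vm_flush_pos
						 * on gfx8 */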

Change-Id: I670adf8c5e7bdfe5f4fd191c24f62a5d7a170d3a
Signed-off-by: Monk Liu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   |  8 +++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 31 -------------------------------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 19 +++++++++++++++----
 4 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ff39858..a1cd429 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -154,6 +154,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 
        alloc_size = ring->funcs->emit_frame_size + num_ibs *
                ring->funcs->emit_ib_size;
+       need_ctx_switch = ring->current_ctx != fence_ctx;
 
        r = amdgpu_ring_alloc(ring, alloc_size);
        if (r) {
@@ -164,7 +165,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        if (ring->funcs->init_cond_exec)
                patch_offset = amdgpu_ring_init_cond_exec(ring);
 
-       need_ctx_switch = ring->current_ctx != fence_ctx;
+       /* the pipeline sync prevents the following DE from changing SH
+        * registers before the previous submit is fully done (EOP signaled)
+        */
+       if (ring->funcs->emit_pipeline_sync)
+               amdgpu_ring_emit_pipeline_sync(ring);
+
        if (vm) {
                r = amdgpu_vm_flush(ring, job);
                if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e0f8061..4c546fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -173,6 +173,7 @@ struct amdgpu_ring {
 #if defined(CONFIG_DEBUG_FS)
        struct dentry *ent;
 #endif
+       u32 last_vm_flush_pos;
 };
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d05546e..4c1102f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -343,32 +343,6 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
        return r;
 }
 
-static bool amdgpu_vm_ring_has_compute_vm_bug(struct amdgpu_ring *ring)
-{
-       struct amdgpu_device *adev = ring->adev;
-       const struct amdgpu_ip_block *ip_block;
-
-       if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
-               /* only compute rings */
-               return false;
-
-       ip_block = amdgpu_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
-       if (!ip_block)
-               return false;
-
-       if (ip_block->version->major <= 7) {
-               /* gfx7 has no workaround */
-               return true;
-       } else if (ip_block->version->major == 8) {
-               if (adev->gfx.mec_fw_version >= 673)
-                       /* gfx8 is fixed in MEC firmware 673 */
-                       return false;
-               else
-                       return true;
-       }
-       return false;
-}
-
 /**
  * amdgpu_vm_flush - hardware flush the vm
  *
@@ -391,11 +365,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
                id->oa_size != job->oa_size);
        int r;
 
-       if (ring->funcs->emit_pipeline_sync && (
-           job->vm_needs_flush || gds_switch_needed ||
-           amdgpu_vm_ring_has_compute_vm_bug(ring)))
-               amdgpu_ring_emit_pipeline_sync(ring);
-
        if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
            amdgpu_vm_is_gpu_reset(adev, id))) {
                struct fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index d5719eb2..8421da2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6578,7 +6578,6 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
                          DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
        amdgpu_ring_write(ring, lower_32_bits(seq));
        amdgpu_ring_write(ring, upper_32_bits(seq));
-
 }
 
 static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
@@ -6595,6 +6594,8 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
        amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
        amdgpu_ring_write(ring, seq);
        amdgpu_ring_write(ring, 0xffffffff);
+       if (usepfp)
+               ring->last_vm_flush_pos = ring->wptr;
        amdgpu_ring_write(ring, 4); /* poll interval */
 }
 
@@ -6641,9 +6642,9 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
        if (usepfp) {
                /* sync PFP to ME, otherwise we might get invalid PFP reads */
                amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+               /* mark down the last position of vm_flush for padding NOPs before the CE IB */
+               ring->last_vm_flush_pos = ring->wptr;
                amdgpu_ring_write(ring, 0x0);
-               /* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
-               amdgpu_ring_insert_nop(ring, 128);
        }
 }
 
@@ -6749,6 +6750,16 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
        if (amdgpu_sriov_vf(ring->adev))
                gfx_v8_0_ring_emit_de_meta_init(ring,
                        (flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+
+       /* TODO: can we use one simple formula to cover the logic below? */
+       if (ring->wptr > ring->last_vm_flush_pos) {
+               if (ring->wptr - ring->last_vm_flush_pos < 128)
+                       amdgpu_ring_insert_nop(ring, 128 - (ring->wptr - ring->last_vm_flush_pos));
+       } else {
+               if (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr < 128)
+                       amdgpu_ring_insert_nop(ring,
+                               128 - (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr));
+       }
 }
 
 static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
@@ -7035,7 +7046,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
                5 + /* gfx_v8_0_ring_emit_hdp_invalidate */
                6 + 6 + 6 + /* gfx_v8_0_ring_emit_fence_gfx x3 for user fence, vm fence */
                7 + /* gfx_v8_0_ring_emit_pipeline_sync */
-               128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
+               128 - 63 + 19 + /* gfx_v8_0_ring_emit_vm_flush, and padding 
NOPs prior to IB */
                2 + /* gfx_v8_ring_emit_sb */
                3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
        .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
-- 
2.7.4
