Avoid constant register reloads while emitting IBs by using a local write
pointer and only updating the size at the end of each helper.
While at it, replace the NOP padding loop in si_dma_ring_pad_ib() with a
single memset32() call.
Signed-off-by: Tvrtko Ursulin <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/si_dma.c | 84 +++++++++++++++++------------
1 file changed, 51 insertions(+), 33 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c
b/drivers/gpu/drm/amd/amdgpu/si_dma.c
index 7f18e4875287..9e26c7598d74 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
@@ -323,14 +323,16 @@ static void si_dma_vm_copy_pte(struct amdgpu_ib *ib,
uint64_t pe, uint64_t src,
unsigned count)
{
+ u32 *ptr = &ib->ptr[ib->length_dw];
unsigned bytes = count * 8;
- ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
- 1, 0, 0, bytes);
- ib->ptr[ib->length_dw++] = lower_32_bits(pe);
- ib->ptr[ib->length_dw++] = lower_32_bits(src);
- ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
- ib->ptr[ib->length_dw++] = upper_32_bits(src) & 0xff;
+ *ptr++ = DMA_PACKET(DMA_PACKET_COPY, 1, 0, 0, bytes);
+ *ptr++ = lower_32_bits(pe);
+ *ptr++ = lower_32_bits(src);
+ *ptr++ = upper_32_bits(pe) & 0xff;
+ *ptr++ = upper_32_bits(src) & 0xff;
+
+ ib->length_dw = ptr - ib->ptr;
}
/**
@@ -348,16 +350,19 @@ static void si_dma_vm_write_pte(struct amdgpu_ib *ib,
uint64_t pe,
uint64_t value, unsigned count,
uint32_t incr)
{
+ u32 *ptr = &ib->ptr[ib->length_dw];
unsigned ndw = count * 2;
- ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
- ib->ptr[ib->length_dw++] = lower_32_bits(pe);
- ib->ptr[ib->length_dw++] = upper_32_bits(pe);
+ *ptr++ = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
+ *ptr++ = lower_32_bits(pe);
+ *ptr++ = upper_32_bits(pe);
for (; ndw > 0; ndw -= 2) {
- ib->ptr[ib->length_dw++] = lower_32_bits(value);
- ib->ptr[ib->length_dw++] = upper_32_bits(value);
+ *ptr++ = lower_32_bits(value);
+ *ptr++ = upper_32_bits(value);
value += incr;
}
+
+ ib->length_dw = ptr - ib->ptr;
}
/**
@@ -377,6 +382,7 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
uint64_t addr, unsigned count,
uint32_t incr, uint64_t flags)
{
+ u32 *ptr = &ib->ptr[ib->length_dw];
uint64_t value;
unsigned ndw;
@@ -391,19 +397,21 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
value = 0;
/* for physically contiguous pages (vram) */
- ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
- ib->ptr[ib->length_dw++] = pe; /* dst addr */
- ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
- ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
- ib->ptr[ib->length_dw++] = upper_32_bits(flags);
- ib->ptr[ib->length_dw++] = value; /* value */
- ib->ptr[ib->length_dw++] = upper_32_bits(value);
- ib->ptr[ib->length_dw++] = incr; /* increment size */
- ib->ptr[ib->length_dw++] = 0;
+ *ptr++ = DMA_PTE_PDE_PACKET(ndw);
+ *ptr++ = pe; /* dst addr */
+ *ptr++ = upper_32_bits(pe) & 0xff;
+ *ptr++ = lower_32_bits(flags); /* mask */
+ *ptr++ = upper_32_bits(flags);
+ *ptr++ = value; /* value */
+ *ptr++ = upper_32_bits(value);
+ *ptr++ = incr; /* increment size */
+ *ptr++ = 0;
pe += ndw * 4;
addr += (ndw / 2) * incr;
count -= ndw / 2;
}
+
+ ib->length_dw = ptr - ib->ptr;
}
/**
@@ -415,8 +423,12 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
*/
static void si_dma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
{
- while (ib->length_dw & 0x7)
- ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0,
0);
+ int pad = 8 - (ib->length_dw & 0x7); /* NOP dwords to next multiple of 8 (8 if aligned) */
+
+ if (pad && pad < 8) {
+ memset32(&ib->ptr[ib->length_dw], DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0), pad);
+ ib->length_dw += pad;
+ }
}
/**
@@ -783,12 +795,15 @@ static void si_dma_emit_copy_buffer(struct amdgpu_ib *ib,
uint32_t byte_count,
uint32_t copy_flags)
{
- ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
- 1, 0, 0, byte_count);
- ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
- ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
- ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) & 0xff;
- ib->ptr[ib->length_dw++] = upper_32_bits(src_offset) & 0xff;
+ u32 *ptr = &ib->ptr[ib->length_dw];
+
+ *ptr++ = DMA_PACKET(DMA_PACKET_COPY, 1, 0, 0, byte_count);
+ *ptr++ = lower_32_bits(dst_offset);
+ *ptr++ = lower_32_bits(src_offset);
+ *ptr++ = upper_32_bits(dst_offset) & 0xff;
+ *ptr++ = upper_32_bits(src_offset) & 0xff;
+
+ ib->length_dw = ptr - ib->ptr;
}
/**
@@ -806,11 +821,14 @@ static void si_dma_emit_fill_buffer(struct amdgpu_ib *ib,
uint64_t dst_offset,
uint32_t byte_count)
{
- ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_CONSTANT_FILL,
- 0, 0, 0, byte_count / 4);
- ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
- ib->ptr[ib->length_dw++] = src_data;
- ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) << 16;
+ u32 *ptr = &ib->ptr[ib->length_dw];
+
+ *ptr++ = DMA_PACKET(DMA_PACKET_CONSTANT_FILL, 0, 0, 0, byte_count / 4);
+ *ptr++ = lower_32_bits(dst_offset);
+ *ptr++ = src_data;
+ *ptr++ = upper_32_bits(dst_offset) << 16;
+
+ ib->length_dw = ptr - ib->ptr;
}
--
2.48.0