Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2019-01-22 Thread Marek Olšák
On Thu, Nov 29, 2018 at 11:23 AM Koenig, Christian 
wrote:

> Hi Marek,
>
> you stumbled over a pretty fundamental bug in the memory management here.
> Essentially we were leaking BOs when we ran into an OOM situation. Patch
> to fix this is on the mailing list.
>
> A second problem is that eviction doesn't seem to work when GDS BOs aren't
> idle. In other words when two applications try to use GDS at the same time
> they don't wait for each other, but rather one loses with an OOM message.
>

FYI, the OOM message is the last issue with GDS. Everything else is good.

I have a workaround in Mesa that re-submits the CS ioctl if it has failed
with -ENOMEM. I guess there is no downside to it other than the OOM spam in
dmesg.

Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-29 Thread Koenig, Christian
Hi Marek,

you stumbled over a pretty fundamental bug in the memory management here. 
Essentially we were leaking BOs when we ran into an OOM situation. Patch to 
fix this is on the mailing list.

A second problem is that eviction doesn't seem to work when GDS BOs aren't 
idle. In other words when two applications try to use GDS at the same time they 
don't wait for each other, but rather one loses with an OOM message.

Currently investigating why this is happening,
Christian.

Am 29.11.18 um 02:29 schrieb Marek Olšák:
Hi Christian,

I just pushed the commits.

The best way to reproduce the out-of-memory errors is to run 2 instances of the 
test simultaneously:

R600_DEBUG=testgdsmm glxgears &
R600_DEBUG=testgdsmm glxgears &

It takes about 10 seconds to finish and you'll get a lot of errors.

If you run it again, all GDS allocations will fail:

R600_DEBUG=testgdsmm glxgears
amdgpu: Failed to allocate a buffer:
amdgpu:size  : 32768 bytes
amdgpu:alignment : 4 bytes
amdgpu:domains   : 8

Marek

On Wed, Nov 28, 2018 at 2:16 PM Christian König 
<ckoenig.leichtzumer...@gmail.com> 
wrote:
Are those committed yet? They don't seem to apply cleanly on master.

Christian.

Am 27.11.18 um 02:56 schrieb Marek Olšák:
> From: Marek Olšák <marek.ol...@amd.com>
>
> ---
>   .../drivers/radeonsi/si_compute_blit.c|  4 +-
>   src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
>   src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
>   .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
>   4 files changed, 33 insertions(+), 31 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c 
> b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 20e4f591fbb..086793637f0 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx, struct 
> pipe_resource *dst,
>*/
>   if (clear_value_size > 4 ||
>   (clear_value_size == 4 &&
>offset % 4 == 0 &&
>(size > 32*1024 || sctx->chip_class <= VI))) {
>   si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
>   aligned_size, clear_value,
>   clear_value_size, coher);
>   } else {
>   assert(clear_value_size == 4);
> - si_cp_dma_clear_buffer(sctx, dst, offset,
> -aligned_size, *clear_value, 
> coher,
> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
> +aligned_size, *clear_value, 0, 
> coher,
>  get_cache_policy(sctx, coher, 
> size));
>   }
>
>   offset += aligned_size;
>   size -= aligned_size;
>   }
>
>   /* Handle non-dword alignment. */
>   if (size) {
>   assert(dst);
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 839b31b7fdf..33220d9f0fa 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct 
> si_context *sctx)
>
>   /* make it aligned for optimal performance */
>   return max & ~(SI_CPDMA_ALIGNMENT - 1);
>   }
>
>
>   /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
>* a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, 
> src_va is a 32-bit
>* clear value.
>*/
> -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
> -uint64_t src_va, unsigned size, unsigned flags,
> -enum si_cache_policy cache_policy)
> +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs,
> +uint64_t dst_va, uint64_t src_va, unsigned size,
> +unsigned flags, enum si_cache_policy cache_policy)
>   {
> - struct radeon_cmdbuf *cs = sctx->gfx_cs;
>   uint32_t header = 0, command = 0;
>
>   assert(size <= cp_dma_max_byte_count(sctx));
>   assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
>
>   if (sctx->chip_class >= GFX9)
>   command |= S_414_BYTE_COUNT_GFX9(size);
>   else
>   command |= S_414_BYTE_COUNT_GFX6(size);
>
> @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx, 
> uint64_t dst_va,
>   }
>
>   void si_cp_dma_wait_for_idle(struct si_context *sctx)
>   {
>   /* Issue a dummy DMA that copies zero bytes.
>*
>* The DMA engine will see that there's no work to do and skip this
>* DMA request, however, the CP will see the sync flag and still wait
> 

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-28 Thread Marek Olšák
Hi Christian,

I just pushed the commits.

The best way to reproduce the out-of-memory errors is to run 2 instances of
the test simultaneously:

R600_DEBUG=testgdsmm glxgears &
R600_DEBUG=testgdsmm glxgears &

It takes about 10 seconds to finish and you'll get a lot of errors.

If you run it again, all GDS allocations will fail:

R600_DEBUG=testgdsmm glxgears
amdgpu: Failed to allocate a buffer:
amdgpu:size  : 32768 bytes
amdgpu:alignment : 4 bytes
amdgpu:domains   : 8

Marek

On Wed, Nov 28, 2018 at 2:16 PM Christian König <
ckoenig.leichtzumer...@gmail.com> wrote:

> Are those committed yet? They don't seem to apply cleanly on master.
>
> Christian.
>
> Am 27.11.18 um 02:56 schrieb Marek Olšák:
> > From: Marek Olšák 
> >
> > ---
> >   .../drivers/radeonsi/si_compute_blit.c|  4 +-
> >   src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
> >   src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
> >   .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
> >   4 files changed, 33 insertions(+), 31 deletions(-)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
> b/src/gallium/drivers/radeonsi/si_compute_blit.c
> > index 20e4f591fbb..086793637f0 100644
> > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> > @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx,
> struct pipe_resource *dst,
> >*/
> >   if (clear_value_size > 4 ||
> >   (clear_value_size == 4 &&
> >offset % 4 == 0 &&
> >(size > 32*1024 || sctx->chip_class <= VI))) {
> >   si_compute_do_clear_or_copy(sctx, dst, offset,
> NULL, 0,
> >   aligned_size,
> clear_value,
> >   clear_value_size,
> coher);
> >   } else {
> >   assert(clear_value_size == 4);
> > - si_cp_dma_clear_buffer(sctx, dst, offset,
> > -aligned_size, *clear_value,
> coher,
> > + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
> offset,
> > +aligned_size, *clear_value,
> 0, coher,
> >  get_cache_policy(sctx,
> coher, size));
> >   }
> >
> >   offset += aligned_size;
> >   size -= aligned_size;
> >   }
> >
> >   /* Handle non-dword alignment. */
> >   if (size) {
> >   assert(dst);
> > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > index 839b31b7fdf..33220d9f0fa 100644
> > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct
> si_context *sctx)
> >
> >   /* make it aligned for optimal performance */
> >   return max & ~(SI_CPDMA_ALIGNMENT - 1);
> >   }
> >
> >
> >   /* Emit a CP DMA packet to do a copy from one buffer to another, or to
> clear
> >* a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set,
> src_va is a 32-bit
> >* clear value.
> >*/
> > -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
> > -uint64_t src_va, unsigned size, unsigned flags,
> > -enum si_cache_policy cache_policy)
> > +static void si_emit_cp_dma(struct si_context *sctx, struct
> radeon_cmdbuf *cs,
> > +uint64_t dst_va, uint64_t src_va, unsigned size,
> > +unsigned flags, enum si_cache_policy
> cache_policy)
> >   {
> > - struct radeon_cmdbuf *cs = sctx->gfx_cs;
> >   uint32_t header = 0, command = 0;
> >
> >   assert(size <= cp_dma_max_byte_count(sctx));
> >   assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
> >
> >   if (sctx->chip_class >= GFX9)
> >   command |= S_414_BYTE_COUNT_GFX9(size);
> >   else
> >   command |= S_414_BYTE_COUNT_GFX6(size);
> >
> > @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context
> *sctx, uint64_t dst_va,
> >   }
> >
> >   void si_cp_dma_wait_for_idle(struct si_context *sctx)
> >   {
> >   /* Issue a dummy DMA that copies zero bytes.
> >*
> >* The DMA engine will see that there's no work to do and skip this
> >* DMA request, however, the CP will see the sync flag and still
> wait
> >* for all DMAs to complete.
> >*/
> > - si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
> > + si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC,
> L2_BYPASS);
> >   }
> >
> >   static void si_cp_dma_prepare(struct si_context *sctx, struct
> pipe_resource *dst,
> > struct pipe_resource *src, unsigned
> byte_count,
> >

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-28 Thread Marek Olšák
No. I can push them after they are reviewed or acked.

Marek

On Wed, Nov 28, 2018 at 2:16 PM Christian König <
ckoenig.leichtzumer...@gmail.com> wrote:

> Are those committed yet? They don't seem to apply cleanly on master.
>
> Christian.
>
> Am 27.11.18 um 02:56 schrieb Marek Olšák:
> > From: Marek Olšák 
> >
> > ---
> >   .../drivers/radeonsi/si_compute_blit.c|  4 +-
> >   src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
> >   src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
> >   .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
> >   4 files changed, 33 insertions(+), 31 deletions(-)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
> b/src/gallium/drivers/radeonsi/si_compute_blit.c
> > index 20e4f591fbb..086793637f0 100644
> > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> > @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx,
> struct pipe_resource *dst,
> >*/
> >   if (clear_value_size > 4 ||
> >   (clear_value_size == 4 &&
> >offset % 4 == 0 &&
> >(size > 32*1024 || sctx->chip_class <= VI))) {
> >   si_compute_do_clear_or_copy(sctx, dst, offset,
> NULL, 0,
> >   aligned_size,
> clear_value,
> >   clear_value_size,
> coher);
> >   } else {
> >   assert(clear_value_size == 4);
> > - si_cp_dma_clear_buffer(sctx, dst, offset,
> > -aligned_size, *clear_value,
> coher,
> > + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
> offset,
> > +aligned_size, *clear_value,
> 0, coher,
> >  get_cache_policy(sctx,
> coher, size));
> >   }
> >
> >   offset += aligned_size;
> >   size -= aligned_size;
> >   }
> >
> >   /* Handle non-dword alignment. */
> >   if (size) {
> >   assert(dst);
> > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > index 839b31b7fdf..33220d9f0fa 100644
> > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct
> si_context *sctx)
> >
> >   /* make it aligned for optimal performance */
> >   return max & ~(SI_CPDMA_ALIGNMENT - 1);
> >   }
> >
> >
> >   /* Emit a CP DMA packet to do a copy from one buffer to another, or to
> clear
> >* a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set,
> src_va is a 32-bit
> >* clear value.
> >*/
> > -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
> > -uint64_t src_va, unsigned size, unsigned flags,
> > -enum si_cache_policy cache_policy)
> > +static void si_emit_cp_dma(struct si_context *sctx, struct
> radeon_cmdbuf *cs,
> > +uint64_t dst_va, uint64_t src_va, unsigned size,
> > +unsigned flags, enum si_cache_policy
> cache_policy)
> >   {
> > - struct radeon_cmdbuf *cs = sctx->gfx_cs;
> >   uint32_t header = 0, command = 0;
> >
> >   assert(size <= cp_dma_max_byte_count(sctx));
> >   assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
> >
> >   if (sctx->chip_class >= GFX9)
> >   command |= S_414_BYTE_COUNT_GFX9(size);
> >   else
> >   command |= S_414_BYTE_COUNT_GFX6(size);
> >
> > @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context
> *sctx, uint64_t dst_va,
> >   }
> >
> >   void si_cp_dma_wait_for_idle(struct si_context *sctx)
> >   {
> >   /* Issue a dummy DMA that copies zero bytes.
> >*
> >* The DMA engine will see that there's no work to do and skip this
> >* DMA request, however, the CP will see the sync flag and still
> wait
> >* for all DMAs to complete.
> >*/
> > - si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
> > + si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC,
> L2_BYPASS);
> >   }
> >
> >   static void si_cp_dma_prepare(struct si_context *sctx, struct
> pipe_resource *dst,
> > struct pipe_resource *src, unsigned
> byte_count,
> > uint64_t remaining_size, unsigned user_flags,
> > enum si_coherency coher, bool *is_first,
> > unsigned *packet_flags)
> >   {
> >   /* Fast exit for a CPDMA prefetch. */
> >   if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
> > @@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context
> *sctx, struct pipe_resource *dst
> > 

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-28 Thread Christian König

Are those committed yet? They don't seem to apply cleanly on master.

Christian.

Am 27.11.18 um 02:56 schrieb Marek Olšák:

From: Marek Olšák 

---
  .../drivers/radeonsi/si_compute_blit.c|  4 +-
  src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
  src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
  .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
  4 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c 
b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 20e4f591fbb..086793637f0 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx, struct 
pipe_resource *dst,
 */
if (clear_value_size > 4 ||
(clear_value_size == 4 &&
 offset % 4 == 0 &&
 (size > 32*1024 || sctx->chip_class <= VI))) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
aligned_size, clear_value,
clear_value_size, coher);
} else {
assert(clear_value_size == 4);
-   si_cp_dma_clear_buffer(sctx, dst, offset,
-  aligned_size, *clear_value, 
coher,
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
+  aligned_size, *clear_value, 0, 
coher,
   get_cache_policy(sctx, coher, 
size));
}
  
  		offset += aligned_size;

size -= aligned_size;
}
  
  	/* Handle non-dword alignment. */

if (size) {
assert(dst);
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 839b31b7fdf..33220d9f0fa 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct 
si_context *sctx)
  
  	/* make it aligned for optimal performance */

return max & ~(SI_CPDMA_ALIGNMENT - 1);
  }
  
  
  /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear

   * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va 
is a 32-bit
   * clear value.
   */
-static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
-  uint64_t src_va, unsigned size, unsigned flags,
-  enum si_cache_policy cache_policy)
+static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs,
+  uint64_t dst_va, uint64_t src_va, unsigned size,
+  unsigned flags, enum si_cache_policy cache_policy)
  {
-   struct radeon_cmdbuf *cs = sctx->gfx_cs;
uint32_t header = 0, command = 0;
  
  	assert(size <= cp_dma_max_byte_count(sctx));

assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
  
  	if (sctx->chip_class >= GFX9)

command |= S_414_BYTE_COUNT_GFX9(size);
else
command |= S_414_BYTE_COUNT_GFX6(size);
  
@@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,

  }
  
  void si_cp_dma_wait_for_idle(struct si_context *sctx)

  {
/* Issue a dummy DMA that copies zero bytes.
 *
 * The DMA engine will see that there's no work to do and skip this
 * DMA request, however, the CP will see the sync flag and still wait
 * for all DMAs to complete.
 */
-   si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
+   si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
  }
  
  static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,

  struct pipe_resource *src, unsigned byte_count,
  uint64_t remaining_size, unsigned user_flags,
  enum si_coherency coher, bool *is_first,
  unsigned *packet_flags)
  {
/* Fast exit for a CPDMA prefetch. */
if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
@@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context *sctx, 
struct pipe_resource *dst
 */
if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
byte_count == remaining_size) {
*packet_flags |= CP_DMA_SYNC;
  
  		if (coher == SI_COHERENCY_SHADER)

*packet_flags |= CP_DMA_PFP_SYNC_ME;
}
  }
  
-void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,

-   uint64_t offset, uint64_t size, unsigned value,
-   

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-26 Thread Marek Olšák
GDS = Global Data Share (on-chip memory)
IB = indirect buffer (command buffer)

Marek

On Mon, Nov 26, 2018 at 9:07 PM Mike Lothian  wrote:

> Hi
>
> What do GDS and IB stand for?
>
> Thanks
>
> Mike
>
> On Tue, 27 Nov 2018, 01:57 Marek Olšák,  wrote:
>
>> From: Marek Olšák 
>>
>> ---
>>  .../drivers/radeonsi/si_compute_blit.c|  4 +-
>>  src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
>>  src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
>>  .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
>>  4 files changed, 33 insertions(+), 31 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
>> b/src/gallium/drivers/radeonsi/si_compute_blit.c
>> index 20e4f591fbb..086793637f0 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
>> @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx,
>> struct pipe_resource *dst,
>>  */
>> if (clear_value_size > 4 ||
>> (clear_value_size == 4 &&
>>  offset % 4 == 0 &&
>>  (size > 32*1024 || sctx->chip_class <= VI))) {
>> si_compute_do_clear_or_copy(sctx, dst, offset,
>> NULL, 0,
>> aligned_size,
>> clear_value,
>> clear_value_size,
>> coher);
>> } else {
>> assert(clear_value_size == 4);
>> -   si_cp_dma_clear_buffer(sctx, dst, offset,
>> -  aligned_size,
>> *clear_value, coher,
>> +   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
>> offset,
>> +  aligned_size,
>> *clear_value, 0, coher,
>>get_cache_policy(sctx,
>> coher, size));
>> }
>>
>> offset += aligned_size;
>> size -= aligned_size;
>> }
>>
>> /* Handle non-dword alignment. */
>> if (size) {
>> assert(dst);
>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
>> b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> index 839b31b7fdf..33220d9f0fa 100644
>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct
>> si_context *sctx)
>>
>> /* make it aligned for optimal performance */
>> return max & ~(SI_CPDMA_ALIGNMENT - 1);
>>  }
>>
>>
>>  /* Emit a CP DMA packet to do a copy from one buffer to another, or to
>> clear
>>   * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set,
>> src_va is a 32-bit
>>   * clear value.
>>   */
>> -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
>> -  uint64_t src_va, unsigned size, unsigned flags,
>> -  enum si_cache_policy cache_policy)
>> +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf
>> *cs,
>> +  uint64_t dst_va, uint64_t src_va, unsigned
>> size,
>> +  unsigned flags, enum si_cache_policy
>> cache_policy)
>>  {
>> -   struct radeon_cmdbuf *cs = sctx->gfx_cs;
>> uint32_t header = 0, command = 0;
>>
>> assert(size <= cp_dma_max_byte_count(sctx));
>> assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
>>
>> if (sctx->chip_class >= GFX9)
>> command |= S_414_BYTE_COUNT_GFX9(size);
>> else
>> command |= S_414_BYTE_COUNT_GFX6(size);
>>
>> @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx,
>> uint64_t dst_va,
>>  }
>>
>>  void si_cp_dma_wait_for_idle(struct si_context *sctx)
>>  {
>> /* Issue a dummy DMA that copies zero bytes.
>>  *
>>  * The DMA engine will see that there's no work to do and skip
>> this
>>  * DMA request, however, the CP will see the sync flag and still
>> wait
>>  * for all DMAs to complete.
>>  */
>> -   si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
>> +   si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC,
>> L2_BYPASS);
>>  }
>>
>>  static void si_cp_dma_prepare(struct si_context *sctx, struct
>> pipe_resource *dst,
>>   struct pipe_resource *src, unsigned
>> byte_count,
>>   uint64_t remaining_size, unsigned
>> user_flags,
>>   enum si_coherency coher, bool *is_first,
>>   unsigned *packet_flags)
>>  {
>> /* Fast exit for a CPDMA prefetch. */
>> if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
>> @@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context
>> *sctx, struct pipe_resource *dst
>>  */
>>  

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

2018-11-26 Thread Mike Lothian
Hi

What do GDS and IB stand for?

Thanks

Mike

On Tue, 27 Nov 2018, 01:57 Marek Olšák,  wrote:

> From: Marek Olšák 
>
> ---
>  .../drivers/radeonsi/si_compute_blit.c|  4 +-
>  src/gallium/drivers/radeonsi/si_cp_dma.c  | 49 ++-
>  src/gallium/drivers/radeonsi/si_pipe.h|  8 +--
>  .../drivers/radeonsi/si_test_dma_perf.c   |  3 +-
>  4 files changed, 33 insertions(+), 31 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
> b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 20e4f591fbb..086793637f0 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx, struct
> pipe_resource *dst,
>  */
> if (clear_value_size > 4 ||
> (clear_value_size == 4 &&
>  offset % 4 == 0 &&
>  (size > 32*1024 || sctx->chip_class <= VI))) {
> si_compute_do_clear_or_copy(sctx, dst, offset,
> NULL, 0,
> aligned_size,
> clear_value,
> clear_value_size,
> coher);
> } else {
> assert(clear_value_size == 4);
> -   si_cp_dma_clear_buffer(sctx, dst, offset,
> -  aligned_size, *clear_value,
> coher,
> +   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
> offset,
> +  aligned_size, *clear_value,
> 0, coher,
>get_cache_policy(sctx,
> coher, size));
> }
>
> offset += aligned_size;
> size -= aligned_size;
> }
>
> /* Handle non-dword alignment. */
> if (size) {
> assert(dst);
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 839b31b7fdf..33220d9f0fa 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct
> si_context *sctx)
>
> /* make it aligned for optimal performance */
> return max & ~(SI_CPDMA_ALIGNMENT - 1);
>  }
>
>
>  /* Emit a CP DMA packet to do a copy from one buffer to another, or to
> clear
>   * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set,
> src_va is a 32-bit
>   * clear value.
>   */
> -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
> -  uint64_t src_va, unsigned size, unsigned flags,
> -  enum si_cache_policy cache_policy)
> +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf
> *cs,
> +  uint64_t dst_va, uint64_t src_va, unsigned size,
> +  unsigned flags, enum si_cache_policy
> cache_policy)
>  {
> -   struct radeon_cmdbuf *cs = sctx->gfx_cs;
> uint32_t header = 0, command = 0;
>
> assert(size <= cp_dma_max_byte_count(sctx));
> assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
>
> if (sctx->chip_class >= GFX9)
> command |= S_414_BYTE_COUNT_GFX9(size);
> else
> command |= S_414_BYTE_COUNT_GFX6(size);
>
> @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx,
> uint64_t dst_va,
>  }
>
>  void si_cp_dma_wait_for_idle(struct si_context *sctx)
>  {
> /* Issue a dummy DMA that copies zero bytes.
>  *
>  * The DMA engine will see that there's no work to do and skip this
>  * DMA request, however, the CP will see the sync flag and still
> wait
>  * for all DMAs to complete.
>  */
> -   si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
> +   si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC,
> L2_BYPASS);
>  }
>
>  static void si_cp_dma_prepare(struct si_context *sctx, struct
> pipe_resource *dst,
>   struct pipe_resource *src, unsigned
> byte_count,
>   uint64_t remaining_size, unsigned user_flags,
>   enum si_coherency coher, bool *is_first,
>   unsigned *packet_flags)
>  {
> /* Fast exit for a CPDMA prefetch. */
> if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
> @@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context
> *sctx, struct pipe_resource *dst
>  */
> if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
> byte_count == remaining_size) {
> *packet_flags |= CP_DMA_SYNC;
>
> if (coher == SI_COHERENCY_SHADER)
> *packet_flags |= CP_DMA_PFP_SYNC_ME;
> }
>  }