Re: [Mesa-dev] [PATCH 14/25] radeonsi: implement PIPE_FLUSH_{TOP,BOTTOM}_OF_PIPE

2017-11-03 Thread Nicolai Hähnle

On 03.11.2017 19:46, Marek Olšák wrote:

On Fri, Nov 3, 2017 at 3:48 PM, Nicolai Hähnle wrote:

On 31.10.2017 17:21, Marek Olšák wrote:


On Sun, Oct 22, 2017 at 9:07 PM, Nicolai Hähnle wrote:


From: Nicolai Hähnle 

---
   src/gallium/drivers/radeonsi/si_fence.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
   1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_fence.c
b/src/gallium/drivers/radeonsi/si_fence.c
index b7b02b55831..bc0ae302945 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -22,33 +22,41 @@
 *
 */

   #include <libsync.h>

   #include "util/os_time.h"
   #include "util/u_memory.h"
   #include "util/u_queue.h"

   #include "si_pipe.h"
+#include "radeon/r600_cs.h"
+
+struct si_fine_fence {
+   struct r600_resource *buf;
+   unsigned offset;
+};

   struct si_multi_fence {
  struct pipe_reference reference;
  struct pipe_fence_handle *gfx;
  struct pipe_fence_handle *sdma;
  struct tc_unflushed_batch_token *tc_token;
  struct util_queue_fence ready;

   /* If the context wasn't flushed at fence creation, this is non-NULL. */
  struct {
  struct r600_common_context *ctx;
  unsigned ib_index;
  } gfx_unflushed;
+
+   struct si_fine_fence fine;
   };

   static void si_add_fence_dependency(struct r600_common_context *rctx,
  struct pipe_fence_handle *fence)
   {
  struct radeon_winsys *ws = rctx->ws;

  if (rctx->dma.cs)
  ws->cs_add_fence_dependency(rctx->dma.cs, fence);
  ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
@@ -59,20 +67,21 @@ static void si_fence_reference(struct pipe_screen *screen,
                               struct pipe_fence_handle *src)
   {
   struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
   struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
   struct si_multi_fence *rsrc = (struct si_multi_fence *)src;

   if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
  ws->fence_reference(&(*rdst)->gfx, NULL);
  ws->fence_reference(&(*rdst)->sdma, NULL);
   tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
+   r600_resource_reference(&(*rdst)->fine.buf, NULL);
  FREE(*rdst);
  }
   *rdst = rsrc;
   }

   static struct si_multi_fence *si_create_multi_fence()
   {
  struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
  if (!fence)
  return NULL;
@@ -132,20 +141,66 @@ static void si_fence_server_sync(struct pipe_context *ctx,
   * this fence dependency is signalled.
   *
   * Should we flush the context to allow more GPU parallelism?
   */
  if (rfence->sdma)
  si_add_fence_dependency(rctx, rfence->sdma);
  if (rfence->gfx)
  si_add_fence_dependency(rctx, rfence->gfx);
   }

+static bool si_fine_fence_signaled(struct radeon_winsys *rws,
+  const struct si_fine_fence *fine)
+{
+   char *map = rws->buffer_map(fine->buf->buf, NULL,
+                               PIPE_TRANSFER_READ |
+                               PIPE_TRANSFER_UNSYNCHRONIZED);
+   if (!map)
+   return false;
+
+   uint32_t *fence = (uint32_t*)(map + fine->offset);
+   return *fence != 0;



I think this is not coherent on GFX9 if MTYPE != UC. MTYPE is set by the
GEM_CREATE ioctl.



Hmm, this stuff tends to give me headaches.

For both top-of-pipe and bottom-of-pipe, a direct write to memory is used
(WRITE_DATA with dst_sel=5 (memory); RELEASE_MEM with dst_sel=0
(memory_controller)), bypassing the GPU L2.

So the only issue could be with CPU caches, and with HDP if the buffer lands
in VRAM for some reason. The buffer we're using here is the same as for
queries, and AFAIK we don't do any special flushing for those either, so I
don't think this is really a problem?
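
(For reference, since the RELEASE_MEM path is cut off in the quoted patch
below: the bottom-of-pipe emission presumably ends up roughly like the
following sketch. The trailing arguments of si_gfx_write_event_eop are a
guess based on how the EOP helper is called elsewhere, not verbatim from
the patch.)

   } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
           /* RELEASE_MEM via the EOP helper: write a 32-bit value once
            * all prior work has drained from the pipe; dst_sel targets
            * the memory controller, bypassing the GPU L2. */
           si_gfx_write_event_eop(&ctx->b, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                                  1 /* data_sel: write 32-bit value */,
                                  fine->buf, fence_va, 0x80000000,
                                  PIPE_QUERY_GPU_FINISHED);
   }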


Right. It's not a problem. In the worst case, the fence write will be
visible after the next L2 flush on GFX9. I don't know the L2 cache
policy for CP packets on GFX9, but I guess it's "stream" or something
even stronger.

Note that allocator_zeroed_memory allocates from VRAM and the buffer
is actually never mapped (i.e. we could make it unmappable). We don't
have any suballocator instance for cached GTT memory at the moment.
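
(If we did want one, creating such an instance would presumably look
something like this sketch against util/u_suballoc.h; the
allocator_fine_fences name is made up:)

   /* Hypothetical: a suballocator for CPU-visible fine fences, backed by
    * cacheable staging memory instead of VRAM, without GPU-side zeroing. */
   rctx->allocator_fine_fences =
           u_suballocator_create(&rctx->b, 4096, 0 /* bind */,
                                 PIPE_USAGE_STAGING, 0 /* flags */,
                                 false /* no zeroed memory */);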

Also, u_suballocator clears memory using the GPU. If the clear hasn't
been started by the GPU, the memory contains garbage, which is not
ideal for CPU fences.


Good points, I'll rework the allocation.
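
(One possible shape for that rework, purely as a sketch: allocate the
fence dword from CPU-mappable memory and clear it from the CPU instead of
relying on the suballocator's GPU clear. The cached_gtt_allocator name is
hypothetical:)

   uint32_t *fence_ptr;

   /* Unlike u_suballocator_alloc, u_upload_alloc hands back a CPU
    * pointer alongside the GPU offset. */
   u_upload_alloc(ctx->b.cached_gtt_allocator, 0, 4, 4, &fine->offset,
                  (struct pipe_resource **)&fine->buf,
                  (void **)&fence_ptr);
   if (!fine->buf)
           return;

   /* CPU-side clear: the poll in si_fine_fence_signaled can never
    * observe pre-clear garbage. */
   *fence_ptr = 0;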

Cheers,
Nicolai

--
Learn how the world really is,
but never forget how it ought to be.

Re: [Mesa-dev] [PATCH 14/25] radeonsi: implement PIPE_FLUSH_{TOP,BOTTOM}_OF_PIPE

2017-10-31 Thread Marek Olšák
On Sun, Oct 22, 2017 at 9:07 PM, Nicolai Hähnle wrote:
> From: Nicolai Hähnle 
>
> ---
>  src/gallium/drivers/radeonsi/si_fence.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 82 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_fence.c 
> b/src/gallium/drivers/radeonsi/si_fence.c
> index b7b02b55831..bc0ae302945 100644
> --- a/src/gallium/drivers/radeonsi/si_fence.c
> +++ b/src/gallium/drivers/radeonsi/si_fence.c
> @@ -22,33 +22,41 @@
>   *
>   */
>
>  #include <libsync.h>
>
>  #include "util/os_time.h"
>  #include "util/u_memory.h"
>  #include "util/u_queue.h"
>
>  #include "si_pipe.h"
> +#include "radeon/r600_cs.h"
> +
> +struct si_fine_fence {
> +   struct r600_resource *buf;
> +   unsigned offset;
> +};
>
>  struct si_multi_fence {
> struct pipe_reference reference;
> struct pipe_fence_handle *gfx;
> struct pipe_fence_handle *sdma;
> struct tc_unflushed_batch_token *tc_token;
> struct util_queue_fence ready;
>
> /* If the context wasn't flushed at fence creation, this is non-NULL. */
> struct {
> struct r600_common_context *ctx;
> unsigned ib_index;
> } gfx_unflushed;
> +
> +   struct si_fine_fence fine;
>  };
>
>  static void si_add_fence_dependency(struct r600_common_context *rctx,
> struct pipe_fence_handle *fence)
>  {
> struct radeon_winsys *ws = rctx->ws;
>
> if (rctx->dma.cs)
> ws->cs_add_fence_dependency(rctx->dma.cs, fence);
> ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
> @@ -59,20 +67,21 @@ static void si_fence_reference(struct pipe_screen *screen,
>                struct pipe_fence_handle *src)
>  {
> struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
> struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
> struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
>
> if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
> ws->fence_reference(&(*rdst)->gfx, NULL);
> ws->fence_reference(&(*rdst)->sdma, NULL);
> tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
> +   r600_resource_reference(&(*rdst)->fine.buf, NULL);
> FREE(*rdst);
> }
>  *rdst = rsrc;
>  }
>
>  static struct si_multi_fence *si_create_multi_fence()
>  {
> struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
> if (!fence)
> return NULL;
> @@ -132,20 +141,66 @@ static void si_fence_server_sync(struct pipe_context *ctx,
>  * this fence dependency is signalled.
>  *
>  * Should we flush the context to allow more GPU parallelism?
>  */
> if (rfence->sdma)
> si_add_fence_dependency(rctx, rfence->sdma);
> if (rfence->gfx)
> si_add_fence_dependency(rctx, rfence->gfx);
>  }
>
> +static bool si_fine_fence_signaled(struct radeon_winsys *rws,
> +  const struct si_fine_fence *fine)
> +{
> +   char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
> +                               PIPE_TRANSFER_UNSYNCHRONIZED);
> +   if (!map)
> +   return false;
> +
> +   uint32_t *fence = (uint32_t*)(map + fine->offset);
> +   return *fence != 0;

I think this is not coherent on GFX9 if MTYPE != UC. MTYPE is set by the
GEM_CREATE ioctl.

> +}
> +
> +static void si_fine_fence_set(struct si_context *ctx,
> + struct si_fine_fence *fine,
> + unsigned flags)
> +{
> +   assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
> +
> +   u_suballocator_alloc(ctx->b.allocator_zeroed_memory, 4, 4,
> +                        &fine->offset, (struct pipe_resource **)&fine->buf);
> +   if (!fine->buf)
> +   return;
> +
> +   uint64_t fence_va = fine->buf->gpu_address + fine->offset;
> +
> +   radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, fine->buf,
> +                             RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
> +   if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
> +   struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
> +   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
> +   radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
> +   S_370_WR_CONFIRM(1) |
> +   S_370_ENGINE_SEL(V_370_PFP));
> +   radeon_emit(cs, fence_va);
> +   radeon_emit(cs, fence_va >> 32);
> +   radeon_emit(cs, 0x80000000);
> +   } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
> +   si_gfx_write_event_eop(&ctx->b, V_028A90_BOTTOM_OF_PIPE_TS, 0,
> +