Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-02-21 Thread Ilia Mirkin
On Wed, Feb 22, 2017 at 12:07 AM, Jason Ekstrand  wrote:
> Hey Look.  I'm actually reading your patch now!
>
> I read through the whole thing and over-all I think it looks fairly good.
>
> On Sun, Nov 27, 2016 at 11:23 AM, Ilia Mirkin  wrote:
>>
>> The strategy is to just keep n anv_query_pool_slot entries per query
>> instead of one. The available bit is only valid in the last one.
>
>
> Seems like a reasonable approach.  To be honest, I'm not a huge fan of the
> "available" bit (or 64 bits as the case may be) but I'm not sure how we'd
> get away without it.
>
> Maybe it would be better to do something like:
>
> struct anv_query_entry {
>uint64_t begin;
>uint64_t end;
> };
>
> struct anv_query_pool_slot {
>uint64_t available
>struct anv_query_entry entries[0];
> };
>
> Food for thought.

Seems reasonable.

>
>>
>> Signed-off-by: Ilia Mirkin 
>> ---
>>
>> I think this is in a pretty good state now. I've tested both the direct
>> and
>> buffer paths with a hacked up cube application, and I'm seeing
>> non-ridiculous
>> values for the various counters, although I haven't 100% verified them for
>> accuracy.
>>
>> This also implements the hsw/bdw workaround for dividing frag invocations
>> by 4,
>> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
>> values
>> as expected.
>>
>> The cube patch I've been testing with is at
>> http://paste.debian.net/899374/
>> You can flip between copying to a buffer and explicit retrieval by
>> commenting
>> out the relevant function calls.
>>
>>  src/intel/vulkan/anv_device.c  |   2 +-
>>  src/intel/vulkan/anv_private.h |   4 +
>>  src/intel/vulkan/anv_query.c   |  99 ++
>>  src/intel/vulkan/genX_cmd_buffer.c | 260
>> -
>>  4 files changed, 308 insertions(+), 57 deletions(-)
>>
>> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
>> index 99eb73c..7ad1970 100644
>> --- a/src/intel/vulkan/anv_device.c
>> +++ b/src/intel/vulkan/anv_device.c
>> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>>.textureCompressionASTC_LDR   = pdevice->info.gen >= 9,
>> /* FINISHME CHV */
>>.textureCompressionBC = true,
>>.occlusionQueryPrecise= true,
>> -  .pipelineStatisticsQuery  = false,
>> +  .pipelineStatisticsQuery  = true,
>>.fragmentStoresAndAtomics = true,
>>.shaderTessellationAndGeometryPointSize   = true,
>>.shaderImageGatherExtended= false,
>> diff --git a/src/intel/vulkan/anv_private.h
>> b/src/intel/vulkan/anv_private.h
>> index 2fc543d..7271609 100644
>> --- a/src/intel/vulkan/anv_private.h
>> +++ b/src/intel/vulkan/anv_private.h
>> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>> struct anv_subpass   subpasses[0];
>>  };
>>
>> +#define ANV_PIPELINE_STATISTICS_COUNT 11
>> +
>>  struct anv_query_pool_slot {
>> uint64_t begin;
>> uint64_t end;
>> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>>  struct anv_query_pool {
>> VkQueryType  type;
>> uint32_t slots;
>> +   uint32_t pipeline_statistics;
>> +   uint32_t slot_stride;
>> struct anv_bobo;
>>  };
>>
>> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
>> index 293257b..dc00859 100644
>> --- a/src/intel/vulkan/anv_query.c
>> +++ b/src/intel/vulkan/anv_query.c
>> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
>> ANV_FROM_HANDLE(anv_device, device, _device);
>> struct anv_query_pool *pool;
>> VkResult result;
>> -   uint32_t slot_size;
>> -   uint64_t size;
>> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
>>
>> +   uint32_t slot_stride = 1;
>
>
> Strides are usually in bytes, not slots...

slot_pitch? :) IMHO stride/pitch doesn't have a unit implied by the
name itself. I'm pretty sure I've seen it refer to both bytes and
higher-level units.

>
>>
>> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>
>
> Might make sense to move this to after we compute the slot_stride.
>
>>
>> +   uint32_t pipeline_statistics = 0;
>>
>> assert(pCreateInfo->sType ==
>> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>>
>> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
>> case VK_QUERY_TYPE_TIMESTAMP:
>>break;
>> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
>> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
>> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
>> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
>> +  slot_stride = _mesa_bitcount(pipeline_statistics);
>> +  size *= slot_stride;
>> +  break;
>> default:
>>assert(!"Invalid query type");
>> +

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-02-21 Thread Jason Ekstrand
Hey Look.  I'm actually reading your patch now!

I read through the whole thing and over-all I think it looks fairly good.

On Sun, Nov 27, 2016 at 11:23 AM, Ilia Mirkin  wrote:

> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>

Seems like a reasonable approach.  To be honest, I'm not a huge fan of the
"available" bit (or 64 bits as the case may be) but I'm not sure how we'd
get away without it.

Maybe it would be better to do something like:

struct anv_query_entry {
   uint64_t begin;
   uint64_t end;
};

struct anv_query_pool_slot {
   uint64_t available
   struct anv_query_entry entries[0];
};

Food for thought.


> Signed-off-by: Ilia Mirkin 
> ---
>
> I think this is in a pretty good state now. I've tested both the direct and
> buffer paths with a hacked up cube application, and I'm seeing
> non-ridiculous
> values for the various counters, although I haven't 100% verified them for
> accuracy.
>
> This also implements the hsw/bdw workaround for dividing frag invocations
> by 4,
> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
> values
> as expected.
>
> The cube patch I've been testing with is at http://paste.debian.net/899374
> /
> You can flip between copying to a buffer and explicit retrieval by
> commenting
> out the relevant function calls.
>
>  src/intel/vulkan/anv_device.c  |   2 +-
>  src/intel/vulkan/anv_private.h |   4 +
>  src/intel/vulkan/anv_query.c   |  99 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 260 ++
> ++-
>  4 files changed, 308 insertions(+), 57 deletions(-)
>
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index 99eb73c..7ad1970 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>.textureCompressionASTC_LDR   = pdevice->info.gen >= 9,
> /* FINISHME CHV */
>.textureCompressionBC = true,
>.occlusionQueryPrecise= true,
> -  .pipelineStatisticsQuery  = false,
> +  .pipelineStatisticsQuery  = true,
>.fragmentStoresAndAtomics = true,
>.shaderTessellationAndGeometryPointSize   = true,
>.shaderImageGatherExtended= false,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private
> .h
> index 2fc543d..7271609 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
> struct anv_subpass   subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
> uint64_t begin;
> uint64_t end;
> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
> VkQueryType  type;
> uint32_t slots;
> +   uint32_t pipeline_statistics;
> +   uint32_t slot_stride;
> struct anv_bobo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..dc00859 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
> ANV_FROM_HANDLE(anv_device, device, _device);
> struct anv_query_pool *pool;
> VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
>
+   uint32_t slot_stride = 1;
>

Strides are usually in bytes, not slots...


> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>

Might make sense to move this to after we compute the slot_stride.


> +   uint32_t pipeline_statistics = 0;
>
> assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_C
> REATE_INFO);
>
> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
> case VK_QUERY_TYPE_TIMESTAMP:
>break;
> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
> +  slot_stride = _mesa_bitcount(pipeline_statistics);
> +  size *= slot_stride;
> +  break;
> default:
>assert(!"Invalid query type");
> +  return VK_ERROR_INCOMPATIBLE_DRIVER;
> }
>
> -   slot_size = sizeof(struct anv_query_pool_slot);
> pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
>   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
> if (pool == NULL)
> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>
> pool->type = pCreateInfo->queryType;
> pool->slots = pCreateInfo->queryCount;
> +   

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-02-21 Thread Ilia Mirkin
Looks like after nearly 3 months of no reviews that lead to R-b's[1],
this patch no longer applies to master. I'm abandoning it. If anyone's
interested, feel free to pick it up and make it your own.

Cheers,

  -ilia

[1] Robert did look at it, but didn't have significant feedback or say
what I needed to do to get a R-b, nor is he one of the main anv
contributors, so I'd have to get someone else's anyways.


On Tue, Jan 24, 2017 at 2:48 PM, Ilia Mirkin  wrote:
> 2-month ping. [ok, it hasn't been 2 months on the dot, but ... close.]
>
> On Tue, Jan 10, 2017 at 5:49 PM, Ilia Mirkin  wrote:
>> ping.
>>
>> On Thu, Dec 22, 2016 at 11:14 AM, Ilia Mirkin  wrote:
>>> Ping? Any further comments/feedback/reviews?
>>>
>>>
>>> On Dec 5, 2016 11:22 AM, "Ilia Mirkin"  wrote:
>>>
>>> On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:


 On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:
>
> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>
> Signed-off-by: Ilia Mirkin 
> ---
>
> I think this is in a pretty good state now. I've tested both the direct
> and
> buffer paths with a hacked up cube application, and I'm seeing
> non-ridiculous
> values for the various counters, although I haven't 100% verified them
> for
> accuracy.
>
> This also implements the hsw/bdw workaround for dividing frag invocations
> by 4,
> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
> values
> as expected.
>
> The cube patch I've been testing with is at
> http://paste.debian.net/899374/
> You can flip between copying to a buffer and explicit retrieval by
> commenting
> out the relevant function calls.
>
>  src/intel/vulkan/anv_device.c  |   2 +-
>  src/intel/vulkan/anv_private.h |   4 +
>  src/intel/vulkan/anv_query.c   |  99 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 260
> -
>  4 files changed, 308 insertions(+), 57 deletions(-)
>
>
> diff --git a/src/intel/vulkan/anv_device.c
> b/src/intel/vulkan/anv_device.c
> index 99eb73c..7ad1970 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>.textureCompressionASTC_LDR   = pdevice->info.gen >=
> 9,
> /* FINISHME CHV */
>.textureCompressionBC = true,
>.occlusionQueryPrecise= true,
> -  .pipelineStatisticsQuery  = false,
> +  .pipelineStatisticsQuery  = true,
>.fragmentStoresAndAtomics = true,
>.shaderTessellationAndGeometryPointSize   = true,
>.shaderImageGatherExtended= false,
> diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> index 2fc543d..7271609 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
> struct anv_subpass   subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
> uint64_t begin;
> uint64_t end;
> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
> VkQueryType  type;
> uint32_t slots;
> +   uint32_t pipeline_statistics;
> +   uint32_t slot_stride;
> struct anv_bobo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..dc00859 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
> ANV_FROM_HANDLE(anv_device, device, _device);
> struct anv_query_pool *pool;
> VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
> +   uint32_t slot_stride = 1;
> +   uint64_t size = pCreateInfo->queryCount * slot_size;
> +   uint32_t pipeline_statistics = 0;
>
> assert(pCreateInfo->sType ==
> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>
> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
> case VK_QUERY_TYPE_TIMESTAMP:
>break;
> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -  return 

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-02-16 Thread Robert Bragg
On Wed, Feb 15, 2017 at 11:04 PM, Ilia Mirkin  wrote:
> On Tue, Jan 24, 2017 at 5:27 PM, Robert Bragg  wrote:
>> Depending on how strictly we consider that the queries should only 
>> measure
>> the commands they bracket then I think some stalling will be necessary to
>> serialize the work associated with a query and defer reading the end 
>> state
>> until after the relevant stages have completed their work.
>>
>> We aren't very precise about this in GL currently, but in Begin maybe we
>> should stall until everything >= the statistic-stage is idle and in End
>> stall until everything <= the statistic-stage is idle before reading
>> (where
>> 'statistic-stage' here is the pipeline stage associated with the pipeline
>> statistic being queried (or respectively the min/max stage for a set)).
>>
>> For reference in my implementation of INTEL_performance_query facing this
>> same question, I'm currently just stalling before and after queries:
>>
>>
>> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L994
>>
>> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L1136
>
> So that's essentially what I'm doing here, I think. (And what the GL
> driver does.)
>>
>> Yup, the upshot might just be a comment explaining the need for a
>> stall. I think we probably need a stall in CmdEndQuery too, otherwise
>> the command streamer may read the end counter before the work has
>> finished.
>
> Robert,
>
> Can you give me some examples of how I might implement this? I'm not
> so familiar with the Intel HW to know this offhand. Mostly hoping you
> can point me at a mapping of which bit in what command corresponds to
> which stage.

Heh, actually just after I sent out my series for
GL_INTEL_performance_query yesterday I of course remembered that I
needed to fold back command streamer synchronization from a later
patch to the one for pipeline statistics.

My last reply was just trying to suggest replacing the "TODO: This
might only be necessary for certain stats" comment - so nothing to
really implement. I had thought you might be missing a corresponding
stall in the CmdEndQuery but just checking it looks like you already
have one with the same TODO comment. Sorry I didn't double check that
at the time.

I'm not sure it's worth worrying about trying to apply fine grained
control over flushing, even though I suggested that idea originally.
After looking into that possibility more I don't think the HW actually
supports very detailed control (with one exception maybe being to use
DEPTH_STALL with occlusion queries).

My (limited) understanding is that a PIPE_CONTROL with CS_STALL and
STALL_AT_SCOREBOARD should generally suffice to stall the command
streamer until the pipeline has been drained. Since this is what you
are already doing my last reply was trying to say that it maybe just
needs a better comment to explain why we need:

+  /* TODO: This might only be necessary for certain stats */
+  anv_batch_emit(_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.CommandStreamerStallEnable = true;
+ pc.StallAtPixelScoreboard = true;
+  }

instead of "TODO: This might only be necessary for certain stats".


I don't know if it's a clear explanation, but feel free to steal
anything from my latest attempt to comment the need for stalling in
this patch for INTEL_performance_query.

https://lists.freedesktop.org/archives/mesa-dev/2017-February/144670.html

Btw, in case you ask, I've never found a good explanation of what
'stall at scoreboard' really means since I'm not really familiar with
what the scoreboard is. :-/ One impression I've got is that it's just
the least-restrictive way of satisfying the restrictions on using
CS_STALL. I think the scoreboard is something related to dependency
tracking while scheduling threads to execute on EUs so I currently
imagine it to mean "stall until there are no more threads left to
schedule for pixel shading" - maybe someone else knows better.

One other data point here is that the Intel driver on windows uses
PIPE_CONTROL + CS_STALL + STALL_AT_SCOREBOARD in its implementation of
INTEL_performance_query and query objects, so hopefully they've found
that to be enough. So even if this does nothing to explain why, all
things being equal it could be good to be consistent if we're ever
trying to compare metrics between different drivers.

Regards,
- Robert


>
> Thanks,
>
>   -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-02-15 Thread Ilia Mirkin
On Tue, Jan 24, 2017 at 5:27 PM, Robert Bragg  wrote:
> Depending on how strictly we consider that the queries should only measure
> the commands they bracket then I think some stalling will be necessary to
> serialize the work associated with a query and defer reading the end state
> until after the relevant stages have completed their work.
>
> We aren't very precise about this in GL currently, but in Begin maybe we
> should stall until everything >= the statistic-stage is idle and in End
> stall until everything <= the statistic-stage is idle before reading
> (where
> 'statistic-stage' here is the pipeline stage associated with the pipeline
> statistic being queried (or respectively the min/max stage for a set)).
>
> For reference in my implementation of INTEL_performance_query facing this
> same question, I'm currently just stalling before and after queries:
>
>
> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L994
>
> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L1136

 So that's essentially what I'm doing here, I think. (And what the GL
 driver does.)
>
> Yup, the upshot might just be a comment explaining the need for a
> stall. I think we probably need a stall in CmdEndQuery too, otherwise
> the command streamer may read the end counter before the work has
> finished.

Robert,

Can you give me some examples of how I might implement this? I'm not
so familiar with the Intel HW to know this offhand. Mostly hoping you
can point me at a mapping of which bit in what command corresponds to
which stage.

Thanks,

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-01-24 Thread Robert Bragg
On Tue, Jan 24, 2017 at 2:37 PM, Ilia Mirkin  wrote:
> On Tue, Jan 24, 2017 at 5:27 PM, Robert Bragg  wrote:
>>> +/*
>>> + * GPR0 = GPR0 >> 2;
>>> + *
>>> + * Note that the upper 30 bits of GPR are lost!
>>> + */
>>> +static void
>>> +shr_gpr0_by_2_bits(struct anv_batch *batch)
>>> +{
>>> +   shl_gpr0_by_30_bits(batch);
>>> +   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
>>> +   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
>>
>>
>> I recently noticed from inspecting the original hsw_queryobj,c code
>> that this looks suspicious.
>>
>> Conceptually it makes sense to implement a right shift as lshift by
>> 32-n and then keeping the upper 32bits, but the emit_load_ functions
>> take a destination followed by a source and so it looks like after the
>> left shift it's copying the least significant 32bits of R0 over the
>> most significant and then setting the most significant 32bits of R0 to
>> zero. It looks like the first load_alu is redundant if the second one
>> just writes zero to the same location.
>>
>> Maybe I'm misreading something here though, this comment it just based
>> on inspection.
>
> What you're missing, I think, is that
>
> emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
>
> does CS_GPR(0) = CS_GPR(0) + 4, and not the inverse as one logically
> might have thought. I copied the semantics from the hsw_queryobj.c
> file, but I think they stink. But it stinks even more to have 2
> functions with inverted argument meanings.
>
> Does that make sense?

oh yeah sorry, not sure how I convinced myself it took dst then src.

>
> [So we have GPR0 which is a 64-bit entity, and do GPR0 <<= 30; GPR0_LO
> = GPR0_HI; GPR0_HI = 0; and then we can store GPR0 somewhere.]
>
> As for re-using your generalized shifter, I don't think that'd make
> sense to introduce in this change. It feels like a component on its
> own, which should be integrated (or not) separately. When/if it is,
> this and hsw_queryobj.c could migrate to using it.

Yup definitely, this code works for the current need so no need to
mess around with it here - thanks for clarifying my misreading.

- Robert

>
> Cheers,
>
>   -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-01-24 Thread Ilia Mirkin
On Tue, Jan 24, 2017 at 5:27 PM, Robert Bragg  wrote:
>> +/*
>> + * GPR0 = GPR0 >> 2;
>> + *
>> + * Note that the upper 30 bits of GPR are lost!
>> + */
>> +static void
>> +shr_gpr0_by_2_bits(struct anv_batch *batch)
>> +{
>> +   shl_gpr0_by_30_bits(batch);
>> +   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
>> +   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
>
>
> I recently noticed from inspecting the original hsw_queryobj,c code
> that this looks suspicious.
>
> Conceptually it makes sense to implement a right shift as lshift by
> 32-n and then keeping the upper 32bits, but the emit_load_ functions
> take a destination followed by a source and so it looks like after the
> left shift it's copying the least significant 32bits of R0 over the
> most significant and then setting the most significant 32bits of R0 to
> zero. It looks like the first load_alu is redundant if the second one
> just writes zero to the same location.
>
> Maybe I'm misreading something here though, this comment it just based
> on inspection.

What you're missing, I think, is that

emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));

does CS_GPR(0) = CS_GPR(0) + 4, and not the inverse as one logically
might have thought. I copied the semantics from the hsw_queryobj.c
file, but I think they stink. But it stinks even more to have 2
functions with inverted argument meanings.

Does that make sense?

[So we have GPR0 which is a 64-bit entity, and do GPR0 <<= 30; GPR0_LO
= GPR0_HI; GPR0_HI = 0; and then we can store GPR0 somewhere.]

As for re-using your generalized shifter, I don't think that'd make
sense to introduce in this change. It feels like a component on its
own, which should be integrated (or not) separately. When/if it is,
this and hsw_queryobj.c could migrate to using it.

Cheers,

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-01-24 Thread Robert Bragg
Sorry for the delay responding here; some comments below...


On Tue, Jan 24, 2017 at 11:48 AM, Ilia Mirkin  wrote:
> 2-month ping. [ok, it hasn't been 2 months on the dot, but ... close.]
>
> On Tue, Jan 10, 2017 at 5:49 PM, Ilia Mirkin  wrote:
>> ping.
>>
>> On Thu, Dec 22, 2016 at 11:14 AM, Ilia Mirkin  wrote:
>>> Ping? Any further comments/feedback/reviews?
>>>
>>>
>>> On Dec 5, 2016 11:22 AM, "Ilia Mirkin"  wrote:
>>>
>>> On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:


 On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:
>
> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>
> Signed-off-by: Ilia Mirkin 
> ---
>
> I think this is in a pretty good state now. I've tested both the direct
> and
> buffer paths with a hacked up cube application, and I'm seeing
> non-ridiculous
> values for the various counters, although I haven't 100% verified them
> for
> accuracy.
>
> This also implements the hsw/bdw workaround for dividing frag invocations
> by 4,
> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
> values
> as expected.
>
> The cube patch I've been testing with is at
> http://paste.debian.net/899374/
> You can flip between copying to a buffer and explicit retrieval by
> commenting
> out the relevant function calls.
>
>  src/intel/vulkan/anv_device.c  |   2 +-
>  src/intel/vulkan/anv_private.h |   4 +
>  src/intel/vulkan/anv_query.c   |  99 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 260
> -
>  4 files changed, 308 insertions(+), 57 deletions(-)
>
>
> diff --git a/src/intel/vulkan/anv_device.c
> b/src/intel/vulkan/anv_device.c
> index 99eb73c..7ad1970 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>.textureCompressionASTC_LDR   = pdevice->info.gen >=
> 9,
> /* FINISHME CHV */
>.textureCompressionBC = true,
>.occlusionQueryPrecise= true,
> -  .pipelineStatisticsQuery  = false,
> +  .pipelineStatisticsQuery  = true,
>.fragmentStoresAndAtomics = true,
>.shaderTessellationAndGeometryPointSize   = true,
>.shaderImageGatherExtended= false,
> diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> index 2fc543d..7271609 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
> struct anv_subpass   subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
> uint64_t begin;
> uint64_t end;
> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
> VkQueryType  type;
> uint32_t slots;
> +   uint32_t pipeline_statistics;
> +   uint32_t slot_stride;
> struct anv_bobo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..dc00859 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
> ANV_FROM_HANDLE(anv_device, device, _device);
> struct anv_query_pool *pool;
> VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
> +   uint32_t slot_stride = 1;
> +   uint64_t size = pCreateInfo->queryCount * slot_size;
> +   uint32_t pipeline_statistics = 0;
>
> assert(pCreateInfo->sType ==
> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>
> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
> case VK_QUERY_TYPE_TIMESTAMP:
>break;
> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
> +  slot_stride = _mesa_bitcount(pipeline_statistics);
> +  size *= slot_stride;
> +  break;
> default:
>assert(!"Invalid query type");
> +  return 

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-01-24 Thread Ilia Mirkin
2-month ping. [ok, it hasn't been 2 months on the dot, but ... close.]

On Tue, Jan 10, 2017 at 5:49 PM, Ilia Mirkin  wrote:
> ping.
>
> On Thu, Dec 22, 2016 at 11:14 AM, Ilia Mirkin  wrote:
>> Ping? Any further comments/feedback/reviews?
>>
>>
>> On Dec 5, 2016 11:22 AM, "Ilia Mirkin"  wrote:
>>
>> On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:
>>>
>>>
>>> On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:

 The strategy is to just keep n anv_query_pool_slot entries per query
 instead of one. The available bit is only valid in the last one.

 Signed-off-by: Ilia Mirkin 
 ---

 I think this is in a pretty good state now. I've tested both the direct
 and
 buffer paths with a hacked up cube application, and I'm seeing
 non-ridiculous
 values for the various counters, although I haven't 100% verified them
 for
 accuracy.

 This also implements the hsw/bdw workaround for dividing frag invocations
 by 4,
 copied from hsw_queryobj. I tested this on SKL and it seem to divide the
 values
 as expected.

 The cube patch I've been testing with is at
 http://paste.debian.net/899374/
 You can flip between copying to a buffer and explicit retrieval by
 commenting
 out the relevant function calls.

  src/intel/vulkan/anv_device.c  |   2 +-
  src/intel/vulkan/anv_private.h |   4 +
  src/intel/vulkan/anv_query.c   |  99 ++
  src/intel/vulkan/genX_cmd_buffer.c | 260
 -
  4 files changed, 308 insertions(+), 57 deletions(-)


 diff --git a/src/intel/vulkan/anv_device.c
 b/src/intel/vulkan/anv_device.c
 index 99eb73c..7ad1970 100644
 --- a/src/intel/vulkan/anv_device.c
 +++ b/src/intel/vulkan/anv_device.c
 @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
.textureCompressionASTC_LDR   = pdevice->info.gen >=
 9,
 /* FINISHME CHV */
.textureCompressionBC = true,
.occlusionQueryPrecise= true,
 -  .pipelineStatisticsQuery  = false,
 +  .pipelineStatisticsQuery  = true,
.fragmentStoresAndAtomics = true,
.shaderTessellationAndGeometryPointSize   = true,
.shaderImageGatherExtended= false,
 diff --git a/src/intel/vulkan/anv_private.h
 b/src/intel/vulkan/anv_private.h
 index 2fc543d..7271609 100644
 --- a/src/intel/vulkan/anv_private.h
 +++ b/src/intel/vulkan/anv_private.h
 @@ -1763,6 +1763,8 @@ struct anv_render_pass {
 struct anv_subpass   subpasses[0];
  };

 +#define ANV_PIPELINE_STATISTICS_COUNT 11
 +
  struct anv_query_pool_slot {
 uint64_t begin;
 uint64_t end;
 @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
  struct anv_query_pool {
 VkQueryType  type;
 uint32_t slots;
 +   uint32_t pipeline_statistics;
 +   uint32_t slot_stride;
 struct anv_bobo;
  };

 diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
 index 293257b..dc00859 100644
 --- a/src/intel/vulkan/anv_query.c
 +++ b/src/intel/vulkan/anv_query.c
 @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
 ANV_FROM_HANDLE(anv_device, device, _device);
 struct anv_query_pool *pool;
 VkResult result;
 -   uint32_t slot_size;
 -   uint64_t size;
 +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
 +   uint32_t slot_stride = 1;
 +   uint64_t size = pCreateInfo->queryCount * slot_size;
 +   uint32_t pipeline_statistics = 0;

 assert(pCreateInfo->sType ==
 VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

 @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
 case VK_QUERY_TYPE_TIMESTAMP:
break;
 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
 -  return VK_ERROR_INCOMPATIBLE_DRIVER;
 +  pipeline_statistics = pCreateInfo->pipelineStatistics &
 + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
 +  slot_stride = _mesa_bitcount(pipeline_statistics);
 +  size *= slot_stride;
 +  break;
 default:
assert(!"Invalid query type");
 +  return VK_ERROR_INCOMPATIBLE_DRIVER;
 }

 -   slot_size = sizeof(struct anv_query_pool_slot);
 pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 if (pool == NULL)
 @@ 

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2017-01-10 Thread Ilia Mirkin
ping.

On Thu, Dec 22, 2016 at 11:14 AM, Ilia Mirkin  wrote:
> Ping? Any further comments/feedback/reviews?
>
>
> On Dec 5, 2016 11:22 AM, "Ilia Mirkin"  wrote:
>
> On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:
>>
>>
>> On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:
>>>
>>> The strategy is to just keep n anv_query_pool_slot entries per query
>>> instead of one. The available bit is only valid in the last one.
>>>
>>> Signed-off-by: Ilia Mirkin 
>>> ---
>>>
>>> I think this is in a pretty good state now. I've tested both the direct
>>> and
>>> buffer paths with a hacked up cube application, and I'm seeing
>>> non-ridiculous
>>> values for the various counters, although I haven't 100% verified them
>>> for
>>> accuracy.
>>>
>>> This also implements the hsw/bdw workaround for dividing frag invocations
>>> by 4,
>>> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
>>> values
>>> as expected.
>>>
>>> The cube patch I've been testing with is at
>>> http://paste.debian.net/899374/
>>> You can flip between copying to a buffer and explicit retrieval by
>>> commenting
>>> out the relevant function calls.
>>>
>>>  src/intel/vulkan/anv_device.c  |   2 +-
>>>  src/intel/vulkan/anv_private.h |   4 +
>>>  src/intel/vulkan/anv_query.c   |  99 ++
>>>  src/intel/vulkan/genX_cmd_buffer.c | 260
>>> -
>>>  4 files changed, 308 insertions(+), 57 deletions(-)
>>>
>>>
>>> diff --git a/src/intel/vulkan/anv_device.c
>>> b/src/intel/vulkan/anv_device.c
>>> index 99eb73c..7ad1970 100644
>>> --- a/src/intel/vulkan/anv_device.c
>>> +++ b/src/intel/vulkan/anv_device.c
>>> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>>>.textureCompressionASTC_LDR   = pdevice->info.gen >=
>>> 9,
>>> /* FINISHME CHV */
>>>.textureCompressionBC = true,
>>>.occlusionQueryPrecise= true,
>>> -  .pipelineStatisticsQuery  = false,
>>> +  .pipelineStatisticsQuery  = true,
>>>.fragmentStoresAndAtomics = true,
>>>.shaderTessellationAndGeometryPointSize   = true,
>>>.shaderImageGatherExtended= false,
>>> diff --git a/src/intel/vulkan/anv_private.h
>>> b/src/intel/vulkan/anv_private.h
>>> index 2fc543d..7271609 100644
>>> --- a/src/intel/vulkan/anv_private.h
>>> +++ b/src/intel/vulkan/anv_private.h
>>> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>>> struct anv_subpass   subpasses[0];
>>>  };
>>>
>>> +#define ANV_PIPELINE_STATISTICS_COUNT 11
>>> +
>>>  struct anv_query_pool_slot {
>>> uint64_t begin;
>>> uint64_t end;
>>> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>>>  struct anv_query_pool {
>>> VkQueryType  type;
>>> uint32_t slots;
>>> +   uint32_t pipeline_statistics;
>>> +   uint32_t slot_stride;
>>> struct anv_bobo;
>>>  };
>>>
>>> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
>>> index 293257b..dc00859 100644
>>> --- a/src/intel/vulkan/anv_query.c
>>> +++ b/src/intel/vulkan/anv_query.c
>>> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
>>> ANV_FROM_HANDLE(anv_device, device, _device);
>>> struct anv_query_pool *pool;
>>> VkResult result;
>>> -   uint32_t slot_size;
>>> -   uint64_t size;
>>> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
>>> +   uint32_t slot_stride = 1;
>>> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>>> +   uint32_t pipeline_statistics = 0;
>>>
>>> assert(pCreateInfo->sType ==
>>> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>>>
>>> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
>>> case VK_QUERY_TYPE_TIMESTAMP:
>>>break;
>>> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
>>> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
>>> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
>>> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
>>> +  slot_stride = _mesa_bitcount(pipeline_statistics);
>>> +  size *= slot_stride;
>>> +  break;
>>> default:
>>>assert(!"Invalid query type");
>>> +  return VK_ERROR_INCOMPATIBLE_DRIVER;
>>> }
>>>
>>> -   slot_size = sizeof(struct anv_query_pool_slot);
>>> pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
>>>   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
>>> if (pool == NULL)
>>> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>>>
>>> pool->type = pCreateInfo->queryType;
>>> pool->slots = pCreateInfo->queryCount;
>>> +   pool->pipeline_statistics = pipeline_statistics;
>>> +   pool->slot_stride = slot_stride;
>>>
>>> -   size = 

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2016-12-22 Thread Ilia Mirkin
Ping? Any further comments/feedback/reviews?

On Dec 5, 2016 11:22 AM, "Ilia Mirkin"  wrote:

On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:
>
>
> On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:
>>
>> The strategy is to just keep n anv_query_pool_slot entries per query
>> instead of one. The available bit is only valid in the last one.
>>
>> Signed-off-by: Ilia Mirkin 
>> ---
>>
>> I think this is in a pretty good state now. I've tested both the direct
>> and
>> buffer paths with a hacked up cube application, and I'm seeing
>> non-ridiculous
>> values for the various counters, although I haven't 100% verified them
for
>> accuracy.
>>
>> This also implements the hsw/bdw workaround for dividing frag invocations
>> by 4,
>> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
>> values
>> as expected.
>>
>> The cube patch I've been testing with is at
>> http://paste.debian.net/899374/
>> You can flip between copying to a buffer and explicit retrieval by
>> commenting
>> out the relevant function calls.
>>
>>  src/intel/vulkan/anv_device.c  |   2 +-
>>  src/intel/vulkan/anv_private.h |   4 +
>>  src/intel/vulkan/anv_query.c   |  99 ++
>>  src/intel/vulkan/genX_cmd_buffer.c | 260
>> -
>>  4 files changed, 308 insertions(+), 57 deletions(-)
>>
>>
>> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.
c
>> index 99eb73c..7ad1970 100644
>> --- a/src/intel/vulkan/anv_device.c
>> +++ b/src/intel/vulkan/anv_device.c
>> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>>.textureCompressionASTC_LDR   = pdevice->info.gen >=
9,
>> /* FINISHME CHV */
>>.textureCompressionBC = true,
>>.occlusionQueryPrecise= true,
>> -  .pipelineStatisticsQuery  = false,
>> +  .pipelineStatisticsQuery  = true,
>>.fragmentStoresAndAtomics = true,
>>.shaderTessellationAndGeometryPointSize   = true,
>>.shaderImageGatherExtended= false,
>> diff --git a/src/intel/vulkan/anv_private.h
>> b/src/intel/vulkan/anv_private.h
>> index 2fc543d..7271609 100644
>> --- a/src/intel/vulkan/anv_private.h
>> +++ b/src/intel/vulkan/anv_private.h
>> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>> struct anv_subpass   subpasses[0];
>>  };
>>
>> +#define ANV_PIPELINE_STATISTICS_COUNT 11
>> +
>>  struct anv_query_pool_slot {
>> uint64_t begin;
>> uint64_t end;
>> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>>  struct anv_query_pool {
>> VkQueryType  type;
>> uint32_t slots;
>> +   uint32_t pipeline_statistics;
>> +   uint32_t slot_stride;
>> struct anv_bobo;
>>  };
>>
>> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
>> index 293257b..dc00859 100644
>> --- a/src/intel/vulkan/anv_query.c
>> +++ b/src/intel/vulkan/anv_query.c
>> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
>> ANV_FROM_HANDLE(anv_device, device, _device);
>> struct anv_query_pool *pool;
>> VkResult result;
>> -   uint32_t slot_size;
>> -   uint64_t size;
>> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
>> +   uint32_t slot_stride = 1;
>> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>> +   uint32_t pipeline_statistics = 0;
>>
>> assert(pCreateInfo->sType ==
>> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>>
>> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
>> case VK_QUERY_TYPE_TIMESTAMP:
>>break;
>> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
>> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
>> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
>> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
>> +  slot_stride = _mesa_bitcount(pipeline_statistics);
>> +  size *= slot_stride;
>> +  break;
>> default:
>>assert(!"Invalid query type");
>> +  return VK_ERROR_INCOMPATIBLE_DRIVER;
>> }
>>
>> -   slot_size = sizeof(struct anv_query_pool_slot);
>> pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
>>   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
>> if (pool == NULL)
>> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>>
>> pool->type = pCreateInfo->queryType;
>> pool->slots = pCreateInfo->queryCount;
>> +   pool->pipeline_statistics = pipeline_statistics;
>> +   pool->slot_stride = slot_stride;
>>
>> -   size = pCreateInfo->queryCount * slot_size;
>> result = anv_bo_init_new(>bo, device, size);
>> if (result != VK_SUCCESS)
>>goto fail;
>> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool(
>> vk_free2(>alloc, pAllocator, pool);
>>  

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2016-12-05 Thread Ilia Mirkin
On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg  wrote:
>
>
> On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:
>>
>> The strategy is to just keep n anv_query_pool_slot entries per query
>> instead of one. The available bit is only valid in the last one.
>>
>> Signed-off-by: Ilia Mirkin 
>> ---
>>
>> I think this is in a pretty good state now. I've tested both the direct
>> and
>> buffer paths with a hacked up cube application, and I'm seeing
>> non-ridiculous
>> values for the various counters, although I haven't 100% verified them for
>> accuracy.
>>
>> This also implements the hsw/bdw workaround for dividing frag invocations
>> by 4,
>> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
>> values
>> as expected.
>>
>> The cube patch I've been testing with is at
>> http://paste.debian.net/899374/
>> You can flip between copying to a buffer and explicit retrieval by
>> commenting
>> out the relevant function calls.
>>
>>  src/intel/vulkan/anv_device.c  |   2 +-
>>  src/intel/vulkan/anv_private.h |   4 +
>>  src/intel/vulkan/anv_query.c   |  99 ++
>>  src/intel/vulkan/genX_cmd_buffer.c | 260
>> -
>>  4 files changed, 308 insertions(+), 57 deletions(-)
>>
>>
>> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
>> index 99eb73c..7ad1970 100644
>> --- a/src/intel/vulkan/anv_device.c
>> +++ b/src/intel/vulkan/anv_device.c
>> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>>.textureCompressionASTC_LDR   = pdevice->info.gen >= 9,
>> /* FINISHME CHV */
>>.textureCompressionBC = true,
>>.occlusionQueryPrecise= true,
>> -  .pipelineStatisticsQuery  = false,
>> +  .pipelineStatisticsQuery  = true,
>>.fragmentStoresAndAtomics = true,
>>.shaderTessellationAndGeometryPointSize   = true,
>>.shaderImageGatherExtended= false,
>> diff --git a/src/intel/vulkan/anv_private.h
>> b/src/intel/vulkan/anv_private.h
>> index 2fc543d..7271609 100644
>> --- a/src/intel/vulkan/anv_private.h
>> +++ b/src/intel/vulkan/anv_private.h
>> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>> struct anv_subpass   subpasses[0];
>>  };
>>
>> +#define ANV_PIPELINE_STATISTICS_COUNT 11
>> +
>>  struct anv_query_pool_slot {
>> uint64_t begin;
>> uint64_t end;
>> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>>  struct anv_query_pool {
>> VkQueryType  type;
>> uint32_t slots;
>> +   uint32_t pipeline_statistics;
>> +   uint32_t slot_stride;
>> struct anv_bobo;
>>  };
>>
>> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
>> index 293257b..dc00859 100644
>> --- a/src/intel/vulkan/anv_query.c
>> +++ b/src/intel/vulkan/anv_query.c
>> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
>> ANV_FROM_HANDLE(anv_device, device, _device);
>> struct anv_query_pool *pool;
>> VkResult result;
>> -   uint32_t slot_size;
>> -   uint64_t size;
>> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
>> +   uint32_t slot_stride = 1;
>> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>> +   uint32_t pipeline_statistics = 0;
>>
>> assert(pCreateInfo->sType ==
>> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>>
>> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
>> case VK_QUERY_TYPE_TIMESTAMP:
>>break;
>> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
>> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
>> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
>> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
>> +  slot_stride = _mesa_bitcount(pipeline_statistics);
>> +  size *= slot_stride;
>> +  break;
>> default:
>>assert(!"Invalid query type");
>> +  return VK_ERROR_INCOMPATIBLE_DRIVER;
>> }
>>
>> -   slot_size = sizeof(struct anv_query_pool_slot);
>> pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
>>   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
>> if (pool == NULL)
>> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>>
>> pool->type = pCreateInfo->queryType;
>> pool->slots = pCreateInfo->queryCount;
>> +   pool->pipeline_statistics = pipeline_statistics;
>> +   pool->slot_stride = slot_stride;
>>
>> -   size = pCreateInfo->queryCount * slot_size;
>> result = anv_bo_init_new(>bo, device, size);
>> if (result != VK_SUCCESS)
>>goto fail;
>> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool(
>> vk_free2(>alloc, pAllocator, pool);
>>  }
>>
>> +static void *
>> +store_query_result(void *pData, VkQueryResultFlags flags,
>> +   uint64_t 

Re: [Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2016-12-05 Thread Robert Bragg
On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin  wrote:

> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>
> Signed-off-by: Ilia Mirkin 
> ---
>
> I think this is in a pretty good state now. I've tested both the direct and
> buffer paths with a hacked up cube application, and I'm seeing
> non-ridiculous
> values for the various counters, although I haven't 100% verified them for
> accuracy.
>
> This also implements the hsw/bdw workaround for dividing frag invocations
> by 4,
> copied from hsw_queryobj. I tested this on SKL and it seem to divide the
> values
> as expected.
>
> The cube patch I've been testing with is at http://paste.debian.net/
> 899374/
> You can flip between copying to a buffer and explicit retrieval by
> commenting
> out the relevant function calls.
>
>  src/intel/vulkan/anv_device.c  |   2 +-
>  src/intel/vulkan/anv_private.h |   4 +
>  src/intel/vulkan/anv_query.c   |  99 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 260 ++
> ++-
>  4 files changed, 308 insertions(+), 57 deletions(-)
>

> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index 99eb73c..7ad1970 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>.textureCompressionASTC_LDR   = pdevice->info.gen >= 9,
> /* FINISHME CHV */
>.textureCompressionBC = true,
>.occlusionQueryPrecise= true,
> -  .pipelineStatisticsQuery  = false,
> +  .pipelineStatisticsQuery  = true,
>.fragmentStoresAndAtomics = true,
>.shaderTessellationAndGeometryPointSize   = true,
>.shaderImageGatherExtended= false,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_
> private.h
> index 2fc543d..7271609 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
> struct anv_subpass   subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
> uint64_t begin;
> uint64_t end;
> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
> VkQueryType  type;
> uint32_t slots;
> +   uint32_t pipeline_statistics;
> +   uint32_t slot_stride;
> struct anv_bobo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..dc00859 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
> ANV_FROM_HANDLE(anv_device, device, _device);
> struct anv_query_pool *pool;
> VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
> +   uint32_t slot_stride = 1;
> +   uint64_t size = pCreateInfo->queryCount * slot_size;
> +   uint32_t pipeline_statistics = 0;
>
> assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_
> CREATE_INFO);
>
> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
> case VK_QUERY_TYPE_TIMESTAMP:
>break;
> case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -  return VK_ERROR_INCOMPATIBLE_DRIVER;
> +  pipeline_statistics = pCreateInfo->pipelineStatistics &
> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
> +  slot_stride = _mesa_bitcount(pipeline_statistics);
> +  size *= slot_stride;
> +  break;
> default:
>assert(!"Invalid query type");
> +  return VK_ERROR_INCOMPATIBLE_DRIVER;
> }
>
> -   slot_size = sizeof(struct anv_query_pool_slot);
> pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
>   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
> if (pool == NULL)
> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>
> pool->type = pCreateInfo->queryType;
> pool->slots = pCreateInfo->queryCount;
> +   pool->pipeline_statistics = pipeline_statistics;
> +   pool->slot_stride = slot_stride;
>
> -   size = pCreateInfo->queryCount * slot_size;
> result = anv_bo_init_new(>bo, device, size);
> if (result != VK_SUCCESS)
>goto fail;
> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool(
> vk_free2(>alloc, pAllocator, pool);
>  }
>
> +static void *
> +store_query_result(void *pData, VkQueryResultFlags flags,
> +   uint64_t result, uint64_t available)
> +{
> +   if (flags & VK_QUERY_RESULT_64_BIT) {
> +  uint64_t *dst = pData;
> +  *dst++ = result;
> +  if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> + 

[Mesa-dev] [PATCH] anv: implement pipeline statistics queries

2016-11-27 Thread Ilia Mirkin
The strategy is to just keep n anv_query_pool_slot entries per query
instead of one. The available bit is only valid in the last one.

Signed-off-by: Ilia Mirkin 
---

I think this is in a pretty good state now. I've tested both the direct and
buffer paths with a hacked up cube application, and I'm seeing non-ridiculous
values for the various counters, although I haven't 100% verified them for
accuracy.

This also implements the hsw/bdw workaround for dividing frag invocations by 4,
copied from hsw_queryobj. I tested this on SKL and it seem to divide the values
as expected.

The cube patch I've been testing with is at http://paste.debian.net/899374/
You can flip between copying to a buffer and explicit retrieval by commenting
out the relevant function calls.

 src/intel/vulkan/anv_device.c  |   2 +-
 src/intel/vulkan/anv_private.h |   4 +
 src/intel/vulkan/anv_query.c   |  99 ++
 src/intel/vulkan/genX_cmd_buffer.c | 260 -
 4 files changed, 308 insertions(+), 57 deletions(-)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 99eb73c..7ad1970 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
   .textureCompressionASTC_LDR   = pdevice->info.gen >= 9, /* 
FINISHME CHV */
   .textureCompressionBC = true,
   .occlusionQueryPrecise= true,
-  .pipelineStatisticsQuery  = false,
+  .pipelineStatisticsQuery  = true,
   .fragmentStoresAndAtomics = true,
   .shaderTessellationAndGeometryPointSize   = true,
   .shaderImageGatherExtended= false,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 2fc543d..7271609 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1763,6 +1763,8 @@ struct anv_render_pass {
struct anv_subpass   subpasses[0];
 };
 
+#define ANV_PIPELINE_STATISTICS_COUNT 11
+
 struct anv_query_pool_slot {
uint64_t begin;
uint64_t end;
@@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
 struct anv_query_pool {
VkQueryType  type;
uint32_t slots;
+   uint32_t pipeline_statistics;
+   uint32_t slot_stride;
struct anv_bobo;
 };
 
diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
index 293257b..dc00859 100644
--- a/src/intel/vulkan/anv_query.c
+++ b/src/intel/vulkan/anv_query.c
@@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_query_pool *pool;
VkResult result;
-   uint32_t slot_size;
-   uint64_t size;
+   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
+   uint32_t slot_stride = 1;
+   uint64_t size = pCreateInfo->queryCount * slot_size;
+   uint32_t pipeline_statistics = 0;
 
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
 
@@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
case VK_QUERY_TYPE_TIMESTAMP:
   break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
-  return VK_ERROR_INCOMPATIBLE_DRIVER;
+  pipeline_statistics = pCreateInfo->pipelineStatistics &
+ ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
+  slot_stride = _mesa_bitcount(pipeline_statistics);
+  size *= slot_stride;
+  break;
default:
   assert(!"Invalid query type");
+  return VK_ERROR_INCOMPATIBLE_DRIVER;
}
 
-   slot_size = sizeof(struct anv_query_pool_slot);
pool = vk_alloc2(>alloc, pAllocator, sizeof(*pool), 8,
  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pool == NULL)
@@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
 
pool->type = pCreateInfo->queryType;
pool->slots = pCreateInfo->queryCount;
+   pool->pipeline_statistics = pipeline_statistics;
+   pool->slot_stride = slot_stride;
 
-   size = pCreateInfo->queryCount * slot_size;
result = anv_bo_init_new(>bo, device, size);
if (result != VK_SUCCESS)
   goto fail;
@@ -95,6 +102,27 @@ void anv_DestroyQueryPool(
vk_free2(>alloc, pAllocator, pool);
 }
 
+static void *
+store_query_result(void *pData, VkQueryResultFlags flags,
+   uint64_t result, uint64_t available)
+{
+   if (flags & VK_QUERY_RESULT_64_BIT) {
+  uint64_t *dst = pData;
+  *dst++ = result;
+  if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+ *dst++ = available;
+  return dst;
+   } else {
+  uint32_t *dst = pData;
+  if (result > UINT32_MAX)
+ result = UINT32_MAX;
+  *dst++ = result;
+  if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+ *dst++ = available;
+  return dst;
+   }
+}
+
 VkResult