Boris Brezillon <boris.brezil...@free-electrons.com> writes: > The V3D engine provides several perf counters. > Implement ->get_driver_query_[group_]info() so that these counters are > exposed through the GL_AMD_performance_monitor extension.
Thanks for working on this! I've successfully used it to inform some work I'm doing on 3DMMES. > Signed-off-by: Boris Brezillon <boris.brezil...@free-electrons.com> > --- > src/gallium/drivers/vc4/vc4_context.h | 13 +++ > src/gallium/drivers/vc4/vc4_job.c | 9 +- > src/gallium/drivers/vc4/vc4_query.c | 197 > ++++++++++++++++++++++++++++++++-- > src/gallium/drivers/vc4/vc4_screen.c | 7 ++ > src/gallium/drivers/vc4/vc4_screen.h | 1 + > 5 files changed, 215 insertions(+), 12 deletions(-) > > diff --git a/src/gallium/drivers/vc4/vc4_context.h > b/src/gallium/drivers/vc4/vc4_context.h > index 4a1e4093f1a0..b6d9f041efc7 100644 > --- a/src/gallium/drivers/vc4/vc4_context.h > +++ b/src/gallium/drivers/vc4/vc4_context.h > @@ -309,6 +309,11 @@ struct vc4_job { > struct vc4_job_key key; > }; > > +struct vc4_hwperfmon { > + uint32_t id; > + uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS]; > +}; > + > struct vc4_context { > struct pipe_context base; > > @@ -387,6 +392,8 @@ struct vc4_context { > struct pipe_viewport_state viewport; > struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; > struct vc4_vertexbuf_stateobj vertexbuf; > + > + struct vc4_hwperfmon *perfmon; > /** @} */ > }; > > @@ -444,6 +451,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler) > return (struct vc4_sampler_state *)psampler; > } > > +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, > + unsigned index, > + struct pipe_driver_query_group_info > *info); > +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, > + struct pipe_driver_query_info *info); > + > struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, > void *priv, unsigned flags); > void vc4_draw_init(struct pipe_context *pctx); > diff --git a/src/gallium/drivers/vc4/vc4_job.c > b/src/gallium/drivers/vc4/vc4_job.c > index fb0c5bbc78cf..f75a32565603 100644 > --- a/src/gallium/drivers/vc4/vc4_job.c > +++ b/src/gallium/drivers/vc4/vc4_job.c > @@ -362,7 +362,7 @@ 
vc4_submit_setup_rcl_msaa_surface(struct vc4_job *job, > rsc->writes++; > } > > -#define MAX_CHUNKS 1 > +#define MAX_CHUNKS 2 > > /** > * Submits the job to the kernel and then reinitializes it. > @@ -467,6 +467,13 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job > *job) > submit.uniforms = (uintptr_t)job->uniforms.base; > submit.uniforms_size = cl_offset(&job->uniforms); > > + if (vc4->perfmon && screen->has_extended_cl) { > + chunks[nchunks].perfmon.type = VC4_PERFMON_CHUNK; > + chunks[nchunks].perfmon.id = vc4->perfmon->id; > + chunks[nchunks].perfmon.pad = 0; > + nchunks++; > + } > + > if (nchunks) { > submit.flags |= VC4_SUBMIT_CL_EXTENDED; > submit.cl_chunks = (uintptr_t)chunks; > diff --git a/src/gallium/drivers/vc4/vc4_query.c > b/src/gallium/drivers/vc4/vc4_query.c > index ddf8f8fb0c2c..d6b081bb15d7 100644 > --- a/src/gallium/drivers/vc4/vc4_query.c > +++ b/src/gallium/drivers/vc4/vc4_query.c > @@ -32,49 +32,224 @@ > > struct vc4_query > { > - uint8_t pad; > + unsigned num_queries; > + struct vc4_hwperfmon *hwperfmon; > }; > > +static const char *v3d_counter_names[] = { > + "FEP-valid-primitives-no-rendered-pixels", > + "FEP-valid-primitives-rendered-pixels", > + "FEP-clipped-quads", > + "FEP-valid-quads", > + "TLB-quads-not-passing-stencil-test", > + "TLB-quads-not-passing-z-and-stencil-test", Looks like you missed "TLB-quads-passing-z-and-stencil-test" here. 
> + "TLB-quads-with-zero-coverage", > + "TLB-quads-with-non-zero-coverage", > + "TLB-quads-written-to-color-buffer", > + "PTB-primitives-discarded-outside-viewport", > + "PTB-primitives-need-clipping", > + "PTB-primitives-discared-reversed", > + "QPU-total-idle-clk-cycles", > + "QPU-total-clk-cycles-vertex-coord-shading", > + "QPU-total-clk-cycles-fragment-shading", > + "QPU-total-clk-cycles-executing-valid-instr", > + "QPU-total-clk-cycles-waiting-TMU", > + "QPU-total-clk-cycles-waiting-scoreboard", > + "QPU-total-clk-cycles-waiting-varyings", > + "QPU-total-instr-cache-hit", > + "QPU-total-instr-cache-miss", > + "QPU-total-uniform-cache-hit", > + "QPU-total-uniform-cache-miss", > + "TMU-total-text-quads-processed", > + "TMU-total-text-cache-miss", > + "VPM-total-clk-cycles-VDW-stalled", > + "VPM-total-clk-cycles-VCD-stalled", > + "L2C-total-cache-hit", > + "L2C-total-cache-miss", > +}; It would be great to build some piglit tests if we could. Some easy ones I can think of: - Make sure that rendering a bunch of prims gets us FEP-valid-primitives-rendered-pixels. (Note that it may exceed the number of prims in the draw, when they cross a tile boundary.) - Make sure that rendering prims before starting our query or after ending our query doesn't increment the prims counters. - Make sure that an unscissored glClear(COLOR|DEPTH) spends 0 time in VS/FS, but make sure that drawing does. (I think this would have caught the missing counter.) - Make sure that TMU-total-text-quads-processed is incremented for shaders that do texturing, but not for shaders that don't. 
> +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, > + unsigned index, > + struct pipe_driver_query_group_info > *info) > +{ > + struct vc4_screen *screen = vc4_screen(pscreen); > + > + if (!screen->has_perfmon_ioctl) > + return 0; > + > + if (!info) > + return 1; > + > + if (index > 0) > + return 0; > + > + info->name = "V3D counters"; > + info->max_active_queries = DRM_VC4_MAX_PERF_COUNTERS; > + info->num_queries = ARRAY_SIZE(v3d_counter_names); > + return 1; > +} > + > +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, > + struct pipe_driver_query_info *info) > +{ > + struct vc4_screen *screen = vc4_screen(pscreen); > + > + if (!screen->has_perfmon_ioctl) > + return 0; > + > + if (!info) > + return ARRAY_SIZE(v3d_counter_names); > + > + if (index >= ARRAY_SIZE(v3d_counter_names)) > + return 0; > + > + info->name = v3d_counter_names[index]; > + info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index; > + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; > + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; > + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; Missing initialization of info->group > + return 1; > +} > + > static struct pipe_query * > -vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned > index) > +vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries, > + unsigned *query_types) > { > + struct vc4_context *ctx = vc4_context(pctx); > struct vc4_query *query = calloc(1, sizeof(*query)); > + struct drm_vc4_perfmon_create req; > + struct vc4_hwperfmon *hwperfmon; > + unsigned i, nhwqueries = 0; > + int ret; > + > + if (!query) > + return NULL; > + > + for (i = 0; i < num_queries; i++) { > + if (query_types[i] >= PIPE_QUERY_DRIVER_SPECIFIC) > + nhwqueries++; > + } > + > + /* We can't mix HW and non-HW queries. 
*/ > + if (nhwqueries && nhwqueries != num_queries) > + return NULL; > + > + if (!nhwqueries) > + return (struct pipe_query *)query; > + > + hwperfmon = calloc(1, sizeof(*hwperfmon)); > + if (!hwperfmon) > + goto err_free_query; > + > + for (i = 0; i < num_queries; i++) > + req.events[i] = query_types[i] - PIPE_QUERY_DRIVER_SPECIFIC; > + > + req.ncounters = num_queries; > + ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_CREATE, &req); > + if (ret) > + goto err_free_hwperfmon; > + > + hwperfmon->id = req.id; > + query->hwperfmon = hwperfmon; > + query->num_queries = num_queries; > > /* Note that struct pipe_query isn't actually defined anywhere. */ > return (struct pipe_query *)query; > + > +err_free_hwperfmon: > + free(hwperfmon); > + > +err_free_query: > + free(query); > + > + return NULL; > +} > + > +static struct pipe_query * > +vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned > index) > +{ > + return vc4_create_batch_query(ctx, 1, &query_type); > } > > static void > -vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query) > +vc4_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery) > { > + struct vc4_context *ctx = vc4_context(pctx); > + struct vc4_query *query = (struct vc4_query *)pquery; > + > + if (query->hwperfmon) { > + struct drm_vc4_perfmon_destroy req; > + > + req.id = query->hwperfmon->id; > + vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &req); > + free(query->hwperfmon); > + } > + > free(query); > } > > static boolean > -vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query) > +vc4_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) > { > + struct vc4_query *query = (struct vc4_query *)pquery; > + struct vc4_context *ctx = vc4_context(pctx); > + > + if (!query->hwperfmon) > + return true; > + > + /* Only one perfmon can be activated per context. 
*/ > + if (ctx->perfmon) > + return false; > + > + ctx->perfmon = query->hwperfmon; We need to vc4_flush() before changing ctx->perfmon here and in vc4_end_query(), because you don't want things before the start or after the end to be counted. > return true; > } > > static bool > -vc4_end_query(struct pipe_context *ctx, struct pipe_query *query) > +vc4_end_query(struct pipe_context *pctx, struct pipe_query *pquery) > { > + struct vc4_query *query = (struct vc4_query *)pquery; > + struct vc4_context *ctx = vc4_context(pctx); > + > + if (!query->hwperfmon) > + return true; > + > + if (ctx->perfmon != query->hwperfmon) > + return false; > + > + ctx->perfmon = NULL; > return true; > } > > static boolean > -vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query, > +vc4_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, > boolean wait, union pipe_query_result *vresult) > { > - uint64_t *result = &vresult->u64; > + struct vc4_context *ctx = vc4_context(pctx); > + struct vc4_query *query = (struct vc4_query *)pquery; > + struct drm_vc4_perfmon_get_values req; > + unsigned i; > + int ret; > + > + if (!query->hwperfmon) { > + vresult->u64 = 0; > + return true; > + } > > - *result = 0; > + req.id = query->hwperfmon->id; > + req.values_ptr = (uintptr_t)query->hwperfmon->counters; > + ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &req); > + if (ret) > + return false; Something needs to block before handing us back the results here, or you may just get 0s since the job hasn't completed yet. I think that blocking should be the kernel's responsibility. > + > + for (i = 0; i < query->num_queries; i++) > + vresult[i].u64 = query->hwperfmon->counters[i]; You'll also need to capture the counter values at vc4_begin_query() and return the current values minus those begin values, or reuse of counter objects will end up continually increasing the values returned from the queries.
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev