This adds a new extension to read Intel hardware performance counters through the i915 perf event interface. It is mostly based on the GL_INTEL_performance_query extension, which provides a generic method for accessing performance metrics on Intel GPUs.
Signed-off-by: Zhenyu Wang <[email protected]> --- include/CL/cl_intel.h | 94 +++ src/CMakeLists.txt | 1 + src/cl_api.c | 238 ++++++ src/cl_context.c | 6 + src/cl_context.h | 83 +++ src/intel/intel_defines.h | 1 + src/intel/intel_perf.c | 1814 +++++++++++++++++++++++++++++++++++++++++++++ src/intel_perf.h | 32 + 8 files changed, 2269 insertions(+) create mode 100644 src/intel/intel_perf.c create mode 100644 src/intel_perf.h diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h index 28bcb62..f734eb2 100644 --- a/include/CL/cl_intel.h +++ b/include/CL/cl_intel.h @@ -133,6 +133,100 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)( cl_mem /* Memory Obejct */, int* /* returned fd */); +/* Intel performance query */ + +#define PERFQUERY_COUNTER_EVENT_INTEL 0x94F0 +#define PERFQUERY_COUNTER_DURATION_NORM_INTEL 0x94F1 +#define PERFQUERY_COUNTER_DURATION_RAW_INTEL 0x94F2 +#define PERFQUERY_COUNTER_THROUGHPUT_INTEL 0x94F3 +#define PERFQUERY_COUNTER_RAW_INTEL 0x94F4 +#define PERFQUERY_COUNTER_TIMESTAMP_INTEL 0x94F5 + +#define PERFQUERY_COUNTER_DATA_UINT32_INTEL 0x94F8 +#define PERFQUERY_COUNTER_DATA_UINT64_INTEL 0x94F9 +#define PERFQUERY_COUNTER_DATA_FLOAT_INTEL 0x94FA +#define PERFQUERY_COUNTER_DATA_DOUBLE_INTEL 0x94FB +#define PERFQUERY_COUNTER_DATA_BOOL32_INTEL 0x94FC + +typedef struct perf_query_object *cl_perf_query_intel; + +extern CL_API_ENTRY void CL_API_CALL +clGetFirstPerfQueryIdIntel(cl_context ctx, cl_uint *queryId); +typedef CL_API_ENTRY void (CL_API_CALL *clGetFirstPerfQueryIdIntel_fn)(cl_context ctx, + cl_uint *queryId); + +extern CL_API_ENTRY void CL_API_CALL +clGetNextPerfQueryIdIntel(cl_context ctx, cl_uint queryId, cl_uint *nextQueryId); +typedef CL_API_ENTRY void (CL_API_CALL *clGetNextPerfQueryIdIntel_fn)(cl_context ctx, + cl_uint queryId, + cl_uint *nextQueryId); + +extern CL_API_ENTRY void CL_API_CALL +clGetPerfQueryInfoIntel(cl_context ctx, + cl_uint queryId, + cl_uint queryNameLength, cl_char *queryName, + cl_uint *dataSize, cl_uint *noCounters, + cl_uint *noInstances); +typedef CL_API_ENTRY void (CL_API_CALL *clGetPerfQueryInfoIntel_fn)(cl_context ctx, + cl_uint queryId, + cl_uint queryNameLength, + cl_char *queryName, + cl_uint *dataSize, + cl_uint *noCounters, + cl_uint *noInstances); + +extern CL_API_ENTRY void CL_API_CALL +clGetPerfCounterInfoIntel(cl_context ctx, + cl_uint queryId, cl_uint counterId, + cl_uint counterNameLength, cl_char *counterName, + cl_uint counterDescLength, cl_char *counterDesc, + cl_uint *counterOffset, cl_uint *counterDataSize, + cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum, + cl_ulong *rawCounterMaxValue); +typedef CL_API_ENTRY void +(CL_API_CALL *clGetPerfCounterInfoIntel_fn)(cl_context ctx, + cl_uint queryId, + cl_uint counterId, + cl_uint counterNameLength, cl_char *counterName, + cl_uint counterDescLength, cl_char *counterDesc, + cl_uint *counterOffset, cl_uint *counterDataSize, + cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum, + cl_ulong *rawCounterMaxValue); + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreatePerfQueryIntel(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle); +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clCreatePerfQueryIntel_fn)(cl_context context, + cl_uint queryId, + cl_perf_query_intel *queryHandle); + +extern CL_API_ENTRY cl_int CL_API_CALL +clDeletePerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle);
+typedef CL_API_ENTRY cl_int +(CL_API_CALL *clDeletePerfQueryIntel_fn)(cl_context context, + cl_perf_query_intel queryHandle); + +extern CL_API_ENTRY cl_int CL_API_CALL +clBeginPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle); +typedef CL_API_ENTRY cl_int (CL_API_CALL *clBeginPerfQueryIntel_fn)(cl_context context, + cl_perf_query_intel queryHandle); + +extern CL_API_ENTRY cl_int CL_API_CALL +clEndPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle); +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEndPerfQueryIntel_fn)(cl_context context, + cl_perf_query_intel queryHandle); + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPerfQueryDataIntel(cl_context context, + cl_perf_query_intel queryHandle, + cl_uint flags, size_t dataSize, void *data, + cl_uint *bytesWritten); +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clGetPerfQueryDataIntel_fn)(cl_context context, + cl_perf_query_intel queryHandle, + cl_uint flags, size_t dataSize, void *data, + cl_uint *bytesWritten); + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e67c71..555a988 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,6 +88,7 @@ set(OPENCL_SRC intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c + intel/intel_perf.c performance.c) if (X11_FOUND) diff --git a/src/cl_api.c b/src/cl_api.c index 3e72deb..5d9de28 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -42,6 +42,8 @@ #include "performance.h" +#include "intel_perf.h" + #ifndef CL_VERSION_1_2 #define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) #define CL_DEVICE_TYPE_CUSTOM (1 << 4) @@ -3180,6 +3182,15 @@ internal_clGetExtensionFunctionAddress(const char *func_name) EXTFUNC(clCreateBufferFromLibvaIntel) EXTFUNC(clCreateImageFromLibvaIntel) EXTFUNC(clGetMemObjectFdIntel) + EXTFUNC(clGetFirstPerfQueryIdIntel) + EXTFUNC(clGetNextPerfQueryIdIntel) + EXTFUNC(clGetPerfQueryInfoIntel) + EXTFUNC(clGetPerfCounterInfoIntel) + EXTFUNC(clCreatePerfQueryIntel) + EXTFUNC(clDeletePerfQueryIntel) + EXTFUNC(clBeginPerfQueryIntel) + EXTFUNC(clEndPerfQueryIntel) + EXTFUNC(clGetPerfQueryDataIntel) return NULL; } @@ -3348,3 +3359,230 @@ clGetMemObjectFdIntel(cl_context context, error: return err; } + +/* Intel performance query extension */ +static bool +_check_query_id_valid(cl_context ctx, cl_uint id) +{ + return (id >= ctx->perfquery.n_queries) ? false : true; +} + +static bool +_check_counter_id_valid(cl_context ctx, cl_uint query_id, cl_uint counter_id) +{ + if (!_check_query_id_valid(ctx, query_id)) + return false; + return (counter_id >= ctx->perfquery.queries[query_id].n_counters) ? false : true; +} + + +CL_API_ENTRY void CL_API_CALL +clGetFirstPerfQueryIdIntel(cl_context ctx, cl_uint *queryId) +{ + if (!ctx->perfquery.enable) + return; + + intel_perf_query_first(ctx, queryId); +} + +CL_API_ENTRY void CL_API_CALL +clGetNextPerfQueryIdIntel(cl_context ctx, cl_uint queryId, cl_uint *nextQueryId) +{ + if (!ctx->perfquery.enable) + return; + + if (!_check_query_id_valid(ctx, queryId)) + return; + + intel_perf_query_next(ctx, queryId, nextQueryId); +} + +static void +return_string(cl_char *stringRet, cl_uint stringMaxLen, const cl_char *string) +{ + if (!stringRet) + return; + + strncpy(stringRet, string ? 
string : "", stringMaxLen); + + if (stringMaxLen > 0) + stringRet[stringMaxLen - 1] = '\0'; +} + +CL_API_ENTRY void CL_API_CALL +clGetPerfQueryInfoIntel(cl_context ctx, + cl_uint queryId, + cl_uint queryNameLength, cl_char *queryName, + cl_uint *dataSize, cl_uint *noCounters, + cl_uint *noInstances) +{ + cl_char *name; + cl_uint data_size; + cl_uint no_counter; + cl_uint no_instance; + + if (!ctx->perfquery.enable) + return; + + if (!_check_query_id_valid(ctx, queryId)) + return; + + intel_perf_query_info(ctx, queryId, &name, + &data_size, &no_counter, &no_instance); + return_string(queryName, queryNameLength, name); + + if (dataSize) + *dataSize = data_size; + + if (noCounters) + *noCounters = no_counter; + + if (noInstances) + *noInstances = no_instance; +} + +CL_API_ENTRY void CL_API_CALL +clGetPerfCounterInfoIntel(cl_context ctx, + cl_uint queryId, cl_uint counterId, + cl_uint counterNameLength, cl_char *counterName, + cl_uint counterDescLength, cl_char *counterDesc, + cl_uint *counterOffset, cl_uint *counterDataSize, + cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum, + cl_ulong *rawCounterMaxValue) +{ + cl_char *name; + cl_char *desc; + cl_uint offset; + cl_uint data_size; + cl_uint counter_type; + cl_uint data_type; + cl_ulong raw_max; + + if (!ctx->perfquery.enable) + return; + + if (!_check_counter_id_valid(ctx, queryId, counterId)) + return; + + intel_perf_counter_info(ctx, queryId, counterId, + &name, &desc, &offset, &data_size, + &counter_type, &data_type, + &raw_max); + + return_string(counterName, counterNameLength, name); + return_string(counterDesc, counterDescLength, desc); + if (counterOffset) + *counterOffset = offset; + if (counterDataSize) + *counterDataSize = data_size; + if (counterTypeEnum) + *counterTypeEnum = counter_type; + if (counterDataTypeEnum) + *counterDataTypeEnum = data_type; + if (rawCounterMaxValue) + *rawCounterMaxValue = raw_max; +} + +CL_API_ENTRY cl_int CL_API_CALL +clCreatePerfQueryIntel(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle) +{ + cl_int err = CL_SUCCESS; + cl_perf_query_intel handle; + CHECK_CONTEXT (context); + + if (!context->perfquery.enable) { + err = CL_INVALID_VALUE; + goto error; + } + + if (!_check_query_id_valid(context, queryId)) { + err = CL_INVALID_VALUE; + goto error; + } + + /* current allow 1 instance */ + if (context->perfquery.n_query_instances) { + err = CL_INVALID_VALUE; + goto error; + } + + err = intel_perf_query_create(context, queryId, &handle); + + if (!err && queryHandle) + *queryHandle = handle; + +error: + return err; +} + +CL_API_ENTRY cl_int CL_API_CALL +clDeletePerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle) +{ + cl_int err = CL_SUCCESS; + CHECK_CONTEXT (context); + + if (!context->perfquery.enable) { + err = CL_INVALID_VALUE; + goto error; + } + + err = intel_perf_query_delete(context, queryHandle); + +error: + return err; +} + +CL_API_ENTRY cl_int CL_API_CALL +clBeginPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle) +{ + cl_int err = CL_SUCCESS; + CHECK_CONTEXT(context); + + if (!context->perfquery.enable) { + err = CL_INVALID_VALUE; + goto error; + } + + err = intel_perf_query_begin(context, queryHandle); + +error: + return err; +} + +CL_API_ENTRY cl_int CL_API_CALL +clEndPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle) +{ + cl_int err = CL_SUCCESS; + CHECK_CONTEXT(context); + + if (!context->perfquery.enable) { + err = CL_INVALID_VALUE; + goto error; + } + + err = intel_perf_query_end(context, queryHandle); + +error: + 
return err; +} + +CL_API_ENTRY cl_int CL_API_CALL +clGetPerfQueryDataIntel(cl_context context, + cl_perf_query_intel queryHandle, + cl_uint flags, size_t dataSize, void *data, + cl_uint *bytesWritten) +{ + cl_int err = CL_SUCCESS; + CHECK_CONTEXT(context); + + if (!context->perfquery.enable) { + err = CL_INVALID_VALUE; + goto error; + } + + err = intel_perf_query_get_data(context, queryHandle, flags, + dataSize, data, bytesWritten); + +error: + return err; +} diff --git a/src/cl_context.c b/src/cl_context.c index 0f08e6a..148c7c6 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -32,6 +32,8 @@ #include "CL/cl.h" #include "CL/cl_gl.h" +#include "intel_perf.h" + #include <stdio.h> #include <stdlib.h> #include <stdint.h> @@ -150,6 +152,8 @@ cl_create_context(const cl_context_properties * properties, ctx->pfn_notify = pfn_notify; ctx->user_data = user_data; + intel_perf_query_init(ctx); + exit: if (errcode_ret != NULL) *errcode_ret = err; @@ -196,6 +200,8 @@ cl_context_delete(cl_context ctx) if (atomic_dec(&ctx->ref_n) > 1) return; + intel_perf_query_destroy(ctx); + /* delete the internal programs. */ for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) { if (ctx->internel_kernels[i]) { diff --git a/src/cl_context.h b/src/cl_context.h index 249fed8..fba81e1 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -26,6 +26,7 @@ #include "cl_khr_icd.h" #include <stdint.h> +#include <stdbool.h> #include <pthread.h> /* DRI device created at create context */ @@ -93,6 +94,28 @@ struct _cl_context_prop { }; }; +struct perf_query_counter; +struct perf_oa_counter; + +struct perf_query +{ + const char *name; + struct perf_query_counter *counters; + int n_counters; + size_t data_size; + + /* OA specific */ + int oa_metrics_set; + int oa_format; + struct perf_oa_counter *oa_counters; + int n_oa_counters; +}; + +#define MAX_PERF_QUERIES 2 +#define MAX_PERF_QUERY_COUNTERS 150 +#define MAX_OA_QUERY_COUNTERS 100 +#define MAX_RAW_OA_COUNTERS 62 + #define IS_EGL_CONTEXT(ctx) (ctx->props.gl_type == CL_GL_EGL_DISPLAY) #define EGL_DISP(ctx) (EGLDisplay)(ctx->props.egl_display) #define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context) @@ -127,6 +150,66 @@ struct _cl_context { /* User's callback when error occur in context */ void *user_data; /* A pointer to user supplied data */ + struct { + struct perf_query queries[MAX_PERF_QUERIES]; + int n_queries; + bool enable; + + /* A common OA counter that we want to read directly in several places */ + uint64_t (*read_oa_report_timestamp)(uint32_t *report); + + /* Needed to normalize counters aggregated across all EUs */ + int eu_count; + + /* The i915_oa perf event we open to setup + enable the OA counters */ + int perf_oa_event_fd; + + /* An i915_oa perf event fd gives exclusive access to the OA unit that + * will report counter snapshots for a specific counter set/profile in a + * specific layout/format so we can only start OA queries that are + * compatible with the currently open fd... */ + int perf_oa_metrics_set; + int perf_oa_format; + + /* The mmaped circular buffer for collecting samples from perf */ + uint8_t *perf_oa_mmap_base; + size_t perf_oa_buffer_size; + struct perf_event_mmap_page *perf_oa_mmap_page; + + /* The system's page size */ + unsigned int page_size; + + /* TODO: generalize and split these into an array indexed by the + * query type... 
*/ + int n_active_oa_queries; + + /* The number of queries depending on running OA counters which + * extends beyond brw_end_perf_query() since we need to wait until + * the last MI_RPC command has been written. */ + int n_oa_users; + + /* We also get the gpu to write an ID for snapshots corresponding + * to the beginning and end of a query, but for simplicity these + * IDs use a separate namespace. */ + int next_query_start_report_id; + + /** + * An array of queries whose results haven't yet been assembled based on + * the data in buffer objects. + * + * These may be active, or have already ended. However, the results + * have not been requested. + */ + struct perf_query_object **unresolved; + int unresolved_elements; + int unresolved_array_size; + + /* The total number of query objects so we can relinquish + * our exclusive access to perf if the application deletes + * all of its objects. (NB: We only disable perf while + * there are no active queries) */ + int n_query_instances; + } perfquery; }; /* Implement OpenCL function */ diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h index 1080a91..50b835d 100644 --- a/src/intel/intel_defines.h +++ b/src/intel/intel_defines.h @@ -321,6 +321,7 @@ #define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14) #define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2) +#define MI_REPORT_PERF_COUNT ((0x28 << 23) | (3 - 2)) #define GEN_MAPFILTER_NEAREST 0x0 #define GEN_MAPFILTER_LINEAR 0x1 diff --git a/src/intel/intel_perf.c b/src/intel/intel_perf.c new file mode 100644 index 0000000..65ec2a1 --- /dev/null +++ b/src/intel/intel_perf.c @@ -0,0 +1,1814 @@ +/* + * Copyright 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Author: + * Zhenyu Wang <[email protected]> + */ + +#include <linux/perf_event.h> + +#include <asm/unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <stropts.h> + +#include <limits.h> +#include <errno.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdarg.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <assert.h> +#include <inttypes.h> + +#include "intel_driver.h" +#include "intel_perf.h" + +#include "intel/intel_gpgpu.h" +#include "intel/intel_defines.h" +#include "intel/intel_batchbuffer.h" + +#include "cl_context.h" +#include "cl_command_queue.h" +#include "cl_device_id.h" + +#include "i915_drm.h" + +#define DBG(fmt, args...) fprintf(stderr, fmt, ##args) + +/* Describes how to read one OA counter which might be a raw counter read + * directly from a counter snapshot or could be a higher level counter derived + * from one or more raw counters. + * + * Raw counters will have set ->report_offset to the snapshot offset and have + * an accumulator that can consider counter overflow according to the width of + * that counter. + * + * Higher level counters can currently reference up to 3 other counters + use + * ->config for anything. They don't need an accumulator. + * + * The data type that will be written to *value_out by the read function can + * be determined by ->data_type + */ +struct perf_oa_counter +{ + struct perf_oa_counter *reference0; + struct perf_oa_counter *reference1; + struct perf_oa_counter *reference2; + union { + int report_offset; + int config; + }; + + int accumulator_index; + void (*accumulate)(struct perf_oa_counter *counter, + uint32_t *start, + uint32_t *end, + uint64_t *accumulator); + unsigned int data_type; + void (*read)(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value_out); +}; + +/* A counter that will be advertised and reported to applications */ +struct perf_query_counter +{ + const char *name; + const char *desc; + unsigned int type; + unsigned int data_type; + uint64_t raw_max; + size_t offset; + size_t size; + + union { + struct perf_oa_counter *oa_counter; + uint32_t pipeline_stat_reg; + }; +}; + +struct perf_query_builder +{ + cl_context ctx; + struct perf_query *query; + size_t offset; + int next_accumulator_index; + + int a_offset; + int b_offset; + int c_offset; + + struct perf_oa_counter *gpu_core_clock; +}; + +/** + * i965 representation of a performance query object. + * + * NB: We want to keep this structure relatively lean considering that + * applications may expect to allocate enough objects to be able to + * query around all draw calls in a frame. + */ +struct perf_query_object +{ + const struct perf_query *query; + + /* Use own batch for perf bo */ + intel_batchbuffer_t *batch; + + struct { + + /** + * BO containing OA counter snapshots at query Begin/End time. + */ + drm_intel_bo *bo; + int current_report_id; + + /** + * We collect periodic counter snapshots via perf so we can account + * for counter overflow and this is a pointer into the circular + * perf buffer for collecting snapshots that lie within the begin-end + * bounds of this query. + */ + unsigned int perf_tail; + + /** + * Storage the final accumulated OA counters. + */ + uint64_t accumulator[MAX_RAW_OA_COUNTERS]; + + /** + * false while in the unresolved_elements list, and set to true when + * the final, end MI_RPC snapshot has been accumulated. 
+ */ + bool results_accumulated; + + } oa; +}; + +/* Samples read from the perf circular buffer */ +struct oa_perf_sample { + struct perf_event_header header; + uint32_t raw_size; + uint8_t raw_data[]; +}; +#define MAX_OA_PERF_SAMPLE_SIZE (8 + /* perf_event_header */ \ + 4 + /* raw_size */ \ + 256 + /* raw OA counter snapshot */ \ + 4) /* alignment padding */ + +#define TAKEN(HEAD, TAIL, POT_SIZE) (((HEAD) - (TAIL)) & (POT_SIZE - 1)) + +/* Note: this will equate to 0 when the buffer is exactly full... */ +#define REMAINING(HEAD, TAIL, POT_SIZE) (POT_SIZE - TAKEN (HEAD, TAIL, POT_SIZE)) + +#if defined(__i386__) +#define rmb() __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory") +#define mb() __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory") +#endif + +#if defined(__x86_64__) +#define rmb() __asm__ volatile("lfence" ::: "memory") +#define mb() __asm__ volatile("mfence" ::: "memory") +#endif + +/* TODO: consider using <stdatomic.h> something like: + * + * #define rmb() atomic_thread_fence(memory_order_seq_consume) + * #define mb() atomic_thread_fence(memory_order_seq_cst) + */ + +/* Allow building for a more recent kernel than the system headers + * correspond too... */ +#ifndef PERF_EVENT_IOC_FLUSH +#include <linux/ioctl.h> +#define PERF_EVENT_IOC_FLUSH _IO ('$', 8) +#endif + +#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 + +static inline size_t +pot_align(size_t base, int pot_alignment) +{ + return (base + pot_alignment - 1) & ~(pot_alignment - 1); +} + +/******************************************************************************/ +/** + * Emit an MI_REPORT_PERF_COUNT command packet. + * + * This writes the current OA counter values to buffer. + */ +static void +emit_mi_report_perf_count(cl_context ctx, + struct perf_query_object *obj, + uint32_t offset_in_bytes, + uint32_t report_id) +{ + drm_intel_bo *bo = obj->oa.bo; + + assert(offset_in_bytes % 64 == 0); + + intel_batchbuffer_reset(obj->batch, 512); + + /* Reports apparently don't always get written unless we flush first. */ + /* XXX required? need to call pipe_control function in intel_gpgpu.c */ + // intel_batchbuffer_emit_mi_flush(brw); + + BEGIN_BATCH(obj->batch, 3); + OUT_BATCH(obj->batch, MI_REPORT_PERF_COUNT); + OUT_RELOC(obj->batch, bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + offset_in_bytes); + OUT_BATCH(obj->batch, report_id); + ADVANCE_BATCH(obj->batch); + + intel_batchbuffer_flush(obj->batch); + + /* XXX */ + /* Reports apparently don't always get written unless we flush after. */ + //intel_batchbuffer_emit_mi_flush(brw); +} + +static unsigned int +read_perf_head(struct perf_event_mmap_page *mmap_page) +{ + unsigned int head = (*(volatile uint64_t *)&mmap_page->data_head); + rmb(); + + return head; +} + +static void +write_perf_tail(struct perf_event_mmap_page *mmap_page, + unsigned int tail) +{ + /* Make sure we've finished reading all the sample data we + * we're consuming before updating the tail... */ + mb(); + mmap_page->data_tail = tail; +} + +/* Update the real perf tail pointer according to the query tail that + * is currently furthest behind... 
+ */ +static void +update_perf_tail(cl_context ctx) +{ + unsigned int size = ctx->perfquery.perf_oa_buffer_size; + unsigned int head = read_perf_head(ctx->perfquery.perf_oa_mmap_page); + int straggler_taken = -1; + unsigned int straggler_tail; + int i; + + for (i = 0; i < ctx->perfquery.unresolved_elements; i++) { + struct perf_query_object *obj = ctx->perfquery.unresolved[i]; + int taken; + + if (!obj->oa.bo) + continue; + + taken = TAKEN(head, obj->oa.perf_tail, size); + + if (taken > straggler_taken) { + straggler_taken = taken; + straggler_tail = obj->oa.perf_tail; + } + } + + if (straggler_taken >= 0) + write_perf_tail(ctx->perfquery.perf_oa_mmap_page, straggler_tail); +} + +/** + * Add a query to the global list of "unresolved queries." + * + * Queries are "unresolved" until all the counter snapshots have been + * accumulated via accumulate_oa_snapshots() after the end MI_REPORT_PERF_COUNT + * has landed in query->oa.bo. + */ +static void +add_to_unresolved_query_list(cl_context ctx, + struct perf_query_object *obj) +{ + if (ctx->perfquery.unresolved_elements >= + ctx->perfquery.unresolved_array_size) { + ctx->perfquery.unresolved_array_size *= 1.5; + ctx->perfquery.unresolved = realloc(ctx->perfquery.unresolved, + sizeof(struct perf_query_object *) * + ctx->perfquery.unresolved_array_size); + } + + ctx->perfquery.unresolved[ctx->perfquery.unresolved_elements++] = obj; + + if (obj->oa.bo) + update_perf_tail(ctx); +} + +/** + * Remove a query from the global list of "unresolved queries." once + * the end MI_RPC OA counter snapshot has been accumulated, or when + * discarding unwanted query results. + */ +static void +drop_from_unresolved_query_list(cl_context ctx, + struct perf_query_object *obj) +{ + int i; + + for (i = 0; i < ctx->perfquery.unresolved_elements; i++) { + if (ctx->perfquery.unresolved[i] == obj) { + int last_elt = --ctx->perfquery.unresolved_elements; + + if (i == last_elt) + ctx->perfquery.unresolved[i] = NULL; + else + ctx->perfquery.unresolved[i] = ctx->perfquery.unresolved[last_elt]; + + break; + } + } + + if (obj->oa.bo) + update_perf_tail(ctx); +} + +static uint64_t +read_report_timestamp(cl_context ctx, uint32_t *report) +{ + return ctx->perfquery.read_oa_report_timestamp(report); +} + +/** + * Given pointers to starting and ending OA snapshots, add the deltas for each + * counter to the results. + */ +static void +add_deltas(cl_context ctx, + struct perf_query_object *obj, + uint32_t *start, uint32_t *end) +{ + const struct perf_query *query = obj->query; + int i; + +#if 0 + fprintf(stderr, "Accumulating delta:\n"); + fprintf(stderr, "> Start timestamp = %" PRIu64 "\n", read_report_timestamp(ctx, start)); + fprintf(stderr, "> End timestamp = %" PRIu64 "\n", read_report_timestamp(ctx, end)); +#endif + + for (i = 0; i < query->n_oa_counters; i++) { + struct perf_oa_counter *oa_counter = &query->oa_counters[i]; + //uint64_t pre_accumulate; + + if (!oa_counter->accumulate) + continue; + + //pre_accumulate = query->oa.accumulator[counter->id]; + oa_counter->accumulate(oa_counter, + start, end, + obj->oa.accumulator); +#if 0 + fprintf(stderr, "> Updated %s from %" PRIu64 " to %" PRIu64 "\n", + counter->name, pre_accumulate, + query->oa.accumulator[counter->id]); +#endif + } +} + +/* Handle restarting ioctl if interupted... 
*/ +static int +perf_ioctl(int fd, unsigned long request, void *arg) +{ + int ret; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + return ret; +} + +static bool +inc_n_oa_users(cl_context ctx) +{ + if (ctx->perfquery.n_oa_users == 0 && + perf_ioctl(ctx->perfquery.perf_oa_event_fd, + PERF_EVENT_IOC_ENABLE, 0) < 0) + { + return false; + } + ++ctx->perfquery.n_oa_users; + + return true; +} + +static void +dec_n_oa_users(cl_context ctx) +{ + /* Disabling the i915_oa event will effectively disable the OA + * counters. Note it's important to be sure there are no outstanding + * MI_RPC commands at this point since they could stall the CS + * indefinitely once OACONTROL is disabled. + */ + --ctx->perfquery.n_oa_users; + if (ctx->perfquery.n_oa_users == 0 && + perf_ioctl(ctx->perfquery.perf_oa_event_fd, + PERF_EVENT_IOC_DISABLE, 0) < 0) + { + DBG("WARNING: Error disabling i915_oa perf event: %m\n"); + } +} + +/** + * Accumulate OA counter results from a series of snapshots. + * + * N.B. We write snapshots for the beginning and end of a query into + * query->oa.bo as well as collect periodic snapshots from the Linux + * perf interface. + * + * These periodic snapshots help to ensure we handle counter overflow + * correctly by being frequent enough to ensure we don't miss multiple + * wrap overflows of a counter between snapshots. + */ +static void +accumulate_oa_snapshots(cl_context ctx, + struct perf_query_object *obj) +{ + uint32_t *query_buffer; + uint8_t *data = ctx->perfquery.perf_oa_mmap_base + ctx->perfquery.page_size; + const unsigned int size = ctx->perfquery.perf_oa_buffer_size; + const uint64_t mask = size - 1; + uint64_t head; + uint64_t tail; + uint32_t *start; + uint64_t start_timestamp; + uint32_t *last; + uint32_t *end; + uint64_t end_timestamp; + uint8_t scratch[MAX_OA_PERF_SAMPLE_SIZE]; + + if (perf_ioctl(ctx->perfquery.perf_oa_event_fd, + PERF_EVENT_IOC_FLUSH, 0) < 0) + DBG("Failed to flush outstanding perf events: %m\n"); + + drm_intel_bo_map(obj->oa.bo, false); + query_buffer = obj->oa.bo->virtual; + + start = last = query_buffer; + end = query_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t)); + +#warning "TODO: find a way to report OA errors from the kernel" + /* XXX: Is there anything we can do to handle this gracefully/ + * report the error to the application? 
*/ + if (start[0] != obj->oa.current_report_id) + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + if (end[0] != (obj->oa.current_report_id + 1)) + DBG("Spurious end report id=%"PRIu32"\n", start[0]); + + start_timestamp = read_report_timestamp(ctx, start); + end_timestamp = read_report_timestamp(ctx, end); + + head = read_perf_head(ctx->perfquery.perf_oa_mmap_page); + tail = obj->oa.perf_tail; + + //fprintf(stderr, "Handle event mask = 0x%" PRIx64 + // " head=%" PRIu64 " tail=%" PRIu64 "\n", mask, head, tail); + + while (TAKEN(head, tail, size)) { + const struct perf_event_header *header = + (const struct perf_event_header *)(data + (tail & mask)); + + if (header->size == 0) { + DBG("Spurious header size == 0\n"); + /* XXX: How should we handle this instead of exiting() */ +#warning "FIXME: avoid exit(1) in error condition" + exit(1); + } + + if (header->size > (head - tail)) { + DBG("Spurious header size would overshoot head\n"); + /* XXX: How should we handle this instead of exiting() */ + exit(1); + } + + //fprintf(stderr, "header = %p tail=%" PRIu64 " size=%d\n", + // header, tail, header->size); + + if ((const uint8_t *)header + header->size > data + size) { + int before; + + if (header->size > MAX_OA_PERF_SAMPLE_SIZE) { + DBG("Skipping spurious sample larger than expected\n"); + tail += header->size; + continue; + } + + before = data + size - (const uint8_t *)header; + + memcpy(scratch, header, before); + memcpy(scratch + before, data, header->size - before); + + header = (struct perf_event_header *)scratch; + //fprintf(stderr, "DEBUG: split\n"); + //exit(1); + } + + switch (header->type) { + case PERF_RECORD_LOST: { + struct { + struct perf_event_header header; + uint64_t id; + uint64_t n_lost; + } *lost = (void *)header; + DBG("i915_oa: Lost %" PRIu64 " events\n", lost->n_lost); + break; + } + + case PERF_RECORD_THROTTLE: + DBG("i915_oa: Sampling has been throttled\n"); + break; + + case PERF_RECORD_UNTHROTTLE: + DBG("i915_oa: Sampling has been unthrottled\n"); + break; + + case PERF_RECORD_SAMPLE: { + struct oa_perf_sample *perf_sample = (struct oa_perf_sample *)header; + uint32_t *report = (uint32_t *)perf_sample->raw_data; + uint64_t timestamp = read_report_timestamp(ctx, report); + + if (timestamp >= end_timestamp) + goto end; + + if (timestamp > start_timestamp) { + add_deltas(ctx, obj, last, report); + last = report; + } + + break; + } + + default: + DBG("i915_oa: Spurious header type = %d\n", header->type); + } + + //fprintf(stderr, "Tail += %d\n", header->size); + + tail += header->size; + } + +end: + + add_deltas(ctx, obj, last, end); + + DBG("Marking %p resolved - results gathered\n", obj); + + drm_intel_bo_unmap(obj->oa.bo); + obj->oa.results_accumulated = true; + drop_from_unresolved_query_list(ctx, obj); + dec_n_oa_users(ctx); +} + +/******************************************************************************/ + +static uint64_t +read_file_uint64 (const char *file) +{ + char buf[32]; + int fd, n; + + fd = open(file, 0); + if (fd < 0) + return 0; + n = read(fd, buf, sizeof (buf) - 1); + close(fd); + if (n < 0) + return 0; + + buf[n] = '\0'; + return strtoull(buf, 0, 0); +} + +static uint64_t +lookup_i915_oa_id (void) +{ + return read_file_uint64("/sys/bus/event_source/devices/i915_oa/type"); +} + +static long +perf_event_open (struct perf_event_attr *hw_event, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} + +static bool +open_i915_oa_event(cl_context ctx, + int 
metrics_set, + int report_format, + int period_exponent, + int drm_fd, + uint32_t ctx_id) +{ + struct perf_event_attr attr; + drm_i915_oa_attr_t oa_attr; + int event_fd; + void *mmap_base; + + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = lookup_i915_oa_id(); + + attr.sample_type = PERF_SAMPLE_RAW; + attr.disabled = 1; /* initially off */ + attr.sample_period = 1; + + memset(&oa_attr, 0, sizeof(oa_attr)); + oa_attr.size = sizeof(oa_attr); + + oa_attr.format = report_format; + oa_attr.metrics_set = metrics_set; + oa_attr.timer_exponent = period_exponent; + + oa_attr.single_context = true; + oa_attr.ctx_id = ctx_id; + oa_attr.drm_fd = drm_fd; + + attr.config = (uint64_t)&oa_attr; + + event_fd = perf_event_open(&attr, + -1, /* pid */ + 0, /* cpu */ + -1, /* group fd */ + PERF_FLAG_FD_CLOEXEC); /* flags */ + if (event_fd == -1) { + DBG("Error opening i915_oa perf event: %m\n"); + return false; + } + + /* NB: A read-write mapping ensures the kernel will stop writing data when + * the buffer is full, and will report samples as lost. */ + mmap_base = mmap(NULL, + ctx->perfquery.perf_oa_buffer_size + ctx->perfquery.page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, event_fd, 0); + if (mmap_base == MAP_FAILED) { + DBG("Error mapping circular buffer, %m\n"); + close (event_fd); + return false; + } + + ctx->perfquery.perf_oa_event_fd = event_fd; + ctx->perfquery.perf_oa_mmap_base = mmap_base; + ctx->perfquery.perf_oa_mmap_page = mmap_base; + + ctx->perfquery.perf_oa_metrics_set = metrics_set; + ctx->perfquery.perf_oa_format = report_format; + + return true; +} + +static bool +begin_perf_query(cl_context ctx, + struct perf_query_object *obj) +{ + const struct perf_query *query = obj->query; + intel_driver_t *driver = (intel_driver_t *)ctx->drv; + + /* If the OA counters aren't already on, enable them. */ + if (ctx->perfquery.perf_oa_event_fd == -1) { + uint32_t ctx_id = drm_intel_gem_context_get_context_id(driver->ctx); + int period_exponent; + + /* The timestamp for HSW+ increments every 80ns + * + * The period_exponent gives a sampling period as follows: + * sample_period = 80ns * 2^(period_exponent + 1) + * + * The overflow period for Haswell can be calculated as: + * + * 2^32 / (n_eus * max_gen_freq * 2) + * (E.g. 40 EUs @ 1GHz = ~53ms) + * + * We currently sample every 42 milliseconds... + */ + period_exponent = 18; + + if (!open_i915_oa_event(ctx, + query->oa_metrics_set, + query->oa_format, + period_exponent, + driver->fd, + ctx_id)) + return false; + } else { + /* Opening an i915_oa event fd implies exclusive access to + * the OA unit which will generate counter reports for a + * specific counter set/profile with a specific layout/format + * so we can't begin any OA based queries that require a + * different profile or format unless we get an opportunity + * to close the event fd and open a new one... + */ + if (ctx->perfquery.perf_oa_metrics_set != query->oa_metrics_set || + ctx->perfquery.perf_oa_format != query->oa_format) + { + return false; + } + } + + if (!inc_n_oa_users(ctx)) { + DBG("WARNING: Error enabling i915_oa perf event: %m\n"); + return false; + } + + if (obj->oa.bo) { + drm_intel_bo_unreference(obj->oa.bo); + obj->oa.bo = NULL; + } + + obj->oa.bo = + drm_intel_bo_alloc(driver->bufmgr, "perf. query OA bo", 4096, 64); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed. 
*/ + drm_intel_bo_map(obj->oa.bo, true); + memset((char *) obj->oa.bo->virtual, 0x80, 4096); + drm_intel_bo_unmap(obj->oa.bo); +#endif + + obj->oa.current_report_id = ctx->perfquery.next_query_start_report_id; + ctx->perfquery.next_query_start_report_id += 2; + + /* Take a starting OA counter snapshot. */ + emit_mi_report_perf_count(ctx, obj, 0, + obj->oa.current_report_id); + ++ctx->perfquery.n_active_oa_queries; + + /* Each unresolved query maintains a separate tail pointer into the + * circular perf sample buffer. The real tail pointer in + * perfquery.perf_oa_mmap_page.data_tail will correspond to the query + * tail that is furthest behind. + */ + obj->oa.perf_tail = read_perf_head(ctx->perfquery.perf_oa_mmap_page); + + memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator)); + obj->oa.results_accumulated = false; + + add_to_unresolved_query_list(ctx, obj); + + return true; +} + +static void +end_perf_query(cl_context ctx, + struct perf_query_object *obj) +{ + /* Take an ending OA counter snapshot. */ + emit_mi_report_perf_count(ctx, obj, + SECOND_SNAPSHOT_OFFSET_IN_BYTES, + obj->oa.current_report_id + 1); + --ctx->perfquery.n_active_oa_queries; + + /* NB: even though the query has now ended, it can't be resolved + * until the end MI_REPORT_PERF_COUNT snapshot has been written + * to query->oa.bo */ +} + +static void +wait_perf_query(cl_context ctx, + struct perf_query_object *obj) +{ + drm_intel_bo *bo = NULL; + + bo = obj->oa.bo; + if (bo == NULL) + return; + + /* If the current batch references our results bo then we need to + * flush first... */ + if (drm_intel_bo_references(obj->batch->buffer, bo)) + intel_batchbuffer_flush(obj->batch); + +#if 0 /* XXX */ + if (drm_intel_bo_busy(bo)) + perf_debug("Stalling GPU waiting for a performance query object.\n"); +#endif + + drm_intel_bo_wait_rendering(bo); +} + +/** + * Is a performance query result available? + */ +static bool +is_perf_query_ready(cl_context ctx, + struct perf_query_object *obj) +{ + return (obj->oa.results_accumulated || + (obj->oa.bo && + !drm_intel_bo_references(obj->batch->buffer, obj->oa.bo) && + !drm_intel_bo_busy(obj->oa.bo))); +} + + +/******************************************************************************/ + +/* Type safe wrappers for reading OA counter values */ + +static uint64_t +read_uint64_oa_counter(struct perf_oa_counter *counter, uint64_t *accumulated) +{ + uint64_t value; + + assert(counter->data_type == PERFQUERY_COUNTER_DATA_UINT64_INTEL); + + counter->read(counter, accumulated, &value); + + return value; +} + +static float +read_float_oa_counter(struct perf_oa_counter *counter, uint64_t *accumulated) +{ + float value; + + assert(counter->data_type == PERFQUERY_COUNTER_DATA_FLOAT_INTEL); + + counter->read(counter, accumulated, &value); + + return value; +} + +/******************************************************************************/ + +/* + * OA counter normalisation support... + */ + +static void +read_accumulated_oa_counter_cb(struct perf_oa_counter *counter, + uint64_t *accumulator, + void *value) +{ + *((uint64_t *)value) = accumulator[counter->accumulator_index]; +} + +static void +accumulate_uint32_cb(struct perf_oa_counter *counter, + uint32_t *report0, + uint32_t *report1, + uint64_t *accumulator) +{ + accumulator[counter->accumulator_index] += + (uint32_t)(report1[counter->report_offset] - + report0[counter->report_offset]); +} + +#if 0 +/* XXX: we should factor this out for now, but notably BDW has 40bit counters... 
*/ +static void +accumulate_uint40_cb(struct perf_oa_counter *counter, + uint32_t *report0, + uint32_t *report1, + uint64_t *accumulator) +{ + uint32_t value0 = report0[counter->report_offset]; + uint32_t value1 = report1[counter->report_offset]; + uint64_t delta; + + if (value0 > value1) + delta = (1ULL << 40) + value1 - value0; + else + delta = value1 - value0; + + accumulator[counter->accumulator_index] += delta; +} +#endif + +static struct perf_oa_counter * +add_raw_oa_counter(struct perf_query_builder *builder, int report_offset) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->report_offset = report_offset; + counter->accumulator_index = builder->next_accumulator_index++; + counter->accumulate = accumulate_uint32_cb; + counter->read = read_accumulated_oa_counter_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static uint64_t +hsw_read_report_timestamp(uint32_t *report) +{ + /* The least significant timestamp bit represents 80ns on Haswell */ + return ((uint64_t)report[1]) * 80; +} + +static void +accumulate_hsw_elapsed_cb(struct perf_oa_counter *counter, + uint32_t *report0, + uint32_t *report1, + uint64_t *accumulator) +{ + uint64_t timestamp0 = hsw_read_report_timestamp(report0); + uint64_t timestamp1 = hsw_read_report_timestamp(report1); + + accumulator[counter->accumulator_index] += (timestamp1 - timestamp0); +} + +static struct perf_oa_counter * +add_hsw_elapsed_oa_counter(struct perf_query_builder *builder) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->accumulator_index = builder->next_accumulator_index++; + counter->accumulate = accumulate_hsw_elapsed_cb; + counter->read = read_accumulated_oa_counter_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static void +read_frequency_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* uint64 */ +{ + uint64_t clk_delta = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t time_delta = read_uint64_oa_counter(counter->reference1, accumulated); + uint64_t *ret = value; + + if (!clk_delta) { + *ret = 0; + return; + } + + *ret = (clk_delta * 1000) / time_delta; +} + +static struct perf_oa_counter * +add_avg_frequency_oa_counter(struct perf_query_builder *builder, + struct perf_oa_counter *timestamp) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + assert(timestamp->data_type == PERFQUERY_COUNTER_DATA_UINT64_INTEL); + + counter->reference0 = builder->gpu_core_clock; + counter->reference1 = timestamp; + counter->read = read_frequency_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static void +read_oa_counter_normalized_by_gpu_duration_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* float */ +{ + uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated); + float *ret = value; + + if (!clk_delta) { + *ret = 0; + return; + } + + *ret = ((double)delta * 100.0) / (double)clk_delta; +} + +static struct perf_oa_counter * +add_oa_counter_normalised_by_gpu_duration(struct perf_query_builder *builder, + struct perf_oa_counter *raw) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = raw; + 
counter->reference1 = builder->gpu_core_clock; + counter->read = read_oa_counter_normalized_by_gpu_duration_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + + return counter; +} + +static void +read_hsw_samplers_busy_duration_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* float */ +{ + uint64_t sampler0_busy = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t sampler1_busy = read_uint64_oa_counter(counter->reference1, accumulated); + uint64_t clk_delta = read_uint64_oa_counter(counter->reference2, accumulated); + float *ret = value; + + if (!clk_delta) { + *ret = 0; + return; + } + + *ret = ((double)(sampler0_busy + sampler1_busy) * 100.0) / ((double)clk_delta * 2.0); +} + +static struct perf_oa_counter * +add_hsw_samplers_busy_duration_oa_counter(struct perf_query_builder *builder, + struct perf_oa_counter *sampler0_busy_raw, + struct perf_oa_counter *sampler1_busy_raw) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = sampler0_busy_raw; + counter->reference1 = sampler1_busy_raw; + counter->reference2 = builder->gpu_core_clock; + counter->read = read_hsw_samplers_busy_duration_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + + return counter; +} + +static void +read_hsw_slice_extrapolated_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* float */ +{ + uint64_t counter0 = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t counter1 = read_uint64_oa_counter(counter->reference1, accumulated); + int eu_count = counter->config; + uint64_t *ret = value; + + *ret = (counter0 + counter1) * eu_count; +} + +static struct perf_oa_counter * +add_hsw_slice_extrapolated_oa_counter(struct perf_query_builder *builder, + struct perf_oa_counter *counter0, + struct perf_oa_counter *counter1) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = counter0; + counter->reference1 = counter1; + counter->config = builder->ctx->perfquery.eu_count; + counter->read = read_hsw_slice_extrapolated_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static void +read_oa_counter_normalized_by_eu_duration_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* float */ +{ + uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated); + float *ret = value; + + if (!clk_delta) { + *ret = 0; + return; + } + + delta /= counter->config; /* EU count */ + + *ret = (double)delta * 100.0 / (double)clk_delta; +} + +static struct perf_oa_counter * +add_oa_counter_normalised_by_eu_duration(struct perf_query_builder *builder, + struct perf_oa_counter *raw) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = raw; + counter->reference1 = builder->gpu_core_clock; + counter->config = builder->ctx->perfquery.eu_count; + counter->read = read_oa_counter_normalized_by_eu_duration_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + + return counter; +} + +static void +read_av_thread_cycles_counter_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* uint64 */ +{ + uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t spawned = 
read_uint64_oa_counter(counter->reference1, accumulated); + uint64_t *ret = value; + + if (!spawned) { + *ret = 0; + return; + } + + *ret = delta / spawned; +} + +static struct perf_oa_counter * +add_average_thread_cycles_oa_counter(struct perf_query_builder *builder, + struct perf_oa_counter *raw, + struct perf_oa_counter *denominator) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = raw; + counter->reference1 = denominator; + counter->read = read_av_thread_cycles_counter_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static void +read_scaled_uint64_counter_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* uint64 */ +{ + uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated); + uint64_t scale = counter->config; + uint64_t *ret = value; + + *ret = delta * scale; +} + +static struct perf_oa_counter * +add_scaled_uint64_oa_counter(struct perf_query_builder *builder, + struct perf_oa_counter *input, + int scale) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = input; + counter->config = scale; + counter->read = read_scaled_uint64_counter_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + + return counter; +} + +static void +read_max_of_float_counters_cb(struct perf_oa_counter *counter, + uint64_t *accumulated, + void *value) /* float */ +{ + float counter0 = read_float_oa_counter(counter->reference0, accumulated); + float counter1 = read_float_oa_counter(counter->reference1, accumulated); + float *ret = value; + + *ret = counter0 >= counter1 ? counter0 : counter1; +} + + +static struct perf_oa_counter * +add_max_of_float_oa_counters(struct perf_query_builder *builder, + struct perf_oa_counter *counter0, + struct perf_oa_counter *counter1) +{ + struct perf_oa_counter *counter = + &builder->query->oa_counters[builder->query->n_oa_counters++]; + + counter->reference0 = counter0; + counter->reference1 = counter1; + counter->read = read_max_of_float_counters_cb; + counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + + return counter; +} + +static void +report_uint64_oa_counter_as_raw_uint64(struct perf_query_builder *builder, + const char *name, + const char *desc, + struct perf_oa_counter *oa_counter) +{ + struct perf_query_counter *counter = + &builder->query->counters[builder->query->n_counters++]; + + counter->oa_counter = oa_counter; + counter->name = name; + counter->desc = desc; + counter->type = PERFQUERY_COUNTER_RAW_INTEL; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + counter->raw_max = 0; /* undefined range */ + counter->offset = pot_align(builder->offset, 8); + counter->size = sizeof(uint64_t); + + builder->offset = counter->offset + counter->size; +} + +static void +report_uint64_oa_counter_as_uint64_event(struct perf_query_builder *builder, + const char *name, + const char *desc, + struct perf_oa_counter *oa_counter) +{ + struct perf_query_counter *counter = + &builder->query->counters[builder->query->n_counters++]; + + counter->oa_counter = oa_counter; + counter->name = name; + counter->desc = desc; + counter->type = PERFQUERY_COUNTER_EVENT_INTEL; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + counter->offset = pot_align(builder->offset, 8); + counter->size = sizeof(uint64_t); + + builder->offset = counter->offset + counter->size; +} + +static void 
+report_float_oa_counter_as_percentage_duration(struct perf_query_builder *builder, + const char *name, + const char *desc, + struct perf_oa_counter *oa_counter) +{ + struct perf_query_counter *counter = + &builder->query->counters[builder->query->n_counters++]; + + counter->oa_counter = oa_counter; + counter->name = name; + counter->desc = desc; + counter->type = PERFQUERY_COUNTER_DURATION_RAW_INTEL; + counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + counter->raw_max = 100; + counter->offset = pot_align(builder->offset, 4); + counter->size = sizeof(float); + + builder->offset = counter->offset + counter->size; +} + +static void +report_uint64_oa_counter_as_throughput(struct perf_query_builder *builder, + const char *name, + const char *desc, + struct perf_oa_counter *oa_counter) +{ + struct perf_query_counter *counter = + &builder->query->counters[builder->query->n_counters++]; + + counter->oa_counter = oa_counter; + counter->name = name; + counter->desc = desc; + counter->type = PERFQUERY_COUNTER_THROUGHPUT_INTEL; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + counter->offset = pot_align(builder->offset, 8); + counter->size = sizeof(uint64_t); + + builder->offset = counter->offset + counter->size; +} + +static void +report_uint64_oa_counter_as_duration(struct perf_query_builder *builder, + const char *name, + const char *desc, + struct perf_oa_counter *oa_counter) +{ + struct perf_query_counter *counter = + &builder->query->counters[builder->query->n_counters++]; + + counter->oa_counter = oa_counter; + counter->name = name; + counter->desc = desc; + counter->type = PERFQUERY_COUNTER_DURATION_RAW_INTEL; + counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL; + counter->raw_max = 0; + counter->offset = pot_align(builder->offset, 8); + counter->size = sizeof(uint64_t); + + builder->offset = counter->offset + counter->size; +} + +static void +add_aggregate_counters(struct perf_query_builder *builder) +{ + struct perf_oa_counter *raw; + struct perf_oa_counter *active, *stall, *n_threads; + struct perf_oa_counter *c; + int a_offset = builder->a_offset; + int aggregate_active_counter = a_offset + 17; /* aggregate active */ + int aggregate_stall_counter = a_offset + 18; /* aggregate stall */ + int n_threads_counter = a_offset + 20; + + raw = add_raw_oa_counter(builder, a_offset + 41); + c = add_oa_counter_normalised_by_gpu_duration(builder, raw); + report_float_oa_counter_as_percentage_duration(builder, + "GPU Busy", + "The percentage of time in which the GPU has being processing GPU commands.", + c); + + raw = add_raw_oa_counter(builder, a_offset); /* aggregate EU active */ + c = add_oa_counter_normalised_by_eu_duration(builder, raw); + report_float_oa_counter_as_percentage_duration(builder, + "EU Active", + "The percentage of time in which the Execution Units were actively processing.", + c); + + raw = add_raw_oa_counter(builder, a_offset + 1); /* aggregate EU stall */ + c = add_oa_counter_normalised_by_eu_duration(builder, raw); + report_float_oa_counter_as_percentage_duration(builder, + "EU Stall", + "The percentage of time in which the Execution Units were stalled.", + c); + + + active = add_raw_oa_counter(builder, aggregate_active_counter); + c = add_oa_counter_normalised_by_eu_duration(builder, active); + report_float_oa_counter_as_percentage_duration(builder, + "CS EU Active", + "The percentage of time in which compute shader were " + "processed actively on the EUs.", + c); + + stall = add_raw_oa_counter(builder, aggregate_stall_counter); + c = 
add_oa_counter_normalised_by_eu_duration(builder, stall); + report_float_oa_counter_as_percentage_duration(builder, + "CS EU Stall", + "The percentage of time in which compute shader were " + "stalled on the EUs.", + c); + + + n_threads = add_raw_oa_counter(builder, n_threads_counter); + c = add_average_thread_cycles_oa_counter(builder, active, n_threads); + report_uint64_oa_counter_as_raw_uint64(builder, + "CS AVG Active per Thread", + "The average number of cycles per hardware " + "thread run in which compute shader were processed actively " + "on the EUs.", + c); + + c = add_average_thread_cycles_oa_counter(builder, stall, n_threads); + report_uint64_oa_counter_as_raw_uint64(builder, + "CS AVG Stall per Thread", + "The average number of cycles per hardware " + "thread run in which compute shader were stalled " + "on the EUs.", + c); + + + #if 0 + raw = add_raw_oa_counter(builder, a_offset + 32); /* hiz fast z passing */ + raw = add_raw_oa_counter(builder, a_offset + 33); /* hiz fast z failing */ + + raw = add_raw_oa_counter(builder, a_offset + 42); /* vs bottleneck */ + raw = add_raw_oa_counter(builder, a_offset + 43); /* gs bottleneck */ + #endif +} + +static void +hsw_add_compute_counters(struct perf_query_builder *builder) +{ + struct perf_oa_counter *raw0; + struct perf_oa_counter *raw1; + struct perf_oa_counter *typed_read; + struct perf_oa_counter *typed_write; + struct perf_oa_counter *typed_atomics; + struct perf_oa_counter *untyped_read; + struct perf_oa_counter *untyped_write; + struct perf_oa_counter *slm_read; + struct perf_oa_counter *slm_write; + + raw0 = add_raw_oa_counter(builder, 0xd0>>2); + raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + raw1 = add_raw_oa_counter(builder, 0xd4>>2); + raw1 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + typed_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1); + report_uint64_oa_counter_as_raw_uint64(builder, + "TYPED_BYTES_READ", + "TYPED_BYTES_READ", + typed_read); + + raw0 = add_raw_oa_counter(builder, 0xd8>>2); + raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + raw1 = add_raw_oa_counter(builder, 0xdc>>2); + raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64); + + typed_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1); + report_uint64_oa_counter_as_raw_uint64(builder, + "TYPED_BYTES_WRITTEN", + "TYPED_BYTES_WRITTEN", + typed_write); + + raw0 = add_raw_oa_counter(builder, 0xc0>>2); + raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + raw1 = add_raw_oa_counter(builder, 0xc4>>2); + raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64); + + untyped_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1); + report_uint64_oa_counter_as_raw_uint64(builder, + "UNTYPED_BYTES_READ", + "UNTYPED_BYTES_READ", + untyped_read); + + raw0 = add_raw_oa_counter(builder, 0xc8>>2); + raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + raw1 = add_raw_oa_counter(builder, 0xcc>>2); + raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64); + + untyped_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1); + report_uint64_oa_counter_as_raw_uint64(builder, + "UNTYPED_BYTES_WRITTEN", + "UNTYPED_BYTES_WRITTEN", + untyped_write); + + raw0 = add_raw_oa_counter(builder, 0xf8>>2); + raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64); + + raw1 = add_raw_oa_counter(builder, 0xfc>>2); + raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64); + + slm_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1); + 
+  report_uint64_oa_counter_as_raw_uint64(builder,
+                                         "SLM_BYTES_READ",
+                                         "SLM_BYTES_READ",
+                                         slm_read);
+
+  raw0 = add_raw_oa_counter(builder, 0xf0>>2);
+  raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+  raw1 = add_raw_oa_counter(builder, 0xf4>>2);
+  raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+  slm_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+  report_uint64_oa_counter_as_raw_uint64(builder,
+                                         "SLM_BYTES_WRITTEN",
+                                         "SLM_BYTES_WRITTEN",
+                                         slm_write);
+
+  raw0 = add_raw_oa_counter(builder, 0xe0>>2);
+  raw1 = add_raw_oa_counter(builder, 0xe4>>2);
+  typed_atomics = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+  report_uint64_oa_counter_as_raw_uint64(builder,
+                                         "TYPED_ATOMICS",
+                                         "TYPED_ATOMICS",
+                                         typed_atomics);
+}
+
+
+static void
+hsw_add_compute_basic_oa_counter_query(cl_context ctx)
+{
+  struct perf_query_builder builder;
+  struct perf_query *query = &ctx->perfquery.queries[ctx->perfquery.n_queries++];
+  int a_offset;
+  int b_offset;
+  int c_offset;
+  struct perf_oa_counter *elapsed;
+  struct perf_oa_counter *c;
+  struct perf_query_counter *last;
+
+  query->name = "Gen7 Compute Basic Observability Architecture Counters";
+  query->counters = malloc(sizeof(struct perf_query_counter) *
+                           MAX_PERF_QUERY_COUNTERS);
+  query->n_counters = 0;
+  query->oa_counters = malloc(sizeof(struct perf_oa_counter) *
+                              MAX_OA_QUERY_COUNTERS);
+  query->n_oa_counters = 0;
+  query->oa_metrics_set = I915_OA_METRICS_SET_COMPUTE;
+  query->oa_format = I915_OA_FORMAT_A45_B8_C8_HSW;
+  // query->perf_raw_size = 256; /* XXX */
+
+  builder.ctx = ctx;
+  builder.query = query;
+  builder.offset = 0;
+  builder.next_accumulator_index = 0;
+
+  builder.a_offset = a_offset = 3;
+  builder.b_offset = b_offset = a_offset + 45;
+  builder.c_offset = c_offset = b_offset + 8;
+
+  /* Can be referenced by other counters... */
+  builder.gpu_core_clock = add_raw_oa_counter(&builder, c_offset + 2);
+
+  elapsed = add_hsw_elapsed_oa_counter(&builder);
+  report_uint64_oa_counter_as_duration(&builder,
+                                       "GPU Time Elapsed",
+                                       "Time elapsed on the GPU during the measurement.",
+                                       elapsed);
+
+  c = add_avg_frequency_oa_counter(&builder, elapsed);
+  report_uint64_oa_counter_as_uint64_event(&builder,
+                                           "AVG GPU Core Frequency",
+                                           "Average GPU Core Frequency in the measurement.",
+                                           c);
+
+  add_aggregate_counters(&builder);
+
+  hsw_add_compute_counters(&builder);
+
+  assert(query->n_counters < MAX_PERF_QUERY_COUNTERS);
+  assert(query->n_oa_counters < MAX_OA_QUERY_COUNTERS);
+
+  last = &query->counters[query->n_counters - 1];
+  query->data_size = last->offset + last->size;
+}
+
+static int
+get_oa_counter_data(cl_context ctx,
+                    struct perf_query_object *obj,
+                    size_t data_size,
+                    uint8_t *data)
+{
+  const struct perf_query *query = obj->query;
+  int n_counters = query->n_counters;
+  int written = 0, i;
+
+  if (!obj->oa.results_accumulated) {
+    accumulate_oa_snapshots(ctx, obj);
+    assert(obj->oa.results_accumulated);
+  }
+
+  for (i = 0; i < n_counters; i++) {
+    const struct perf_query_counter *counter = &query->counters[i];
+
+    if (counter->size) {
+      counter->oa_counter->read(counter->oa_counter, obj->oa.accumulator,
+                                data + counter->offset);
+      written = counter->offset + counter->size;
+    }
+  }
+
+  return written;
+}
+
+/**
+ * Get the performance query result.
+ */
+static void
+get_perf_query_data(cl_context ctx,
+                    struct perf_query_object *obj,
+                    size_t data_size,
+                    cl_uint *data,
+                    cl_uint *bytes_written)
+{
+  int written = 0;
+
+  //assert(is_perf_query_ready(ctx, queue, obj));
+
+  /* XXX handle in flags */
+  wait_perf_query(ctx, obj);
+
+  written = get_oa_counter_data(ctx, obj, data_size, (uint8_t *)data);
+
+  if (bytes_written)
+    *bytes_written = written;
+}
+
+static void
+close_perf(cl_context ctx)
+{
+  if (ctx->perfquery.perf_oa_event_fd != -1) {
+    if (ctx->perfquery.perf_oa_mmap_base) {
+      size_t mapping_len =
+        ctx->perfquery.perf_oa_buffer_size + ctx->perfquery.page_size;
+
+      munmap(ctx->perfquery.perf_oa_mmap_base, mapping_len);
+      ctx->perfquery.perf_oa_mmap_base = NULL;
+    }
+
+    close(ctx->perfquery.perf_oa_event_fd);
+    ctx->perfquery.perf_oa_event_fd = -1;
+  }
+}
+
+void
+intel_perf_query_first(cl_context ctx, cl_uint *queryId)
+{
+  *queryId = 0;
+}
+
+void
+intel_perf_query_next(cl_context ctx, cl_uint queryId, cl_uint *nextId)
+{
+  if (queryId < ctx->perfquery.n_queries - 1)
+    *nextId = queryId + 1;
+  else
+    *nextId = queryId;
+}
+
+void
+intel_perf_query_info(cl_context ctx,
+                      cl_uint queryId,
+                      cl_char **queryName,
+                      cl_uint *dataSize,
+                      cl_uint *noCounters,
+                      cl_uint *noInstances)
+{
+  struct perf_query *query = &ctx->perfquery.queries[queryId];
+
+  *queryName = query->name;
+  *dataSize = query->data_size;
+  *noCounters = query->n_counters;
+  *noInstances = 1; /* the current implementation allows 1 instance */
+}
+
+void
+intel_perf_counter_info(cl_context ctx,
+                        cl_uint queryId, cl_uint counterId,
+                        cl_char **counterName,
+                        cl_char **counterDesc,
+                        cl_uint *counterOffset, cl_uint *counterDataSize,
+                        cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+                        cl_ulong *rawCounterMaxValue)
+{
+  struct perf_query *query = &ctx->perfquery.queries[queryId];
+  struct perf_query_counter *counter = &query->counters[counterId];
+
+  *counterName = counter->name;
+  *counterDesc = counter->desc;
+  *counterOffset = counter->offset;
+  *counterDataSize = counter->size;
+  *counterTypeEnum = counter->type;
+  *counterDataTypeEnum = counter->data_type;
+  *rawCounterMaxValue = counter->raw_max;
+}
+
+cl_int
+intel_perf_query_create(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle)
+{
+  struct perf_query *query = &context->perfquery.queries[queryId];
+  struct perf_query_object *obj =
+    calloc(1, sizeof(struct perf_query_object));
+
+  if (!obj)
+    return -1;
+
+  obj->query = query;
+
+  obj->batch = intel_batchbuffer_new((intel_driver_t *)context->drv);
+  if (!obj->batch) {
+    fprintf(stderr, "failed to create perf batch\n");
+    free(obj);
+    return -1;
+  }
+
+  context->perfquery.n_query_instances++;
+
+  *queryHandle = obj;
+
+  return 0;
+}
+
+cl_int
+intel_perf_query_delete(cl_context context, cl_perf_query_intel queryHandle)
+{
+  struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+  if (obj->oa.bo) {
+    if (!obj->oa.results_accumulated) {
+      drop_from_unresolved_query_list(context, obj);
+      dec_n_oa_users(context);
+    }
+
+    drm_intel_bo_unreference(obj->oa.bo);
+    obj->oa.bo = NULL;
+  }
+
+  obj->oa.results_accumulated = false;
+
+  intel_batchbuffer_delete(obj->batch);
+
+  free(obj);
+
+  if (--context->perfquery.n_query_instances == 0)
+    close_perf(context);
+
+  return 0;
+}
+
+cl_int
+intel_perf_query_begin(cl_context context, cl_perf_query_intel queryHandle)
+{
+  bool ret;
+  struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+  ret = begin_perf_query(context, obj);
+  if (!ret)
+    return -1;
+
+  return 0;
+}
+
+cl_int
+intel_perf_query_end(cl_context context, cl_perf_query_intel queryHandle)
+{
+  struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+  end_perf_query(context, obj);
+  return 0;
+}
+
+cl_int
+intel_perf_query_get_data(cl_context context,
+                          cl_perf_query_intel queryHandle,
+                          cl_uint flags, size_t dataSize, void *data,
+                          cl_uint *bytesWritten)
+{
+  struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+  /* XXX flags? */
+  get_perf_query_data(context, obj,
+                      dataSize, data,
+                      bytesWritten);
+
+  return 0;
+}
+
+
+void
+intel_perf_query_init(cl_context context)
+{
+  intel_driver_t *drv = (intel_driver_t *)context->drv;
+
+  if (!IS_HASWELL(drv->device_id)) {
+    fprintf(stderr, "Perf query is only supported on HSW now.\n");
+    context->perfquery.enable = false;
+    return;
+  }
+
+  /* XXX test whether the kernel's i915 PMU driver is available */
+  if (lookup_i915_oa_id() == 0) {
+    fprintf(stderr, "Kernel has no i915 PMU driver.\n");
+    context->perfquery.enable = false;
+    return;
+  }
+
+  context->perfquery.eu_count = context->device->max_compute_unit;
+
+  context->perfquery.read_oa_report_timestamp = hsw_read_report_timestamp;
+
+  /* initialize intel query structs depending on the device */
+  hsw_add_compute_basic_oa_counter_query(context);
+
+  context->perfquery.unresolved = calloc(2, sizeof(struct perf_query_object *));
+  context->perfquery.unresolved_elements = 0;
+  context->perfquery.unresolved_array_size = 2;
+
+  context->perfquery.page_size = sysconf(_SC_PAGE_SIZE);
+
+  context->perfquery.perf_oa_event_fd = -1;
+  context->perfquery.perf_oa_buffer_size = 1024 * 1024; /* NB: must be power of two */
+
+  context->perfquery.next_query_start_report_id = 1000;
+
+  context->perfquery.enable = true;
+}
+
+void
+intel_perf_query_destroy(cl_context context)
+{
+  int i;
+  struct perf_query *query;
+
+  for (i = 0; i < context->perfquery.n_queries; i++) {
+    query = &context->perfquery.queries[i];
+    free(query->counters);
+    free(query->oa_counters);
+  }
+  free(context->perfquery.unresolved);
+}
diff --git a/src/intel_perf.h b/src/intel_perf.h
new file mode 100644
index 0000000..5099433
--- /dev/null
+++ b/src/intel_perf.h
@@ -0,0 +1,32 @@
+#ifndef _INTEL_PERF_H
+#define _INTEL_PERF_H
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+void intel_perf_query_first(cl_context, cl_uint *queryId);
+void intel_perf_query_next(cl_context, cl_uint queryId, cl_uint *nextId);
+void intel_perf_query_info(cl_context, cl_uint queryId,
+                           cl_char **queryName,
+                           cl_uint *dataSize, cl_uint *noCounters, cl_uint *noInstances);
+void intel_perf_counter_info(cl_context, cl_uint queryId, cl_uint counterId,
+                             cl_char **counterName,
+                             cl_char **counterDesc,
+                             cl_uint *counterOffset, cl_uint *counterDataSize,
+                             cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+                             cl_ulong *rawCounterMaxValue);
+cl_int intel_perf_query_create(cl_context context, cl_uint queryId,
+                               cl_perf_query_intel *queryHandle);
+cl_int intel_perf_query_delete(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_begin(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_end(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_get_data(cl_context context,
+                                 cl_perf_query_intel queryHandle,
+                                 cl_uint flags, size_t dataSize, void *data,
+                                 cl_uint *bytesWritten);
+
+void intel_perf_query_init(cl_context context);
+void intel_perf_query_destroy(cl_context context);
+
+#endif
+
-- 
2.1.4

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
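
For reference, a rough usage sketch of the new interface from the application side (not part of the patch). The clBeginPerfQueryIntel, clEndPerfQueryIntel and clGetPerfQueryDataIntel names below are assumptions that mirror the intel_perf_query_begin/end/get_data functions above, following the GL_INTEL_performance_query naming; only the enumeration, info and create/delete entry points are visible in cl_intel.h in this patch.

/* Illustrative sketch only; the begin/end/get_data entry point names are
 * assumed to follow the GL_INTEL_performance_query convention and should
 * be adjusted to whatever cl_api.c actually exports. */
#include <stdio.h>
#include <stdlib.h>
#include "CL/cl.h"
#include "CL/cl_intel.h"

static void run_compute_basic_query(cl_context ctx)
{
  cl_uint query_id, data_size, n_counters, n_instances, bytes_written;
  cl_char name[128];
  cl_perf_query_intel handle;
  void *data;

  /* Enumerate the first (and currently only) query set and its layout. */
  clGetFirstPerfQueryIdIntel(ctx, &query_id);
  clGetPerfQueryInfoIntel(ctx, query_id, sizeof(name), name,
                          &data_size, &n_counters, &n_instances);
  printf("query '%s': %u counters, %u bytes per report\n",
         (char *)name, n_counters, data_size);

  if (clCreatePerfQueryIntel(ctx, query_id, &handle) != CL_SUCCESS)
    return;

  /* Bracket the workload to be measured. */
  clBeginPerfQueryIntel(ctx, handle);
  /* ... enqueue and finish the kernels of interest here ... */
  clEndPerfQueryIntel(ctx, handle);

  data = malloc(data_size);
  if (data) {
    /* flags is currently unused by the implementation (see the XXX above). */
    clGetPerfQueryDataIntel(ctx, handle, 0, data_size, data, &bytes_written);
    /* Decode each counter using the offset, size and data type enums
     * returned by clGetPerfCounterInfoIntel(). */
    free(data);
  }

  clDeletePerfQueryIntel(ctx, handle);
}

The per-counter offset, data size and type enums reported by clGetPerfCounterInfoIntel() describe exactly how get_oa_counter_data() packs the result buffer, so the application can decode each field without hard-coding the layout.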
