struct {
/* Variables referenced in the XML meta data for OA performance
 * counters, e.g. in the normalization equations.
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c
b/src/mesa/drivers/dri/i965/brw_performance_query.c
index c0bb4442bec..10f519a757f 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -1888,6 +1888,192 @@ init_oa_configs(struct brw_context *brw, const char
*sysfs_dev_dir)
}
}
+/* Query the slice/subslice/EU topology masks from the kernel using the
+ * i915 query uAPI (DRM_IOCTL_I915_QUERY with DRM_I915_QUERY_TOPOLOGY_INFO).
+ *
+ * Returns false if the kernel does not support the query uAPI (caller then
+ * falls back to getparam_topology()/devinfo_topology()).
+ */
+static bool
+query_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
+   };
+   struct drm_i915_query query = {
+      .num_items = 1,
+      .items_ptr = (uintptr_t) &item,
+   };
+
+   /* First call with a zero-length item only asks the kernel how large the
+    * topology blob is (reported back in item.length); it fails on kernels
+    * without the query uAPI.
+    */
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
+      return false;
+
+   struct drm_i915_query_topology_info *topo_info =
+      (struct drm_i915_query_topology_info *) calloc(1, item.length);
+   if (!topo_info)
+      return false;
+   item.data_ptr = (uintptr_t) topo_info;
+
+   /* Second call actually fills in the topology data. */
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query)) {
+      free(topo_info);
+      return false;
+   }
+
+   brw->topology.max_slices = topo_info->max_slices;
+   brw->topology.max_subslices = topo_info->max_subslices;
+   brw->topology.max_eus_per_subslice = topo_info->max_eus_per_subslice;
+
+   /* Strides (in bytes) used to index into the packed masks below. */
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   assert(DIV_ROUND_UP(topo_info->max_slices, 8) <=
+          sizeof(brw->topology.slice_mask));
+   memcpy(brw->topology.slice_mask, topo_info->data,
+          DIV_ROUND_UP(topo_info->max_slices, 8));
+
+   assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices, 8) <=
+          sizeof(brw->topology.subslice_mask));
+   memcpy(brw->topology.subslice_mask,
+          &topo_info->data[topo_info->subslice_offset],
+          topo_info->max_slices * topo_info->subslice_stride);
+
+   assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices *
+                       topo_info->max_eus_per_subslice, 8) <=
+          sizeof(brw->topology.eu_mask));
+   memcpy(brw->topology.eu_mask,
+          &topo_info->data[topo_info->eu_offset],
+          topo_info->max_slices * topo_info->max_subslices *
+          topo_info->eu_stride);
+
+   free(topo_info);
+
+   return true;
+}
+
+/* Fallback for pre-4.17 kernels: reconstruct approximate topology masks
+ * from the global I915_PARAM_SLICE_MASK/I915_PARAM_SUBSLICE_MASK getparams,
+ * assuming EUs are uniformly distributed across enabled subslices.
+ *
+ * Returns false if either getparam fails (caller then falls back to
+ * devinfo_topology()).
+ */
+static bool
+getparam_topology(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   drm_i915_getparam_t gp;
+   int ret;
+
+   /* On CNL+ (gen10+) we need to use the query ioctl(). */
+   assert(devinfo->gen < 10);
+
+   int slice_mask = 0;
+   gp.param = I915_PARAM_SLICE_MASK;
+   gp.value = &slice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   int subslice_mask = 0;
+   gp.param = I915_PARAM_SUBSLICE_MASK;
+   gp.value = &subslice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   brw->topology.max_slices = util_last_bit(slice_mask);
+   brw->topology.max_subslices = util_last_bit(subslice_mask);
+   brw->topology.max_eus_per_subslice = devinfo->is_haswell ? 10 : 8;
+
+   /* Strides (in bytes) used to index into the packed masks below. */
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   /* The getparam masks are global (same for every slice), so spread the
+    * total EU count evenly over the enabled subslices.
+    */
+   int n_subslices = __builtin_popcount(slice_mask) *
+      __builtin_popcount(subslice_mask);
+   int eus_per_subslice = brw->screen->eu_total / n_subslices;
+
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      brw->topology.slice_mask[s / 8] |= (1UL << (s % 8)) & slice_mask;
+
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
+                                     ss / 8] |=
+            (1UL << (ss % 8)) & subslice_mask;
+
+         /* Fill only this subslice's eu_subslice_stride bytes; iterating up
+          * to eu_slice_stride (as before) overran into the bytes of the
+          * following subslices/slice.
+          */
+         for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) {
+            brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
+                                  ss * brw->topology.eu_subslice_stride +
+                                  eug] =
+               (((1UL << eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
+         }
+      }
+   }
+
+   return true;
+}
+
+/* Last-resort fallback (Haswell only): synthesize fully-enabled topology
+ * masks from the static gen_device_info values.
+ */
+static void
+devinfo_topology(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   assert(devinfo->is_haswell);
+
+   brw->topology.max_slices = devinfo->num_slices;
+   brw->topology.max_subslices = devinfo->num_subslices[0];
+   brw->topology.max_eus_per_subslice = 10;
+
+   /* Store the strides in brw->topology like the other topology functions
+    * do: compute_topology_builtins() reads them, and leaving them zeroed
+    * (as the locals-only version did) broke the subslice builtin on this
+    * path.
+    */
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      brw->topology.slice_mask[s / 8] |= 1UL << (s % 8);
+
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         /* s * stride is already a byte offset; only ss is divided down to
+          * a byte index (the previous code divided the whole sum by 8,
+          * collapsing slice 1's subslices into slice 0's byte).
+          */
+         brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
+                                     ss / 8] |= 1UL << (ss % 8);
+
+         for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) {
+            brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
+                                  ss * brw->topology.eu_subslice_stride +
+                                  eug] =
+               (((1UL << brw->topology.max_eus_per_subslice) - 1) >>
+                (eug * 8)) & 0xff;
+         }
+      }
+   }
+}
+
+/* Derive the scalar/bitmask variables referenced by the OA normalization
+ * equations ($SliceMask, $SubsliceMask, $EuCoresTotalCount, ...) from the
+ * topology masks filled in by one of the functions above.
+ */
+static void
+compute_topology_builtins(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   /* The equations only know about an 8bit slice mask. */
+   assert(brw->topology.max_slices <= 8);
+   brw->perfquery.sys_vars.slice_mask = brw->topology.slice_mask[0];
+   brw->perfquery.sys_vars.n_eu_slices =
+      __builtin_popcount(brw->perfquery.sys_vars.slice_mask);
+
+   /* Count enabled subslices/EUs by popcounting every byte of the packed
+    * masks (disabled entries are zero bits, so iterating the full array is
+    * safe). NOTE(review): `int i` vs `sizeof` is a signed/unsigned compare
+    * (-Wsign-compare) — consider unsigned.
+    */
+   for (int i = 0; i < sizeof(brw->topology.subslice_mask); i++) {
+      brw->perfquery.sys_vars.n_eu_sub_slices +=
+         __builtin_popcount(brw->topology.subslice_mask[i]);
+   }
+
+   for (int i = 0; i < sizeof(brw->topology.eu_mask); i++) {
+      brw->perfquery.sys_vars.n_eus +=
+         __builtin_popcount(brw->topology.eu_mask[i]);
+   }
+
+   brw->perfquery.sys_vars.eu_threads_count =
+      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+
+   /* At the moment the subslice mask builtin has groups of 3bits for each
+    * slice.
+    *
+    * Ideally equations would be updated to have a slice/subslice query
+    * function/operator.
+    */
+   brw->perfquery.sys_vars.subslice_mask = 0;
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         if (brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
+                                         ss / 8] &
+             (1UL << (ss % 8)))
+            brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
+      }
+   }
+}
+
static bool
init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
{
@@ -1905,83 +2091,18 @@ init_oa_sys_vars(struct brw_context *brw, const char
*sysfs_dev_dir)
                        &max_freq_mhz))
       return false;
+   /* Fill brw->topology, preferring the most accurate source first: the
+    * kernel topology query uAPI, then the global GETPARAM slice/subslice
+    * masks, then (Haswell only) the static gen_device_info values.
+    */
+   memset(&brw->topology, 0, sizeof(brw->topology));
+   if (!query_topology(brw)) {
+      if (!getparam_topology(brw))
+         devinfo_topology(brw);
+   }
+
+   memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
    brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
    brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
    brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
-
    brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
-   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
-   /* Assuming uniform distribution of subslices per slices. */
-   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
-
-   if (devinfo->is_haswell) {
-      brw->perfquery.sys_vars.slice_mask = 0;
-      brw->perfquery.sys_vars.subslice_mask = 0;
-
-      for (int s = 0; s < devinfo->num_slices; s++)
-         brw->perfquery.sys_vars.slice_mask |= 1U << s;
-      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
-         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
-
-      if (devinfo->gt == 1) {
-         brw->perfquery.sys_vars.n_eus = 10;
-      } else if (devinfo->gt == 2) {
-         brw->perfquery.sys_vars.n_eus = 20;
-      } else if (devinfo->gt == 3) {
-         brw->perfquery.sys_vars.n_eus = 40;
-      } else
-         unreachable("not reached");
-   } else {
-      drm_i915_getparam_t gp;
-      int ret;
-      int slice_mask = 0;
-      int ss_mask = 0;
-      /* maximum number of slices */
-      int s_max = devinfo->num_slices;
-      /* maximum number of subslices per slice (assuming uniform subslices per
-       * slices)
-       */
-      int ss_max = devinfo->num_subslices[0];
-      uint64_t subslice_mask = 0;
-      int s;
-
-      gp.param = I915_PARAM_SLICE_MASK;
-      gp.value = &slice_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
-
-      gp.param = I915_PARAM_SUBSLICE_MASK;
-      gp.value = &ss_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
-
-      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
-      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
-      brw->perfquery.sys_vars.slice_mask = slice_mask;
-
-      /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
-       * which applies to all slices.
-       *
-       * Note: some of the metrics we have (as described in XML) are
-       * conditional on a $SubsliceMask variable which is expected to also
-       * reflect the slice mask by packing together subslice masks for each
-       * slice in one value..
-       */
-      for (s = 0; s < s_max; s++) {
-         if (slice_mask & (1<<s)) {
-            subslice_mask |= ss_mask << (ss_max * s);
-         }
-      }
-
-      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
-      brw->perfquery.sys_vars.n_eu_sub_slices =
-         __builtin_popcount(subslice_mask);
-   }
-
-   brw->perfquery.sys_vars.eu_threads_count =
-      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+   compute_topology_builtins(brw);
    return true;
 }
--
2.16.1
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev