On Thu, Feb 22, 2018 at 05:24:38PM +0000, Lionel Landwerlin wrote: > With the introduction of asymmetric slices in CNL, we cannot rely on > the previous SUBSLICE_MASK getparam to tell userspace what subslices > are available. > > We introduce a new uAPI in the kernel driver to report exactly what > part of the GPU are fused and require this to be available on Gen10+. > > Prior generations can continue to rely on GETPARAM on older kernels. > > This patch is quite a lot of code because we have to support lots of > different kernel versions, ranging from not providing any information > (for Haswell on 4.13 through 4.17), to being able to query through > GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17 > for Gen10+.
I don't think it's that much code. It's reasonable given how many interfaces we have to query such data. > This change stores topology information in a unified way on > brw_context.topology from the various kernel APIs. And then generates > the appropriate values for the equations from that unified topology. > > Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com> > --- > src/mesa/drivers/dri/i965/brw_context.h | 14 ++ > src/mesa/drivers/dri/i965/brw_performance_query.c | 267 > ++++++++++++++++------ > 2 files changed, 208 insertions(+), 73 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 050b656e3da..69bf7530fbc 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1160,6 +1160,20 @@ struct brw_context > bool supported; > } predicate; > > + struct { > + uint8_t slice_mask[4]; > + uint8_t subslice_mask[100]; > + uint8_t eu_mask[100]; > + > + uint16_t max_slices; > + uint16_t max_subslices; > + uint16_t max_eus_per_subslice; > + > + uint16_t subslice_slice_stride; > + uint16_t eu_slice_stride; > + uint16_t eu_subslice_stride; > + } topology; > + I wonder if such information shouldn't be stored in gen_device_info. But it seems the rest of the OA code seems to be tied to i965 anyways, so I guess this should be fine. In any case, series is: Acked-by: Rafael Antognolli <rafael.antogno...@intel.com> > struct { > /* Variables referenced in the XML meta data for OA performance > * counters, e.g in the normalization equations. 
> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c > b/src/mesa/drivers/dri/i965/brw_performance_query.c > index c0bb4442bec..10f519a757f 100644 > --- a/src/mesa/drivers/dri/i965/brw_performance_query.c > +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c > @@ -1888,6 +1888,192 @@ init_oa_configs(struct brw_context *brw, const char > *sysfs_dev_dir) > } > } > > +static bool > +query_topology(struct brw_context *brw) > +{ > + __DRIscreen *screen = brw->screen->driScrnPriv; > + struct drm_i915_query_item item = { > + .query_id = DRM_I915_QUERY_TOPOLOGY_INFO, > + }; > + struct drm_i915_query query = { > + .num_items = 1, > + .items_ptr = (uintptr_t) &item, > + }; > + > + if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query)) > + return false; > + > + if (item.length <= 0) > + return false; > + > + struct drm_i915_query_topology_info *topo_info = > + (struct drm_i915_query_topology_info *) calloc(1, item.length); > + if (!topo_info) > + return false; > + item.data_ptr = (uintptr_t) topo_info; > + > + if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query)) { > + free(topo_info); > + return false; > + } > + > + brw->topology.max_slices = topo_info->max_slices; > + brw->topology.max_subslices = topo_info->max_subslices; > + brw->topology.max_eus_per_subslice = topo_info->max_eus_per_subslice; > + > + brw->topology.subslice_slice_stride = > + DIV_ROUND_UP(brw->topology.max_subslices, 8); > + brw->topology.eu_subslice_stride = > + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8); > + brw->topology.eu_slice_stride = brw->topology.max_subslices * > + brw->topology.eu_subslice_stride; > + > + assert(DIV_ROUND_UP(topo_info->max_slices, 8) <= > + sizeof(brw->topology.slice_mask)); > + memcpy(brw->topology.slice_mask, topo_info->data, > + DIV_ROUND_UP(topo_info->max_slices, 8)); > + > + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices, 8) > <= > + sizeof(brw->topology.subslice_mask)); > + memcpy(brw->topology.subslice_mask, > + &topo_info->data[topo_info->subslice_offset], > + topo_info->max_slices * topo_info->subslice_stride); 
> + > + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices * > + topo_info->max_eus_per_subslice, 8) <= > + sizeof(brw->topology.eu_mask)); > + memcpy(brw->topology.eu_mask, > + &topo_info->data[topo_info->eu_offset], > + topo_info->max_slices * topo_info->max_subslices * > topo_info->eu_stride); > + > + free(topo_info); > + > + return true; > +} > + > +static bool > +getparam_topology(struct brw_context *brw) > +{ > + const struct gen_device_info *devinfo = &brw->screen->devinfo; > + __DRIscreen *screen = brw->screen->driScrnPriv; > + drm_i915_getparam_t gp; > + int ret; > + > + /* On CNL+ we need to use the query ioctl(). */ > + assert(devinfo->gen < 10); > + > + int slice_mask = 0; > + gp.param = I915_PARAM_SLICE_MASK; > + gp.value = &slice_mask; > + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp); > + if (ret) > + return false; > + > + int subslice_mask = 0; > + gp.param = I915_PARAM_SUBSLICE_MASK; > + gp.value = &subslice_mask; > + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp); > + if (ret) > + return false; > + > + brw->topology.max_slices = util_last_bit(slice_mask); > + brw->topology.max_subslices = util_last_bit(subslice_mask); > + brw->topology.max_eus_per_subslice = devinfo->is_haswell ? 
10 : 8; > + > + brw->topology.subslice_slice_stride = > + DIV_ROUND_UP(brw->topology.max_subslices, 8); > + brw->topology.eu_subslice_stride = > + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8); > + brw->topology.eu_slice_stride = brw->topology.max_subslices * > + brw->topology.eu_subslice_stride; > + > + int n_subslices = __builtin_popcount(slice_mask) * > __builtin_popcount(subslice_mask); > + int eus_per_subslice = brw->screen->eu_total / n_subslices; > + > + for (int s = 0; s < brw->topology.max_slices; s++) { > + brw->topology.slice_mask[s / 8] |= (1UL << (s % 8)) & slice_mask; > + > + for (int ss = 0; ss < brw->topology.max_subslices; ss++) { > + brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride > + > + ss / 8] |= > + (1UL << (ss % 8)) & subslice_mask; > + > + for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) { > + brw->topology.eu_mask[s * brw->topology.eu_slice_stride + > + ss * brw->topology.eu_subslice_stride + > + eug] = > + (((1UL << eus_per_subslice) - 1) >> (eug * 8)) & 0xff; > + } > + } > + } > + > + return true; > +} > + > +static void > +devinfo_topology(struct brw_context *brw) > +{ > + const struct gen_device_info *devinfo = &brw->screen->devinfo; > + > + assert(devinfo->is_haswell); > + > + brw->topology.max_slices = devinfo->num_slices; > + brw->topology.max_subslices = devinfo->num_subslices[0]; > + brw->topology.max_eus_per_subslice = 10; > + > + brw->topology.subslice_slice_stride = > + DIV_ROUND_UP(brw->topology.max_subslices, 8); > + brw->topology.eu_subslice_stride = > + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, > 8); > + brw->topology.eu_slice_stride = brw->topology.max_subslices * > + brw->topology.eu_subslice_stride; > + > + for (int s = 0; s < brw->topology.max_slices; s++) { > + brw->topology.slice_mask[s / 8] |= 1UL << (s % 8); > + > + for (int ss = 0; ss < brw->topology.max_subslices; ss++) { > + brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride > + > + ss / 8] |= 1UL << (ss % 8); > + > + for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) { > + 
brw->topology.eu_mask[s * brw->topology.eu_slice_stride + ss * > brw->topology.eu_subslice_stride + eug] = > + (((1UL << brw->topology.max_eus_per_subslice) - 1) >> (eug * > 8)) & 0xff; > + } > + } > + } > +} > + > +static void > +compute_topology_builtins(struct brw_context *brw) > +{ > + const struct gen_device_info *devinfo = &brw->screen->devinfo; > + > + assert(brw->topology.max_slices <= 8); > + brw->perfquery.sys_vars.slice_mask = brw->topology.slice_mask[0]; > + brw->perfquery.sys_vars.n_eu_slices = > + __builtin_popcount(brw->perfquery.sys_vars.slice_mask); > + > + for (int i = 0; i < sizeof(brw->topology.subslice_mask); i++) { > + brw->perfquery.sys_vars.n_eu_sub_slices += > + __builtin_popcount(brw->topology.subslice_mask[i]); > + } > + > + for (int i = 0; i < sizeof(brw->topology.eu_mask); i++) { > + brw->perfquery.sys_vars.n_eus += > + __builtin_popcount(brw->topology.eu_mask[i]); > + } > + > + brw->perfquery.sys_vars.eu_threads_count = > + brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu; > + > + /* At the moment the subslice mask builtin has groups of 3bits for each > + * slice. > + * > + * Ideally equations would be updated to have a slice/subslice query > + * function/operator. 
> + */ > + brw->perfquery.sys_vars.subslice_mask = 0; > + for (int s = 0; s < brw->topology.max_slices; s++) { > + for (int ss = 0; ss < brw->topology.max_subslices; ss++) { > + if (brw->topology.subslice_mask[s * > brw->topology.subslice_slice_stride + ss / 8] & > + (1UL << (ss % 8))) > + brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss); > + } > + } > +} > + > static bool > init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir) > { > @@ -1905,83 +2091,18 @@ init_oa_sys_vars(struct brw_context *brw, const char > *sysfs_dev_dir) > &max_freq_mhz)) > return false; > > + memset(&brw->topology, 0, sizeof(brw->topology)); > + if (!query_topology(brw)) { > + if (!getparam_topology(brw)) > + devinfo_topology(brw); > + } > + > + memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars)); > brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000; > brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000; > brw->perfquery.sys_vars.timestamp_frequency = > devinfo->timestamp_frequency; > - > brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd); > - brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices; > - /* Assuming uniform distribution of subslices per slices. 
*/ > - brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0]; > - > - if (devinfo->is_haswell) { > - brw->perfquery.sys_vars.slice_mask = 0; > - brw->perfquery.sys_vars.subslice_mask = 0; > - > - for (int s = 0; s < devinfo->num_slices; s++) > - brw->perfquery.sys_vars.slice_mask |= 1U << s; > - for (int ss = 0; ss < devinfo->num_subslices[0]; ss++) > - brw->perfquery.sys_vars.subslice_mask |= 1U << ss; > - > - if (devinfo->gt == 1) { > - brw->perfquery.sys_vars.n_eus = 10; > - } else if (devinfo->gt == 2) { > - brw->perfquery.sys_vars.n_eus = 20; > - } else if (devinfo->gt == 3) { > - brw->perfquery.sys_vars.n_eus = 40; > - } else > - unreachable("not reached"); > - } else { > - drm_i915_getparam_t gp; > - int ret; > - int slice_mask = 0; > - int ss_mask = 0; > - /* maximum number of slices */ > - int s_max = devinfo->num_slices; > - /* maximum number of subslices per slice (assuming uniform subslices > per > - * slices) > - */ > - int ss_max = devinfo->num_subslices[0]; > - uint64_t subslice_mask = 0; > - int s; > - > - gp.param = I915_PARAM_SLICE_MASK; > - gp.value = &slice_mask; > - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp); > - if (ret) > - return false; > - > - gp.param = I915_PARAM_SUBSLICE_MASK; > - gp.value = &ss_mask; > - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp); > - if (ret) > - return false; > - > - brw->perfquery.sys_vars.n_eus = brw->screen->eu_total; > - brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask); > - brw->perfquery.sys_vars.slice_mask = slice_mask; > - > - /* Note: the _SUBSLICE_MASK param only reports a global subslice mask > - * which applies to all slices. > - * > - * Note: some of the metrics we have (as described in XML) are > - * conditional on a $SubsliceMask variable which is expected to also > - * reflect the slice mask by packing together subslice masks for each > - * slice in one value.. 
> - */ > - for (s = 0; s < s_max; s++) { > - if (slice_mask & (1<<s)) { > - subslice_mask |= ss_mask << (ss_max * s); > - } > - } > - > - brw->perfquery.sys_vars.subslice_mask = subslice_mask; > - brw->perfquery.sys_vars.n_eu_sub_slices = > - __builtin_popcount(subslice_mask); > - } > - > - brw->perfquery.sys_vars.eu_threads_count = > - brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu; > + compute_topology_builtins(brw); > > return true; > } > -- > 2.16.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev