Module: Mesa
Branch: main
Commit: 622c2498d422485221f6804fbfd6593ed005b372
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=622c2498d422485221f6804fbfd6593ed005b372

Author: Francisco Jerez <curroje...@riseup.net>
Date:   Fri Sep 29 01:36:07 2023 -0700

intel/xehp+: Import algorithm for TBIMR tiling parameter calculation.

This implements a minimalistic algorithm that can be used to obtain an
approximate solution for the integer programming problem of finding
the optimal tile dimensions based on an estimate of the tile cache
consumption per pixel of the current graphics pipeline -- Including
the TC footprint of render targets, depth and stencil buffers and
their auxiliary surfaces.  Considering other (less local) memory
accesses performed by the pipeline (like texturing and shader storage)
would be useful (and could be considered by this algorithm with little
modification), but it would be pretty difficult to estimate the L3
cache consumption per pixel of such accesses based on static analysis
of the pipeline state alone without some sort of dynamic feedback.

The present algorithm returns a config with tile area large enough to
utilize a target fraction of the L3, which can be adjusted to obtain
greater/lower utilization of the L3 at the cost of higher/lower risk
of L3 cache thrashing respectively.  The aspect ratio of the tile
layout returned attempts to minimize the number of poorly utilized
tiles around the boundaries of the framebuffer (due to partial
coverage), since having the tile sequencer process additional tiles
comes at a cost due to the latency of the additional passes, even if
they're mostly empty.  Finally, among the solutions with satisfactory
cache footprint and tile count, the tile aspect ratio closest to 1 is
returned where possible, since tiles with very high aspect ratios can
have a negative impact on cache locality.

The algorithm is primarily intended for TBIMR, but it could be used
for PTBR as well with little modifications, since the TBIMR-specific
assumptions are few and noted in comments below.

Reviewed-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25493>

---

 src/intel/common/intel_tiled_render.h | 275 ++++++++++++++++++++++++++++++++++
 src/intel/common/meson.build          |   1 +
 2 files changed, 276 insertions(+)

diff --git a/src/intel/common/intel_tiled_render.h 
b/src/intel/common/intel_tiled_render.h
new file mode 100644
index 00000000000..a22621f2042
--- /dev/null
+++ b/src/intel/common/intel_tiled_render.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifndef INTEL_TILED_RENDER_H
+#define INTEL_TILED_RENDER_H
+
+#include "intel/common/intel_l3_config.h"
+#include "intel/dev/intel_device_info.h"
+#include "intel/isl/isl.h"
+
+/**
+ * Return the tile cache space used as target by the tiling parameter
+ * calculation algorithm below.  Cache space units are in bits.
+ *
+ * \sa intel_calculate_tile_dimensions()
+ */
+UNUSED static unsigned
+intel_calculate_tile_cache_size(const struct intel_device_info *devinfo,
+                                const struct intel_l3_config *cfg)
+{
+   const unsigned tc_l3_partition_size = 1024 * 8 *
+      intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_TC);
+   const unsigned all_l3_partition_size = 1024 * 8 *
+      intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_ALL);
+   /* Target half of the total L3 space as simple heuristic, could be
+    * improved by adjusting the target dynamically.
+    */
+   const unsigned target_all_l3_partition_size = all_l3_partition_size / 2;
+   /* If there's a tile cache partition on the L3, use its size as
+    * target, otherwise (e.g. in unified L3 cache mode) use a fraction
+    * of the total L3 available.
+    *
+    * XXX - Note that this assumes TBIMR in pixel hashing mode is in use.
+    */
+   const unsigned tile_cache_size = tc_l3_partition_size ? 
tc_l3_partition_size :
+                                    target_all_l3_partition_size;
+   assert(tile_cache_size > 0);
+   return tile_cache_size;
+}
+
+/**
+ * Return the amount of bits per pixel used to store an ISL surface in
+ * memory.  This can be used as helper to estimate the value of the \p
+ * pixel_size argument of intel_calculate_tile_dimensions() below.
+ */
+UNUSED static unsigned
+intel_calculate_surface_pixel_size(const struct isl_surf *surf)
+{
+   const struct isl_format_layout *layout = 
isl_format_get_layout(surf->format);
+   const unsigned num_samples = MAX2(1, surf->samples);
+   if (surf->size_B > 0)
+      return DIV_ROUND_UP(layout->bpb * num_samples,
+                          layout->bw * layout->bh * layout->bd);
+   else
+      return 0;
+}
+
+/**
+ * Estimate tiling parameters that yield a reasonable balance between
+ * tile cache utilization and avoidance of thrashing, based on the
+ * device's current caching configuration, the framebuffer dimensions
+ * and an estimate of the tile cache footprint per fragment in bits (\p
+ * pixel_size).
+ *
+ * The calculated tile dimensions are guaranteed to be a multiple of
+ * the block dimensions \p block_width and \p block_height, which for
+ * TBIMR in pixel hashing mode must be equal to the pixel hashing
+ * block size, typically 16x16 or 32x32.
+ */
+UNUSED static void
+intel_calculate_tile_dimensions(const struct intel_device_info *devinfo,
+                                const struct intel_l3_config *cfg,
+                                unsigned block_width, unsigned block_height,
+                                unsigned fb_width, unsigned fb_height,
+                                unsigned pixel_size,
+                                unsigned *tile_width, unsigned *tile_height)
+{
+   /* Maximum number of tiles supported by the TBIMR tile sequencing
+    * hardware.
+    */
+   const unsigned max_horiz_tiles = 32;
+   const unsigned max_vert_tiles = 32;
+
+   /* Represent dimensions in hashing block units, which guarantees
+    * that the resulting tile dimensions are a multiple of the hashing
+    * block dimensions, a requirement of TBIMR in pixel hashing mode.
+    */
+   const unsigned fb_block_width = DIV_ROUND_UP(fb_width, block_width);
+   const unsigned fb_block_height = DIV_ROUND_UP(fb_height, block_height);
+
+   /* Amount of tile cache space for the workload to target. */
+   const unsigned tile_cache_size = intel_calculate_tile_cache_size(devinfo, 
cfg);
+   /* Cache footprint of a single hashing block worth of threads. */
+   const unsigned block_size = MAX2(1, pixel_size * block_width * 
block_height);
+   /* Calculate the desired tile surface (in block units) that fully
+    * utilizes the target portion of the tile cache, which in an ideal
+    * world where an oracle has given us the tile cache footprint per
+    * block is just the ratio of the two.
+    */
+   const unsigned desired_tile_surf = MAX2(1, tile_cache_size / block_size);
+   /* Clamp the desired tile surface to be between the surface of the
+    * whole framebuffer and the surface of the smallest tile possible
+    * at the maximum suported tile count.
+    */
+   const unsigned tile_surf = CLAMP(desired_tile_surf,
+                                    (DIV_ROUND_UP(fb_block_width, 
max_horiz_tiles) *
+                                     DIV_ROUND_UP(fb_block_height, 
max_vert_tiles)),
+                                    fb_block_width * fb_block_height);
+   /* XXX - If the tile_surf calculated above is smaller than the
+    *       number of pixel pipes on the GPU, the pipeline is so
+    *       cache-heavy that the parallelism of the GPU will have to
+    *       be constrained in order to avoid thrashing the tile cache.
+    *       Possibly emit a performance warning, or better, return an
+    *       error indicating that the pixel pipe hashing config needs
+    *       to be adjusted to use a finer hashing mode in order to
+    *       spread out the workload evenly across the available slices.
+    */
+
+   /* Select the tile aspect ratio that minimizes the number of passes
+    * required to render the whole framebuffer.  The search starts at
+    * an approximately square tile size of the desired surface and
+    * increases the ratio between its major and minor axes in a
+    * sequence of finite increments.
+    *
+    * The algorithm is biased in favor of the squarest possible tiling
+    * config since it starts with a tile shape closest to a square and
+    * early-exits when a global minimum is detected.  This bias is
+    * intentional since cache locality may suffer at high tile aspect
+    * ratios.
+    */
+   const float base_major = sqrtf(tile_surf);
+   /* Make sure that the minimum major axis where the search starts
+    * isn't so small (due to a small framebuffer or rounding) that the
+    * tile would have to be larger than the framebuffer in the
+    * opposite "minor" direction.
+    */
+   const unsigned min_major = MAX3(1, floorf(base_major),
+                                  tile_surf / MIN2(fb_block_width, 
fb_block_height));
+   /* Stop search at a an aspect ratio of approximately 2 (A major
+    * axis equal to 'base_major * M_SQRT2' would give an aspect ratio
+    * of exactly 2 if it was a valid integer number).  Aspect ratios
+    * higher than 2 could technically be useful, the upper bound is
+    * intended as a heuristic in order to set a low limit to the
+    * number of iterations the loop below may execute.
+    */
+   const unsigned max_major = ceilf(MAX2(base_major, min_major) * M_SQRT2);
+   assert(max_major < INT_MAX);
+
+   /* Best tile dimensions found so far. */
+   unsigned best_count = UINT_MAX;
+   unsigned best_block_width = 0;
+   unsigned best_block_height = 0;
+
+   for (unsigned major = min_major; major <= max_major;) {
+      /* Minor axis that yields the desired tile surface for the
+       * present major parameter.
+       */
+      const unsigned minor = MAX2(1, tile_surf / major);
+
+      /* Calculate the total number of tiles if this aspect ratio is
+       * used in the X-major orientation.
+       */
+      const unsigned horiz_tiles_x = DIV_ROUND_UP(fb_block_width, major);
+      const unsigned vert_tiles_x = DIV_ROUND_UP(fb_block_height, minor);
+      const unsigned count_x = horiz_tiles_x * vert_tiles_x;
+
+      /* Calculate the number of blocks we need to add to the major
+       * axis for the number of X-major tile columns (horiz_tiles_x)
+       * to drop by one.  This avoids many useless iterations relative
+       * to exhaustive search, since an increase in major can only
+       * decrease the total tile count if it decreases horiz_tiles_x
+       * as well, vert_tiles_x is monotonically increasing with major.
+       *
+       * If the number of tile columns is already 1 the X-major
+       * solution cannot be improved further, use "infinity" so the
+       * increment for the next iteration is only determined by the
+       * Y-major search -- If the Y-major solution cannot be improved
+       * either the search will be terminated.
+       */
+      const unsigned delta_x = horiz_tiles_x == 1 ? INT_MAX :
+         DIV_ROUND_UP(fb_block_width - major * (horiz_tiles_x - 1),
+                      horiz_tiles_x - 1);
+
+      /* Update the best known solution with the present X-major one
+       * if it's allowed by the hardware and requires a lower total
+       * number of tiles to cover the whole framebuffer.
+       */
+      if (horiz_tiles_x <= max_horiz_tiles && vert_tiles_x <= max_vert_tiles &&
+          count_x < best_count) {
+         best_count = count_x;
+         best_block_width = major;
+         best_block_height = minor;
+
+         /* The array of tiles is fully covered by the framebuffer, a
+          * global minimum has been found, terminate the search.
+          */
+         if (count_x * tile_surf == fb_block_width * fb_block_height)
+            break;
+      }
+
+      /* Calculate the total number of tiles if this aspect ratio is
+       * used in the Y-major orientation.
+       */
+      const unsigned horiz_tiles_y = DIV_ROUND_UP(fb_block_width, minor);
+      const unsigned vert_tiles_y = DIV_ROUND_UP(fb_block_height, major);
+      const unsigned count_y =  horiz_tiles_y * vert_tiles_y;
+
+      /* Calculate the number of blocks we need to add to the major
+       * axis for the number of Y-major tile rows (vert_tiles_y) to
+       * drop by one.  Analogous to the delta_x described above after
+       * a flip of the X and Y axes.
+       */
+      const unsigned delta_y = vert_tiles_y == 1 ? INT_MAX :
+         DIV_ROUND_UP(fb_block_height - major * (vert_tiles_y - 1),
+                      vert_tiles_y - 1);
+
+      /* Update the best known solution with the present Y-major one
+       * if it's allowed by the hardware and requires a lower total
+       * number of tiles to cover the whole framebuffer.
+       */
+      if (horiz_tiles_y <= max_horiz_tiles && vert_tiles_y <= max_vert_tiles &&
+          count_y < best_count) {
+         best_count = count_y;
+         best_block_width = minor;
+         best_block_height = major;
+
+         /* The array of tiles is fully covered by the framebuffer, a
+          * global minimum has been found, terminate the search.
+          */
+         if (count_y * tile_surf == fb_block_width * fb_block_height)
+            break;
+      }
+
+      /* Use the smallest of the computed major increments in order to
+       * visit the closest subsequent solution candidate.  If both the
+       * X-major and Y-major searches have terminated major will be
+       * pushed above the upper bound of the search, causing immediate
+       * termination.
+       */
+      const unsigned delta = MIN2(delta_x, delta_y);
+      assert(major + delta > major);
+      major += delta;
+   }
+
+   /* Sanity-check and return the result, scaling it back to pixel
+    * units.
+    */
+   assert(best_block_width > 0 && best_block_height > 0);
+   assert(DIV_ROUND_UP(fb_block_width, best_block_width) <= max_horiz_tiles);
+   assert(DIV_ROUND_UP(fb_block_height, best_block_height) <= max_vert_tiles);
+
+   *tile_width = best_block_width * block_width;
+   *tile_height = best_block_height * block_height;
+}
+
+#endif
diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build
index 32c4151c4ca..a92407dbff6 100644
--- a/src/intel/common/meson.build
+++ b/src/intel/common/meson.build
@@ -43,6 +43,7 @@ files_libintel_common = files(
   'intel_guardband.h',
   'intel_l3_config.c',
   'intel_l3_config.h',
+  'intel_tiled_render.h',
   'intel_urb_config.c',
   'intel_sample_positions.c',
   'intel_sample_positions.h',

Reply via email to