[Intel-gfx] [PATCH 3/3] intel_perf_counters: Add support for Sandybridge.

2013-03-27 Thread Kenneth Graunke
While the Sandybridge PRM doesn't have any documentation on the GPU's
performance counters, a lot of information can be gleaned from the older
Ironlake PRM.  Oddly, none of the information documented there actually
appears to apply to Ironlake.  However, it apparently works just great
on Sandybridge.

Since this information has all been publicly available on the internet
for around three years, we can use it.

Signed-off-by: Kenneth Graunke kenn...@whitecape.org
---
 tools/intel_perf_counters.c | 146 
 1 file changed, 146 insertions(+)

diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
index fd268b1..b528361 100644
--- a/tools/intel_perf_counters.c
+++ b/tools/intel_perf_counters.c
@@ -22,9 +22,21 @@
  *
  * Authors:
  *Eric Anholt e...@anholt.net
+ *Kenneth Graunke kenn...@whitecape.org
+ *
+ * While documentation for performance counters is suspiciously missing from the
+ * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM.
+ *
+ * A lot of the Ironlake PRM actually unintentionally documents Sandybridge
+ * due to mistakes made when updating the documentation for Gen6+.  Many of
+ * these mislabeled sections carried forward to the public documentation.
+ *
+ * The Ironlake PRMs have been publicly available since 2010 and are online at:
+ * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family
  */
 
 #include <unistd.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <err.h>
@@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
cycles any EU is stalled for math,
 };
 
+#define GEN6_COUNTER_COUNT 29
+
+/**
+ * Sandybridge: Counter Select = 001
+ * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
+ * A5   A6   A7   A8   A9   A10  A11  A12
+ * A13  A14  A15  A16  A17  A18  A19  A20
+ * A21  A22  A23  A24  A25  A26  A27  A28
+ */
+const int gen6_counter_format = 1;
+
+/**
+ * Names for aggregating counters A0-A28.
+ *
+ * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28),
+ * it only lists the names for 28 of them; one is missing.  However, careful
+ * examination reveals a pattern: there are five GS counters (Active, Stall,
+ * Core Stall, # threads loaded, and ready but not running time).  There are
+ * also five PS counters, in the same order.  But there are only four VS
+ * counters listed - the number of VS threads loaded is missing.  Presumably,
+ * it exists and is counter 5, and the rest are shifted over one place.
+ */
+const char *gen6_counter_names[GEN6_COUNTER_COUNT] = {
+   [0]  = "Aggregated Core Array Active",
+   [1]  = "Aggregated Core Array Stalled",
+   [2]  = "Vertex Shader Active Time",
+   [3]  = "Vertex Shader Stall Time",
+   [4]  = "Vertex Shader Stall Time - Core Stall",
+   [5]  = "# VS threads loaded",
+   [6]  = "Vertex Shader Ready but not running time",
+   [7]  = "Geometry Shader Active Time",
+   [8]  = "Geometry Shader Stall Time",
+   [9]  = "Geometry Shader Stall Time - Core Stall",
+   [10] = "# GS threads loaded",
+   [11] = "Geometry Shader ready but not running Time",
+   [12] = "Pixel Shader Active Time",
+   [13] = "Pixel Shader Stall Time",
+   [14] = "Pixel Shader Stall Time - Core Stall",
+   [15] = "# PS threads loaded",
+   [16] = "Pixel Shader ready but not running Time",
+   [17] = "Early Z Test Pixels Passing",
+   [18] = "Early Z Test Pixels Failing",
+   [19] = "Early Stencil Test Pixels Passing",
+   [20] = "Early Stencil Test Pixels Failing",
+   [21] = "Pixel Kill Count",
+   [22] = "Alpha Test Pixels Failed",
+   [23] = "Post PS Stencil Pixels Failed",
+   [24] = "Post PS Z buffer Pixels Failed",
+   [25] = "Pixels/samples Written in the frame buffer",
+   [26] = "GPU Busy",
+   [27] = "CL active and not stalled",
+   [28] = "SF active and stalled",
+};
+
 int have_totals = 0;
 uint32_t *totals;
 uint32_t *last_counter;
@@ -85,6 +151,20 @@ struct intel_batchbuffer *batch;
 #define MI_COUNTER_ADDRESS_GTT (1 << 0)
 /* DW2: report ID */
 
+/**
+ * According to the Sandybridge PRM, Volume 1, Part 1, page 48,
+ * MI_REPORT_PERF_COUNT is now opcode 0x28.  The Ironlake PRM, Volume 1,
+ * Part 3 details how it works.
+ */
+/* DW0 */
+#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23)
+/* DW1 and 2 are the same as above */
+
+/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */
+#define OACONTROL   0x2360
+# define OACONTROL_COUNTER_SELECT_SHIFT 2
+# define PERFORMANCE_COUNTER_ENABLE (1 << 0)
+
 static void
 gen5_get_counters(void)
 {
@@ -124,6 +204,45 @@ gen5_get_counters(void)
drm_intel_bo_unreference(stats_bo);
 }
 
+static void
+gen6_get_counters(void)
+{
+   int i;
+   drm_intel_bo *stats_bo;
+   uint32_t *stats_result;
+
+   /* Map from counter names to their index in the buffer object */
+   static const int buffer_index[GEN6_COUNTER_COUNT] =
+   {
+  

Re: [Intel-gfx] [PATCH 3/3] intel_perf_counters: Add support for Sandybridge.

2013-03-27 Thread Daniel Vetter
On Tue, Mar 26, 2013 at 10:06:39PM -0700, Kenneth Graunke wrote:
 While the Sandybridge PRM doesn't have any documentation on the GPU's
 performance counters, a lot of information can be gleaned from the older
 Ironlake PRM.  Oddly, none of the information documented there actually
 appears to apply to Ironlake.  However, it apparently works just great
 on Sandybridge.
 
 Since this information has all been publicly available on the internet
 for around three years, we can use it.
 
 Signed-off-by: Kenneth Graunke kenn...@whitecape.org

Merged, thanks for the patches.
-Daniel

 ---
  tools/intel_perf_counters.c | 146 
 
  1 file changed, 146 insertions(+)
 
 diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
 index fd268b1..b528361 100644
 --- a/tools/intel_perf_counters.c
 +++ b/tools/intel_perf_counters.c
 @@ -22,9 +22,21 @@
   *
   * Authors:
   *Eric Anholt e...@anholt.net
 + *Kenneth Graunke kenn...@whitecape.org
 + *
 + * While documentation for performance counters is suspiciously missing from the
 + * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM.
 + *
 + * A lot of the Ironlake PRM actually unintentionally documents Sandybridge
 + * due to mistakes made when updating the documentation for Gen6+.  Many of
 + * these mislabeled sections carried forward to the public documentation.
 + *
 + * The Ironlake PRMs have been publicly available since 2010 and are online at:
 + * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family
   */
  
  #include <unistd.h>
 +#include <stdbool.h>
  #include <stdlib.h>
  #include <stdio.h>
  #include <err.h>
 @@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
   cycles any EU is stalled for math,
  };
  
 +#define GEN6_COUNTER_COUNT 29
 +
 +/**
 + * Sandybridge: Counter Select = 001
 + * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
 + * A5   A6   A7   A8   A9   A10  A11  A12
 + * A13  A14  A15  A16  A17  A18  A19  A20
 + * A21  A22  A23  A24  A25  A26  A27  A28
 + */
 +const int gen6_counter_format = 1;
 +
 +/**
 + * Names for aggregating counters A0-A28.
 + *
 + * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28),
 + * it only lists the names for 28 of them; one is missing.  However, careful
 + * examination reveals a pattern: there are five GS counters (Active, Stall,
 + * Core Stall, # threads loaded, and ready but not running time).  There are
 + * also five PS counters, in the same order.  But there are only four VS
 + * counters listed - the number of VS threads loaded is missing.  Presumably,
 + * it exists and is counter 5, and the rest are shifted over one place.
 + */
 +const char *gen6_counter_names[GEN6_COUNTER_COUNT] = {
 + [0]  = "Aggregated Core Array Active",
 + [1]  = "Aggregated Core Array Stalled",
 + [2]  = "Vertex Shader Active Time",
 + [3]  = "Vertex Shader Stall Time",
 + [4]  = "Vertex Shader Stall Time - Core Stall",
 + [5]  = "# VS threads loaded",
 + [6]  = "Vertex Shader Ready but not running time",
 + [7]  = "Geometry Shader Active Time",
 + [8]  = "Geometry Shader Stall Time",
 + [9]  = "Geometry Shader Stall Time - Core Stall",
 + [10] = "# GS threads loaded",
 + [11] = "Geometry Shader ready but not running Time",
 + [12] = "Pixel Shader Active Time",
 + [13] = "Pixel Shader Stall Time",
 + [14] = "Pixel Shader Stall Time - Core Stall",
 + [15] = "# PS threads loaded",
 + [16] = "Pixel Shader ready but not running Time",
 + [17] = "Early Z Test Pixels Passing",
 + [18] = "Early Z Test Pixels Failing",
 + [19] = "Early Stencil Test Pixels Passing",
 + [20] = "Early Stencil Test Pixels Failing",
 + [21] = "Pixel Kill Count",
 + [22] = "Alpha Test Pixels Failed",
 + [23] = "Post PS Stencil Pixels Failed",
 + [24] = "Post PS Z buffer Pixels Failed",
 + [25] = "Pixels/samples Written in the frame buffer",
 + [26] = "GPU Busy",
 + [27] = "CL active and not stalled",
 + [28] = "SF active and stalled",
 +};
 +
  int have_totals = 0;
  uint32_t *totals;
  uint32_t *last_counter;
 @@ -85,6 +151,20 @@ struct intel_batchbuffer *batch;
  #define MI_COUNTER_ADDRESS_GTT   (1 << 0)
  /* DW2: report ID */
  
 +/**
 + * According to the Sandybridge PRM, Volume 1, Part 1, page 48,
 + * MI_REPORT_PERF_COUNT is now opcode 0x28.  The Ironlake PRM, Volume 1,
 + * Part 3 details how it works.
 + */
 +/* DW0 */
 +#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23)
 +/* DW1 and 2 are the same as above */
 +
 +/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */
 +#define OACONTROL   0x2360
 +# define OACONTROL_COUNTER_SELECT_SHIFT 2
 +# define PERFORMANCE_COUNTER_ENABLE (1 << 0)
 +
  static void
  gen5_get_counters(void)
  {
 @@ -124,6 +204,45 @@ gen5_get_counters(void)
   drm_intel_bo_unreference(stats_bo);
  }
  
 +static void
 +gen6_get_counters(void)
 +{
 + int i;
 + drm_intel_bo