On 23/03/18 18:00, Kenneth Graunke wrote:
On Thursday, March 8, 2018 7:42:53 AM PDT Lionel Landwerlin wrote:
This register contains the frequency of the GT; it's one of the values
GPA would like to have as part of their queries.

Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
---
  src/mesa/drivers/dri/i965/brw_defines.h           | 10 +++++
  src/mesa/drivers/dri/i965/brw_performance_query.c | 45 +++++++++++++++++++++++
  src/mesa/drivers/dri/i965/brw_performance_query.h |  5 +++
  3 files changed, 60 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 8bf6f68b67c..ead44ebc5e8 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1656,6 +1656,16 @@ enum brw_pixel_shader_coverage_mask_mode {
  #define CS_DEBUG_MODE2                     0x20d8 /* Gen9+ */
  # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
+#define GEN6_RPSTAT1 0xA01C
+#define  GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT   7
+#define  GEN6_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(13, 7)
+#define  GEN6_RPSTAT1_PREV_GT_FREQ_SHIFT   0
+#define  GEN6_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(6, 0)
+#define  GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT   23
+#define  GEN9_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(31, 23)
+#define  GEN9_RPSTAT1_PREV_GT_FREQ_SHIFT   0
+#define  GEN9_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(8, 0)
+
I can confirm that Haswell->Broadwell use 13:7 and 6:0, while
Skylake and Cannonlake use 31:23 and 8:0.  They apparently call this
RPSTAT1 on Haswell and RP_STATUS0 on Gen8+.

These are the wrong masks for Sandybridge, so I would not call them
GEN6_*.  The kernel has code for Sandybridge if we wanted to handle it,
but it looks like we don't expose OA on Sandybridge anyway, so there's
likely little point.

Baytrail and Cherryview should both be excluded, as you have to read the
current frequency from the PUnit.  Broxton and all others should work.

Thanks, updating.


  #define SLICE_COMMON_ECO_CHICKEN1          0x731c /* Gen9+ */
  # define GLK_SCEC_BARRIER_MODE_GPGPU       (0 << 7)
  # define GLK_SCEC_BARRIER_MODE_3D_HULL     (1 << 7)
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c 
b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 98666759d75..7d5b44cf61d 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -227,6 +227,8 @@ brw_perf_query(struct gl_perf_query_object *o)
#define MI_RPC_BO_SIZE 4096
  #define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
+#define MI_FREQ_START_OFFSET_BYTES  (3072)
+#define MI_FREQ_END_OFFSET_BYTES    (3076)
Why these?

That's where I store the RPSTAT copy (before/after the workload).


  
/******************************************************************************/
@@ -1150,6 +1152,9 @@ brw_begin_perf_query(struct gl_context *ctx,
        /* Take a starting OA counter snapshot. */
        brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
                                            obj->oa.begin_report_id);
+      brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
+                               MI_FREQ_START_OFFSET_BYTES);
+
        ++brw->perfquery.n_active_oa_queries;
/* No already-buffered samples can possibly be associated with this query
@@ -1233,6 +1238,8 @@ brw_end_perf_query(struct gl_context *ctx,
         */
        if (!obj->oa.results_accumulated) {
           /* Take an ending OA counter snapshot. */
+         brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
+                                  MI_FREQ_END_OFFSET_BYTES);
           brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
                                               MI_RPC_BO_END_OFFSET_BYTES,
                                               obj->oa.begin_report_id + 1);
@@ -1333,6 +1340,43 @@ brw_is_perf_query_ready(struct gl_context *ctx,
     return false;
  }
+static void
+read_gt_frequency(struct brw_context *brw,
+                  struct brw_perf_query_object *obj)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   uint32_t *start_reg = obj->oa.map + MI_FREQ_START_OFFSET_BYTES,
+      *end_reg = obj->oa.map + MI_FREQ_END_OFFSET_BYTES;
+
+   switch (devinfo->gen) {
+   case 7:
+   case 8:
+      obj->oa.gt_frequency[0] =
+         ((start_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
+          GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
You can just do:

   GET_FIELD(start_reg[0], GEN6_RPSTAT1_CURR_GT_FREQ)

instead of shifting and masking.

I think your conversions may be wrong.  In particular, you don't handle
Gen9LP and Gen9 differently, while in the kernel, GT_PM_INTERVAL_TO_US
does:

   Gen9 LP:      0.833 -> usec
   Gen9+ non-LP: 1.33  -> usec
   other:        1.28  -> usec

#define INTERVAL_1_28_TO_US(interval)  (((interval) << 7) / 100)
#define INTERVAL_1_33_TO_US(interval)  (((interval) << 2) / 3)
#define INTERVAL_0_833_TO_US(interval) (((interval) * 5)  / 6)
#define GT_PM_INTERVAL_TO_US(dev_priv, interval) (INTEL_GEN(dev_priv) >= 9 ? \
                            (IS_GEN9_LP(dev_priv) ? \
                            INTERVAL_0_833_TO_US(interval) : \
                            INTERVAL_1_33_TO_US(interval)) : \
                            INTERVAL_1_28_TO_US(interval))

I could be mistaken, though.

Actually, the kernel already reads RPSTAT1 and computes the frequency value from it.
I think the current code is equivalent to what the kernel does on big cores and on small cores >= gen9.

On cherryview/valleyview, we need to read another register to figure out the multipliers...

So I'll just leave those small-core gens out for now.


+      obj->oa.gt_frequency[1] =
+         ((end_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
+          GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
+      break;
+   case 9:
+   case 10:
+   case 11:
+      obj->oa.gt_frequency[0] =
+         ((start_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
+          GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
+      obj->oa.gt_frequency[1] =
+         ((end_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
+          GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
+      break;
+   default:
+      unreachable("unexpected gen");
+   }
+
+   /* Put the numbers into Hz. */
+   obj->oa.gt_frequency[0] *= 1000000ULL;
+   obj->oa.gt_frequency[1] *= 1000000ULL;
+}
+
  static int
  get_oa_counter_data(struct brw_context *brw,
                      struct brw_perf_query_object *obj,
@@ -1344,6 +1388,7 @@ get_oa_counter_data(struct brw_context *brw,
     int written = 0;
if (!obj->oa.results_accumulated) {
+      read_gt_frequency(brw, obj);
        accumulate_oa_reports(brw, obj);
        assert(obj->oa.results_accumulated);
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.h b/src/mesa/drivers/dri/i965/brw_performance_query.h
index f62786f7f1c..f8732738b4e 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.h
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.h
@@ -113,6 +113,11 @@ struct brw_perf_query_object
            * Number of reports accumulated to produce the results.
            */
           uint32_t reports_accumulated;
+
+         /**
+          * Frequency of the GT at begin and end of the query.
+          */
+         uint64_t gt_frequency[2];
        } oa;
struct {


_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to