[Intel-gfx] [PATCH] drm/i915/perf: implement active wait for noa configurations

2019-10-10 Thread Chris Wilson
From: Lionel Landwerlin 

NOA configuration take some amount of time to apply. That amount of
time depends on the size of the GT. There is no documented time for
this. For example, past experimentations with powergating
configuration changes seem to indicate a 60~70us delay. We go with
500us as default for now which should be over the required amount of
time (according to HW architects).

v2: Don't forget to save/restore registers used for the wait (Chris)

v3: Name used CS_GPR registers (Chris)
Fix compile issue due to rebase (Lionel)

v4: Fix save/restore helpers (Umesh)

v5: Move noa_wait from drm_i915_private to i915_perf_stream (Lionel)

v6: Add missing struct declarations in i915_perf.h

Signed-off-by: Lionel Landwerlin 
Reviewed-by: Chris Wilson  (v4)
Signed-off-by: Chris Wilson  (v4)
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   4 +-
 drivers/gpu/drm/i915/gt/intel_gt_types.h  |   5 +
 drivers/gpu/drm/i915/i915_debugfs.c   |  32 +++
 drivers/gpu/drm/i915/i915_perf.c  | 224 ++
 drivers/gpu/drm/i915/i915_perf_types.h|   8 +
 drivers/gpu/drm/i915/i915_reg.h   |   4 +-
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 drivers/gpu/drm/i915/selftests/i915_perf.c| 216 +
 8 files changed, 492 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/selftests/i915_perf.c

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 0987100c786b..8e63cffcabe0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -163,7 +163,8 @@
 #define MI_BATCH_BUFFER_START  MI_INSTR(0x31, 0)
 #define   MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
-#define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
+#define   MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/
 
 /*
  * 3D instructions used by the kernel
@@ -224,6 +225,7 @@
 #define   PIPE_CONTROL_CS_STALL(1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE  (1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR   (1<<16)
+#define   PIPE_CONTROL_WRITE_TIMESTAMP (3<<14)
 #define   PIPE_CONTROL_QW_WRITE(1<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK(3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL (1<<13)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h 
b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index 802f516a3430..be4b263621c8 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -109,6 +109,11 @@ enum intel_gt_scratch_field {
/* 8 bytes */
INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
 
+   /* 6 * 8 bytes */
+   INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+   /* 4 bytes */
+   INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
 };
 
 #endif /* __INTEL_GT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 277f31297f29..d463a28b7475 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3590,6 +3590,37 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
 
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+   struct drm_i915_private *i915 = data;
+   const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
+
+   /*
+* This would lead to infinite waits as we're doing timestamp
+* difference on the CS with only 32bits.
+*/
+   if (val > mul_u32_u32(U32_MAX, clk))
+   return -EINVAL;
+
+   atomic64_set(>perf.noa_programming_delay, val);
+   return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+   struct drm_i915_private *i915 = data;
+
+   *val = atomic64_read(>perf.noa_programming_delay);
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+   i915_perf_noa_delay_get,
+   i915_perf_noa_delay_set,
+   "%llu\n");
+
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND BIT(1)
 #define DROP_RETIREBIT(2)
@@ -4340,6 +4371,7 @@ static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
 } i915_debugfs_files[] = {
+   {"i915_perf_noa_delay", _perf_noa_delay_fops},
{"i915_wedged", _wedged_fops},
{"i915_cache_sharing", _cache_sharing_fops},
{"i915_gem_drop_caches", _drop_caches_fops},
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 7d7baee7febe..abb7a70e17ec 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ 

[Intel-gfx] [PATCH] drm/i915/perf: implement active wait for noa configurations

2019-10-09 Thread Chris Wilson
From: Lionel Landwerlin 

NOA configuration take some amount of time to apply. That amount of
time depends on the size of the GT. There is no documented time for
this. For example, past experimentations with powergating
configuration changes seem to indicate a 60~70us delay. We go with
500us as default for now which should be over the required amount of
time (according to HW architects).

v2: Don't forget to save/restore registers used for the wait (Chris)

v3: Name used CS_GPR registers (Chris)
Fix compile issue due to rebase (Lionel)

v4: Fix save/restore helpers (Umesh)

v5: Move noa_wait from drm_i915_private to i915_perf_stream (Lionel)

v6: Add missing struct declarations in i915_perf.h

Signed-off-by: Lionel Landwerlin 
Reviewed-by: Chris Wilson  (v4)
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   4 +-
 drivers/gpu/drm/i915/gt/intel_gt_types.h  |   5 +
 drivers/gpu/drm/i915/i915_debugfs.c   |  32 +++
 drivers/gpu/drm/i915/i915_perf.c  | 223 ++
 drivers/gpu/drm/i915/i915_perf_types.h|   8 +
 drivers/gpu/drm/i915/i915_reg.h   |   4 +-
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 drivers/gpu/drm/i915/selftests/i915_perf.c| 217 +
 8 files changed, 492 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/selftests/i915_perf.c

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 0987100c786b..8e63cffcabe0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -163,7 +163,8 @@
 #define MI_BATCH_BUFFER_START  MI_INSTR(0x31, 0)
 #define   MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
-#define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
+#define   MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/
 
 /*
  * 3D instructions used by the kernel
@@ -224,6 +225,7 @@
 #define   PIPE_CONTROL_CS_STALL(1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE  (1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR   (1<<16)
+#define   PIPE_CONTROL_WRITE_TIMESTAMP (3<<14)
 #define   PIPE_CONTROL_QW_WRITE(1<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK(3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL (1<<13)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h 
b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index 802f516a3430..be4b263621c8 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -109,6 +109,11 @@ enum intel_gt_scratch_field {
/* 8 bytes */
INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
 
+   /* 6 * 8 bytes */
+   INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+   /* 4 bytes */
+   INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
 };
 
 #endif /* __INTEL_GT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 277f31297f29..d463a28b7475 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3590,6 +3590,37 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
 
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+   struct drm_i915_private *i915 = data;
+   const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
+
+   /*
+* This would lead to infinite waits as we're doing timestamp
+* difference on the CS with only 32bits.
+*/
+   if (val > mul_u32_u32(U32_MAX, clk))
+   return -EINVAL;
+
+   atomic64_set(>perf.noa_programming_delay, val);
+   return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+   struct drm_i915_private *i915 = data;
+
+   *val = atomic64_read(>perf.noa_programming_delay);
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+   i915_perf_noa_delay_get,
+   i915_perf_noa_delay_set,
+   "%llu\n");
+
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND BIT(1)
 #define DROP_RETIREBIT(2)
@@ -4340,6 +4371,7 @@ static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
 } i915_debugfs_files[] = {
+   {"i915_perf_noa_delay", _perf_noa_delay_fops},
{"i915_wedged", _wedged_fops},
{"i915_cache_sharing", _cache_sharing_fops},
{"i915_gem_drop_caches", _drop_caches_fops},
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5e14dd6c4c78..6bf02e4c5f09 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -198,6 +198,7 @@