[Intel-gfx] [PATCH 09/15] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines

2016-11-04 Thread sourab . gupta
From: Sourab Gupta 

This patch extends the i915 perf framework to handle perf sample
collection for any given gpu engine. In particular, support for
collecting the timestamp sample type is added, which can be requested
for any engine.
With this, for RCS, timestamps and OA reports can be collected together
and provided to userspace in separate sample fields. For other engines,
the capability to collect timestamps is added.
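
As an illustration, opening such a stream from userspace could look
roughly like the sketch below. The DRM_I915_PERF_PROP_ENGINE and
DRM_I915_PERF_PROP_SAMPLE_TS property names are assumptions based on
this series, not settled uAPI; the open param struct follows the i915
perf interface this series builds on.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    /* Hypothetical sketch: request timestamp samples on the video
     * engine. The two DRM_I915_PERF_PROP_* names are assumed. */
    static int open_ts_stream(int drm_fd)
    {
            uint64_t properties[] = {
                    DRM_I915_PERF_PROP_ENGINE, I915_EXEC_VCS,
                    DRM_I915_PERF_PROP_SAMPLE_TS, 1,
            };
            struct drm_i915_perf_open_param param = {
                    .flags = I915_PERF_FLAG_FD_CLOEXEC,
                    .num_properties = sizeof(properties) /
                                      (2 * sizeof(uint64_t)),
                    .properties_ptr = (uintptr_t)properties,
            };

            /* On success this returns a stream fd whose read()
             * yields the sample records described above. */
            return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
    }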

Note that only a single stream instance can still be opened at any given
time, though that stream may now be opened on any gpu engine for the
collection of timestamp samples.
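
In the open path this restriction stays a single engine-agnostic check,
roughly (a sketch; exclusive_stream is the field moved out of the
OA-specific state by this patch):

    /* Sketch: with exclusive_stream now outside the OA-specific
     * state, a second stream is rejected no matter which engine
     * it targets. */
    if (dev_priv->perf.exclusive_stream) {
            DRM_DEBUG("Stream already in use\n");
            return -EBUSY;
    }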

So, this patch doesn't yet add support for opening multiple concurrent
streams, but it lays the groundwork for that support to be added
subsequently. Part of this groundwork involves having separate
per-engine command stream buffers for holding the generated samples,
and likewise making a few other data structures maintain per-engine
state; see the initialization sketch below.
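
The per-engine groundwork shows up as array fields in the i915_drv.h
hunks below; their setup then becomes a loop over engines, as sketched
here (field names taken from this patch, placement in the init path is
an assumption):

    /* Sketch: per-engine initialization of the perf state arrays
     * added by this patch. */
    int i;

    for (i = 0; i < I915_NUM_ENGINES; i++) {
            INIT_LIST_HEAD(&dev_priv->perf.node_list[i]);
            spin_lock_init(&dev_priv->perf.node_list_lock[i]);
            init_waitqueue_head(&dev_priv->perf.poll_wq[i]);
            atomic_set(&dev_priv->perf.pollin[i], 0);
    }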

Signed-off-by: Sourab Gupta 
---
 drivers/gpu/drm/i915/i915_drv.h  |  35 ++-
 drivers/gpu/drm/i915/i915_perf.c | 635 +--
 drivers/gpu/drm/i915/i915_reg.h  |   2 +
 include/uapi/drm/i915_drm.h  |   7 +
 4 files changed, 445 insertions(+), 234 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0f171f8..a05335a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1814,7 +1814,8 @@ struct i915_perf_stream_ops {
 * Routine to emit the commands in the command streamer associated
 * with the corresponding gpu engine.
 */
-   void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag);
+   void (*command_stream_hook)(struct i915_perf_stream *stream,
+   struct drm_i915_gem_request *req, u32 tag);
 };
 
 enum i915_perf_stream_state {
@@ -1839,6 +1840,9 @@ struct i915_perf_stream {
/* Whether command stream based data collection is enabled */
bool cs_mode;
 
+   /* Whether the OA unit is in use */
+   bool using_oa;
+
const struct i915_perf_stream_ops *ops;
 };
 
@@ -1870,7 +1874,16 @@ struct i915_oa_ops {
 struct i915_perf_cs_data_node {
struct list_head link;
struct drm_i915_gem_request *request;
-   u32 offset;
+
+   /* Offsets into the GEM obj holding the data */
+   u32 start_offset;
+   u32 oa_offset;
+   u32 ts_offset;
+
+   /* buffer size corresponding to this entry */
+   u32 size;
+
+   /* Other metadata */
u32 ctx_id;
u32 pid;
u32 tag;
@@ -2189,14 +2202,14 @@ struct drm_i915_private {
 
spinlock_t hook_lock;
 
-   struct {
-   struct i915_perf_stream *exclusive_stream;
 
-   u32 specific_ctx_id;
+   struct hrtimer poll_check_timer;
+   struct i915_perf_stream *exclusive_stream;
+   wait_queue_head_t poll_wq[I915_NUM_ENGINES];
+   atomic_t pollin[I915_NUM_ENGINES];
 
-   struct hrtimer poll_check_timer;
-   wait_queue_head_t poll_wq;
-   atomic_t pollin;
+   struct {
+   u32 specific_ctx_id;
 
bool periodic;
int period_exponent;
@@ -2241,13 +2254,13 @@ struct drm_i915_private {
u8 *addr;
 #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0)
u32 status;
-   } command_stream_buf;
+   } command_stream_buf[I915_NUM_ENGINES];
 
u32 last_ctx_id;
u32 last_pid;
u32 last_tag;
-   struct list_head node_list;
-   spinlock_t node_list_lock;
+   struct list_head node_list[I915_NUM_ENGINES];
+   spinlock_t node_list_lock[I915_NUM_ENGINES];
} perf;
 
 /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index ca523b1..516fd54 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -250,12 +250,17 @@ static u32 i915_perf_stream_paranoid = true;
 /* For determining the behavior on overflow of command stream samples */
 #define CMD_STREAM_BUF_OVERFLOW_ALLOWED
 
-/* Data common to periodic and RCS based samples */
-struct oa_sample_data {
+#define OA_ADDR_ALIGN 64
+#define TS_ADDR_ALIGN 8
+#define I915_PERF_TS_SAMPLE_SIZE 8
+
+/* Data common to all samples (periodic OA / CS based OA / Timestamps) */
+struct sample_data {
u32 source;
u32 ctx_id;
u32 pid;
u32 tag;
+   u64 ts;
const u8 *report;
 };
 
@@ -313,6 +318,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
 #define SAMPLE_CTX_ID  (1<<2)
 #define SAMPLE_PID (1<<3)
 #define SAMPLE_TAG (1<<4)
+#define SAMPLE_TS (1<<5)

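For context, the size of one record a reader steps through follows from
which of these SAMPLE_* flags the stream was opened with; a sketch (the
field order and the OA report handling are assumptions, field sizes
follow struct sample_data):

    /* Sketch: computing one sample record's size from the stream's
     * SAMPLE_* flags. Ordering and OA report sizing are assumed. */
    size_t sample_size = 0;

    if (sample_flags & SAMPLE_OA_SOURCE_INFO)
            sample_size += 4;                        /* u32 source */
    if (sample_flags & SAMPLE_CTX_ID)
            sample_size += 4;                        /* u32 ctx_id */
    if (sample_flags & SAMPLE_PID)
            sample_size += 4;                        /* u32 pid */
    if (sample_flags & SAMPLE_TAG)
            sample_size += 4;                        /* u32 tag */
    if (sample_flags & SAMPLE_TS)
            sample_size += I915_PERF_TS_SAMPLE_SIZE; /* u64 ts */
    if (sample_flags & SAMPLE_OA_REPORT)
            sample_size += oa_report_size;  /* device-specific size */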