[Intel-gfx] [CI 6/9] drm/i915/perf: execute OA configuration from command stream

2019-10-11 Thread Chris Wilson
From: Lionel Landwerlin 

We haven't run into issues with programming the global OA/NOA
registers configuration from CPU so far, but HW engineers actually
recommend doing this from the command streamer. On TGL in particular
one of the clock domains in which some of that programming goes might
not be powered when we poke things from the CPU.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

This also allows us to significantly reduce the amount of time we hold
the main lock.

v2: Drop the global lock as much as possible

v3: Take global lock to pin global

v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel)

v5: Move locking to the stream (Lionel)

v6: Move active reconfiguration request into i915_perf_stream (Lionel)

v7: Pin VMA outside request creation (Chris)
Lock VMA before move to active (Chris)

v8: Fix double free on stream->initial_oa_config_bo (Lionel)
Don't allow interruption when waiting on active config request
(Lionel)

Signed-off-by: Lionel Landwerlin 
Reviewed-by: Chris Wilson 
Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_perf.c | 199 ---
 1 file changed, 156 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index abb7a70e17ec..c2431b5a1f55 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1731,56 +1731,181 @@ static int alloc_noa_wait(struct i915_perf_stream 
*stream)
return 0;
 
 err_unpin:
-   __i915_vma_unpin(vma);
+   i915_vma_unpin_and_release(&vma, 0);
 err_unref:
i915_gem_object_put(bo);
return ret;
 }
 
-static void config_oa_regs(struct intel_uncore *uncore,
-  const struct i915_oa_reg *regs,
-  u32 n_regs)
+static u32 *write_cs_mi_lri(u32 *cs,
+   const struct i915_oa_reg *reg_data,
+   u32 n_regs)
 {
u32 i;
 
for (i = 0; i < n_regs; i++) {
-   const struct i915_oa_reg *reg = regs + i;
+   if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+   u32 n_lri = min_t(u32,
+ n_regs - i,
+ MI_LOAD_REGISTER_IMM_MAX_REGS);
+
+   *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+   }
+   *cs++ = i915_mmio_reg_offset(reg_data[i].addr);
+   *cs++ = reg_data[i].value;
+   }
+
+   return cs;
+}
+
+static int num_lri_dwords(int num_regs)
+{
+   int count = 0;
+
+   if (num_regs > 0) {
+   count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+   count += num_regs * 2;
+   }
+
+   return count;
+}
+
+static struct i915_oa_config_bo *
+alloc_oa_config_buffer(struct i915_perf_stream *stream,
+  struct i915_oa_config *oa_config)
+{
+   struct drm_i915_gem_object *obj;
+   struct i915_oa_config_bo *oa_bo;
+   size_t config_length = 0;
+   u32 *cs;
+   int err;
+
+   oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+   if (!oa_bo)
+   return ERR_PTR(-ENOMEM);
+
+   config_length += num_lri_dwords(oa_config->mux_regs_len);
+   config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+   config_length += num_lri_dwords(oa_config->flex_regs_len);
+   config_length++; /* MI_BATCH_BUFFER_END */
+   config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
+
+   obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
+   if (IS_ERR(obj)) {
+   err = PTR_ERR(obj);
+   goto err_free;
+   }
+
+   cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
+   if (IS_ERR(cs)) {
+   err = PTR_ERR(cs);
+   goto err_oa_bo;
+   }
 
-   intel_uncore_write(uncore, reg->addr, reg->value);
+   cs = write_cs_mi_lri(cs,
+oa_config->mux_regs,
+oa_config->mux_regs_len);
+   cs = write_cs_mi_lri(cs,
+oa_config->b_counter_regs,
+oa_config->b_counter_regs_len);
+   cs = write_cs_mi_lri(cs,
+oa_config->flex_regs,
+oa_config->flex_regs_len);
+
+   *cs++ = MI_BATCH_BUFFER_END;
+
+   i915_gem_object_flush_map(obj);
+   i915_gem_object_unpin_map(obj);
+
+   oa_bo->vma = i915_vma_instance(obj,
+  &stream->engine->gt->ggtt->vm,
+  NULL);
+   if (IS_ERR(oa_bo->vma)) {
+   err = PTR_ERR(oa_bo->vma);
+   goto err_oa_bo;
}
+
+   oa_bo->oa_config = i915_oa_config_get(oa_config);
+   llist_add(&oa_bo->node, &stream->oa_config_bos);
+
+   return oa_bo;
+

[Intel-gfx] [CI 6/9] drm/i915/perf: execute OA configuration from command stream

2019-10-10 Thread Chris Wilson
From: Lionel Landwerlin 

We haven't run into issues with programming the global OA/NOA
registers configuration from CPU so far, but HW engineers actually
recommend doing this from the command streamer. On TGL in particular
one of the clock domains in which some of that programming goes might
not be powered when we poke things from the CPU.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

This also allows us to significantly reduce the amount of time we hold
the main lock.

v2: Drop the global lock as much as possible

v3: Take global lock to pin global

v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel)

v5: Move locking to the stream (Lionel)

v6: Move active reconfiguration request into i915_perf_stream (Lionel)

v7: Pin VMA outside request creation (Chris)
Lock VMA before move to active (Chris)

v8: Fix double free on stream->initial_oa_config_bo (Lionel)
Don't allow interruption when waiting on active config request
(Lionel)

Signed-off-by: Lionel Landwerlin 
Reviewed-by: Chris Wilson 
Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_perf.c | 199 ---
 1 file changed, 156 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index caa4ab68cea5..c37fe275cf33 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1731,56 +1731,181 @@ static int alloc_noa_wait(struct i915_perf_stream 
*stream)
return 0;
 
 err_unpin:
-   __i915_vma_unpin(vma);
+   i915_vma_unpin_and_release(&vma, 0);
 err_unref:
i915_gem_object_put(bo);
return ret;
 }
 
-static void config_oa_regs(struct intel_uncore *uncore,
-  const struct i915_oa_reg *regs,
-  u32 n_regs)
+static u32 *write_cs_mi_lri(u32 *cs,
+   const struct i915_oa_reg *reg_data,
+   u32 n_regs)
 {
u32 i;
 
for (i = 0; i < n_regs; i++) {
-   const struct i915_oa_reg *reg = regs + i;
+   if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+   u32 n_lri = min_t(u32,
+ n_regs - i,
+ MI_LOAD_REGISTER_IMM_MAX_REGS);
+
+   *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+   }
+   *cs++ = i915_mmio_reg_offset(reg_data[i].addr);
+   *cs++ = reg_data[i].value;
+   }
+
+   return cs;
+}
+
+static int num_lri_dwords(int num_regs)
+{
+   int count = 0;
+
+   if (num_regs > 0) {
+   count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+   count += num_regs * 2;
+   }
+
+   return count;
+}
+
+static struct i915_oa_config_bo *
+alloc_oa_config_buffer(struct i915_perf_stream *stream,
+  struct i915_oa_config *oa_config)
+{
+   struct drm_i915_gem_object *obj;
+   struct i915_oa_config_bo *oa_bo;
+   size_t config_length = 0;
+   u32 *cs;
+   int err;
+
+   oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+   if (!oa_bo)
+   return ERR_PTR(-ENOMEM);
+
+   config_length += num_lri_dwords(oa_config->mux_regs_len);
+   config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+   config_length += num_lri_dwords(oa_config->flex_regs_len);
+   config_length++; /* MI_BATCH_BUFFER_END */
+   config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
+
+   obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
+   if (IS_ERR(obj)) {
+   err = PTR_ERR(obj);
+   goto err_free;
+   }
+
+   cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
+   if (IS_ERR(cs)) {
+   err = PTR_ERR(cs);
+   goto err_oa_bo;
+   }
 
-   intel_uncore_write(uncore, reg->addr, reg->value);
+   cs = write_cs_mi_lri(cs,
+oa_config->mux_regs,
+oa_config->mux_regs_len);
+   cs = write_cs_mi_lri(cs,
+oa_config->b_counter_regs,
+oa_config->b_counter_regs_len);
+   cs = write_cs_mi_lri(cs,
+oa_config->flex_regs,
+oa_config->flex_regs_len);
+
+   *cs++ = MI_BATCH_BUFFER_END;
+
+   i915_gem_object_flush_map(obj);
+   i915_gem_object_unpin_map(obj);
+
+   oa_bo->vma = i915_vma_instance(obj,
+  &stream->engine->gt->ggtt->vm,
+  NULL);
+   if (IS_ERR(oa_bo->vma)) {
+   err = PTR_ERR(oa_bo->vma);
+   goto err_oa_bo;
}
+
+   oa_bo->oa_config = i915_oa_config_get(oa_config);
+   llist_add(&oa_bo->node, &stream->oa_config_bos);
+
+   return oa_bo;
+