Re: [Intel-gfx] [PATCH v2 3/5] drm/i915: context submission pvmmio optimization

2018-10-31 Thread Zhang, Xiaolin
Ping review, thanks very much. 

BRs, Xiaolin 

-----Original Message-----
From: Zhang, Xiaolin 
Sent: Friday, October 19, 2018 3:27 PM
To: intel-gfx@lists.freedesktop.org
Cc: intel-gvt-...@lists.freedesktop.org; Zhang, Xiaolin; Zhenyu Wang;
Wang, Zhi A; Chris Wilson; Joonas Lahtinen; He, Min; Jiang, Fei;
Gong, Zhipeng; Yuan, Hang; Lv, Zhiyuan
Subject: [PATCH v2 3/5] drm/i915: context submission pvmmio optimization

[Intel-gfx] [PATCH v2 3/5] drm/i915: context submission pvmmio optimization

2018-10-19 Thread Xiaolin Zhang
This is a performance optimization that reduces the number of MMIO traps
during ELSP port writes (context submission) from 4 to 1.

On context submission, the ELSP dwords are cached in the shared page's
elsp_data[4] array and only the last port write is trapped to GVT, which
then performs the real context submission from the cached values.

The PVMMIO_ELSP_SUBMIT capability bit controls this level of pvmmio
optimization.
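
For context, a rough sketch of the vgpu-side state this mechanism relies on
(the real definitions come from earlier patches in this series, so everything
below except elsp_data, pv_caps, shared_page and shared_page_lock, which are
referenced by the diffs here, should be read as an assumption):

struct gvt_shared_page {
	u32 elsp_data[4];	/* ELSP dwords cached for one submission */
	/* other paravirtualized data omitted */
};

struct i915_virtual_gpu {
	bool active;
	u32 caps;
	u32 pv_caps;				/* e.g. PVMMIO_ELSP_SUBMIT */
	struct gvt_shared_page *shared_page;	/* page shared with the GVT-g host */
	spinlock_t shared_page_lock;		/* serializes elsp_data updates */
};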

v0: RFC
v1: rebase
v2: added pv ops for pv context submission. To maximize code reuse,
introduced 2 more ops (submit_ports & preempt_context) in the engine
structure instead of 1 op (set_default_submission). pv versions of
submit_ports and preempt_context implemented.
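
The hunk that actually installs these ops is not visible in the quoted diff
below, so purely as an illustration, the selection could look roughly like
this sketch (where exactly the ops live, shown here directly on the engine,
and the helper intel_vgpu_enabled_pv_caps() are assumptions, not code from
this patch):

static void execlists_set_default_submission_sketch(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (intel_vgpu_active(i915) &&
	    intel_vgpu_enabled_pv_caps(i915, PVMMIO_ELSP_SUBMIT)) {
		/* pv paths: cache ELSP dwords, trap only the final write */
		engine->submit_ports = execlists_submit_ports_pv;
		engine->preempt_context = inject_preempt_context_pv;
	} else {
		/* default execlists submission paths */
		engine->submit_ports = execlists_submit_ports;
		engine->preempt_context = inject_preempt_context;
	}
}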

Cc: Zhenyu Wang 
Cc: Zhi Wang 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: He, Min 
Cc: Jiang, Fei 
Cc: Gong, Zhipeng 
Cc: Yuan, Hang 
Cc: Zhiyuan Lv 
Signed-off-by: Xiaolin Zhang 
---
 drivers/gpu/drm/i915/i915_vgpu.c|  2 +
 drivers/gpu/drm/i915/intel_lrc.c| 88 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index cb409d5..9870ea6 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -66,6 +66,8 @@ void i915_check_vgpu(struct drm_i915_private *dev_priv)
 
BUILD_BUG_ON(sizeof(struct vgt_if) != VGT_PVINFO_SIZE);
 
+   dev_priv->vgpu.pv_caps = PVMMIO_ELSP_SUBMIT;
+
magic = __raw_i915_read64(dev_priv, vgtif_reg(magic));
if (magic != VGT_MAGIC)
return;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 22b57b8..9e6ccf9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -460,6 +460,60 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
+static void execlists_submit_ports_pv(struct intel_engine_cs *engine)
+{
+   struct intel_engine_execlists *execlists = &engine->execlists;
+   struct execlist_port *port = execlists->port;
+   u32 __iomem *elsp =
+   engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+   u32 *elsp_data;
+   unsigned int n;
+   u32 descs[4];
+   int i = 0;
+
+   /*
+* ELSQ note: the submit queue is not cleared after being submitted
+* to the HW so we need to make sure we always clean it up. This is
+* currently ensured by the fact that we always write the same number
+* of elsq entries, keep this in mind before changing the loop below.
+*/
+   for (n = execlists_num_ports(execlists); n--; ) {
+   struct i915_request *rq;
+   unsigned int count;
+   u64 desc;
+
+   rq = port_unpack(&port[n], &count);
+   if (rq) {
+   GEM_BUG_ON(count > !n);
+   if (!count++)
+   execlists_context_schedule_in(rq);
+   port_set(&port[n], port_pack(rq, count));
+   desc = execlists_update_context(rq);
+   } else {
+   GEM_BUG_ON(!n);
+   desc = 0;
+   }
+   GEM_BUG_ON(i >= 4);
+   descs[i] = upper_32_bits(desc);
+   descs[i + 1] = lower_32_bits(desc);
+   i += 2;
+   }
+
+   spin_lock(&engine->i915->vgpu.shared_page_lock);
+   elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+   *elsp_data = descs[0];
+   *(elsp_data + 1) = descs[1];
+   *(elsp_data + 2) = descs[2];
+   writel(descs[3], elsp);
+   spin_unlock(&engine->i915->vgpu.shared_page_lock);
+
+   /* we need to manually load the submit queue */
+   if (execlists->ctrl_reg)
+   writel(EL_CTRL_LOAD, execlists->ctrl_reg);
+
+   execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+}
+
 static bool ctx_single_port_submission(const struct intel_context *ce)
 {
return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
@@ -497,7 +551,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 
GEM_BUG_ON(execlists->preempt_complete_status !=
   upper_32_bits(ce->lrc_desc));
-
/*
 * Switch to our empty preempt context so
 * the state of the GPU is known (idle).
@@ -516,6 +569,27 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
+static void inject_preempt_context_pv(struct intel_engine_cs *engine)
+{
+   struct intel_engine_execlists *execlists = &engine->execlists;
+   struct intel_context *ce =
+   to_intel_context(engine->i915->preempt_context, engine);
+   u32 __iomem *elsp =
+   engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+   u32 *elsp_data;
+
+   GEM_BUG_ON(execlists->preempt_complete_status !=
+