Re: [Intel-gfx] [PATCH v3 2/2] drm/i915/tgl: MOCS table update

2019-11-13 Thread Lis, Tomasz


On 2019-11-13 02:09, Lucas De Marchi wrote:

On Tue, Nov 12, 2019 at 02:47:57PM -0800, Matt Roper wrote:

The bspec was just updated with a minor correction to entry 61 (it
shouldn't have had the SCF bit set).

v2:
- Add a MOCS_ENTRY_UNUSED() and use it to declare the
  explicitly-reserved MOCS entries. (Lucas)
- Move the warning suppression from the Makefile to a #pragma that only
  affects the TGL table. (Lucas)

v3:
- Entries 16 and 17 are identical to ICL now, so no need to explicitly
  adjust them (or mess with compiler warning overrides).

Bspec: 45101
Fixes: 2ddf992179c4 ("drm/i915/tgl: Define MOCS entries for Tigerlake")
Cc: Tomasz Lis 
Cc: Lucas De Marchi 
Cc: Francisco Jerez 
Cc: Jon Bloomfield 
Signed-off-by: Matt Roper 



Reviewed-by: Lucas De Marchi 

Lucas De Marchi


Reviewed-by: Tomasz Lis 

Tomasz




---
drivers/gpu/drm/i915/gt/intel_mocs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c 
b/drivers/gpu/drm/i915/gt/intel_mocs.c

index 06e2adbf27be..2b977991b785 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.c
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
@@ -263,7 +263,7 @@ static const struct drm_i915_mocs_entry 
tigerlake_mocs_table[] = {

   L3_1_UC),
/* HW Special Case (Displayable) */
MOCS_ENTRY(61,
-   LE_1_UC | LE_TC_1_LLC | LE_SCF(1),
+   LE_1_UC | LE_TC_1_LLC,
   L3_3_WB),
};

--
2.21.0


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH 3/4] drm/i915/tgl: Tigerlake only has global MOCS registers

2019-07-26 Thread Lis, Tomasz



On 2019-07-26 02:12, Lucas De Marchi wrote:

From: Michel Thierry 

Until Icelake, each engine had its own set of 64 MOCS registers. In
order to simplify, Tigerlake moves to only 64 Global MOCS registers,
which are no longer part of the engine context. Since these registers
are now global, they also only need to be initialized once.

 From Gen12 onwards, MOCS must specify the target cache (3:2) and LRU
management (5:4) fields and cannot be programmed to 'use the value from
Private PAT', because these fields are no longer part of the PPAT. Also
cacheability control (1:0) field has changed, 00 no longer means 'use
controls from page table', but uncacheable (UC).

v2: Move the changes to the fault registers to a separate commit - the old ones
 overlap with the range used by the new global MOCS (requested by
 Daniele)

Cc: Daniele Ceraolo Spurio 
Cc: Tomasz Lis 
Signed-off-by: Michel Thierry 
Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Lucas De Marchi 
---
  drivers/gpu/drm/i915/gt/intel_mocs.c | 47 
  drivers/gpu/drm/i915/gt/intel_mocs.h |  1 +
  drivers/gpu/drm/i915/i915_drv.h  |  2 +
  drivers/gpu/drm/i915/i915_gem.c  |  1 +
  drivers/gpu/drm/i915/i915_gpu_error.c|  6 ++-
  drivers/gpu/drm/i915/i915_pci.c  |  3 +-
  drivers/gpu/drm/i915/i915_reg.h  |  2 +
  drivers/gpu/drm/i915/intel_device_info.h |  1 +
  8 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c 
b/drivers/gpu/drm/i915/gt/intel_mocs.c
index ca370c7487f9..9399c0ec08f1 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.c
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
@@ -377,6 +377,10 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine)
unsigned int index;
u32 unused_value;
  
+	/* Platforms with global MOCS do not need per-engine initialization. */

+   if (HAS_GLOBAL_MOCS_REGISTERS(gt->i915))
+   return;
+
/* Called under a blanket forcewake */
assert_forcewakes_active(uncore, FORCEWAKE_ALL);
  
@@ -401,6 +405,46 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine)

  unused_value);
  }
  
+/**

+ * intel_mocs_init_global() - program the global mocs registers
+ * gt:  pointer to struct intel_gt
+ *
+ * This function initializes the MOCS global registers.
+ */
+void intel_mocs_init_global(struct intel_gt *gt)
+{
+   struct intel_uncore *uncore = gt->uncore;
+   struct drm_i915_mocs_table table;
+   unsigned int index;
+
+   if (!HAS_GLOBAL_MOCS_REGISTERS(gt->i915))
+   return;
+
+   if (!get_mocs_settings(gt, &table))
+   return;
+
+   if (GEM_DEBUG_WARN_ON(table.size > table.n_entries))
+   return;
+
+   for (index = 0; index < table.size; index++)
+   intel_uncore_write(uncore,
+  GEN12_GLOBAL_MOCS(index),
+  table.table[index].control_value);
+
+   /*
+* Ok, now set the unused entries to uncached. These entries
+* are officially undefined and no contract for the contents
+* and settings is given for these entries.
+*
+* Entry 0 in the table is uncached - so we are just writing
+* that value to all the used entries.
+*/
+   for (; index < table.n_entries; index++)
+   intel_uncore_write(uncore,
+  GEN12_GLOBAL_MOCS(index),
+  table.table[0].control_value);
While get_mocs_settings() can return a table with less than 64 entries, 
it will never be the case for platforms supporting global MOCS.
So this for() is actually dead code... but removing it could cause harm
in case this is forgotten and modifications are made, so I'd leave it as is.


R-b: Tomasz Lis 

-Tomasz

+}
+
  /**
   * emit_mocs_control_table() - emit the mocs control table
   * @rq:   Request to set up the MOCS table for.
@@ -604,6 +648,9 @@ int intel_rcs_context_init_mocs(struct i915_request *rq)
struct drm_i915_mocs_table t;
int ret;
  
+	if (HAS_GLOBAL_MOCS_REGISTERS(rq->i915))

+   return 0;
+
if (get_mocs_settings(rq->engine->gt, &t)) {
/* Program the RCS control registers */
ret = emit_mocs_control_table(rq, );
diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.h 
b/drivers/gpu/drm/i915/gt/intel_mocs.h
index 8b9813e6f9ac..aa3a2df07c82 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.h
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.h
@@ -56,6 +56,7 @@ struct intel_gt;
  
  int intel_rcs_context_init_mocs(struct i915_request *rq);

  void intel_mocs_init_l3cc_table(struct intel_gt *gt);
+void intel_mocs_init_global(struct intel_gt *gt);
  void intel_mocs_init_engine(struct intel_engine_cs *engine);
  
  #endif

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index

Re: [Intel-gfx] [PATCH 18/22] drm/i915/tgl: Define MOCS entries for Tigerlake

2019-07-25 Thread Lis, Tomasz



On 2019-07-25 00:32, Lucas De Marchi wrote:

On Thu, Jul 18, 2019 at 10:09:27AM -0700, Daniele Ceraolo Spurio wrote:



On 7/18/19 6:08 AM, Ville Syrjälä wrote:

On Fri, Jul 12, 2019 at 06:09:36PM -0700, Lucas De Marchi wrote:

From: Tomasz Lis 

The MOCS table is published as part of bspec, and versioned. Entries
are supposed to never be modified, but new ones can be added. Adding
entries increases table version. The patch includes version 1 entries.

Two of the 3 legacy entries used for gen9 are no longer expected to 
work.
Although we are changing the gen11 table, those changes are 
supposed to

be backward compatible since we are only touching previously undefined
entries.

Cc: Joonas Lahtinen 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Tomasz Lis 
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/i915/gt/intel_mocs.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c 
b/drivers/gpu/drm/i915/gt/intel_mocs.c

index 290a5e9b90b9..259e7bec0a63 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.c
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
@@ -62,6 +62,10 @@ struct drm_i915_mocs_table {
 #define GEN11_NUM_MOCS_ENTRIES    64  /* 63-64 are reserved, but 
configured. */

 /* (e)LLC caching options */
+/*
+ * Note: LE_0_PAGETABLE works only up to Gen11; for newer gens it 
means

+ * the same as LE_UC
+ */
 #define LE_0_PAGETABLE    _LE_CACHEABILITY(0)
 #define LE_1_UC    _LE_CACHEABILITY(1)
 #define LE_2_WT    _LE_CACHEABILITY(2)
@@ -100,8 +104,9 @@ struct drm_i915_mocs_table {
  * of bspec.
  *
  * Entries not part of the following tables are undefined as far as
- * userspace is concerned and shouldn't be relied upon. For the time
- * being they will be initialized to PTE.
+ * userspace is concerned and shouldn't be relied upon. For Gen < 12
+ * they will be initialized to PTE. Gen >= 12 onwards don't have a 
setting for

+ * PTE. We use the same value, but that actually means Uncached.
  *
  * The last two entries are reserved by the hardware. For ICL+ they
  * should be initialized according to bspec and never used, for older
@@ -137,11 +142,13 @@ static const struct drm_i915_mocs_entry 
broxton_mocs_table[] = {

 };
 #define GEN11_MOCS_ENTRIES \
-    /* Base - Uncached (Deprecated) */ \
+    /* Gen11: Base - Uncached (Deprecated) */ \
+    /* Gen12+: Base - Error (Reserved for Non-Use) */ \
 MOCS_ENTRY(I915_MOCS_UNCACHED, \
    LE_1_UC | LE_TC_1_LLC, \
    L3_1_UC), \
 /* Base - L3 + LeCC:PAT (Deprecated) */ \
+    /* Gen12+: Base - Reserved */ \
 MOCS_ENTRY(I915_MOCS_PTE, \
    LE_0_PAGETABLE | LE_TC_1_LLC, \
    L3_3_WB), \
@@ -233,6 +240,18 @@ static const struct drm_i915_mocs_entry 
broxton_mocs_table[] = {

 MOCS_ENTRY(23, \
    LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | 
LE_SCC(7), \

    L3_3_WB), \
+    /* Gen12+: HW Reserved - HDC:L1 + L3 + LLC */ \


Why is this marked as reserved? From the looks of things 48-61 should
just be normal entries that userspace can select to get HDC L1$. And
looks like icl already has that stuff. So someone should probably 
figure

out if Mesa/etc. can make use of the HDC L1$, and if so we should add
the relevant MOCS entries for icl as well.


Here the reserved terminology is indeed misleading. The 48-59 range 
is a "special" range where L1 usage is implicitly enabled by the HW, 
as there is no explicit L1 toggle in the MOCS registers. The reserved 
here means that the range shouldn't be used for "normal" MOCS 
settings, but SW can freely use these entries as needed. Similarly, 
MOCS 60 and 61 are reserved for other special purposes, but are still 
usable by SW. The only entries SW shouldn't touch are 62 and 63.


Regarding ICL, Gen11 HW doesn't have the capability so no new entries 
are required there.
It might be a good idea to find a better word for it than "reserved". 
Not only for this patch, but to be used everywhere, especially the spec.

But that exceeds the scope of this review.





+    MOCS_ENTRY(48, \
+   LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \
+   L3_3_WB), \
+    /* Gen12+: HW Reserved - HW Special Case (CCS) */ \


The specs have MOCS 49-51 defined as well.


humn... it seems they got added later.

I'm not sure anymore if we should update igt so it doesn't expect those
entries to be set to PTE or if we should stop reusing the same table for
ICL and TGL. Spec doesn't mention the compatibility of this table with
gen 11 anymore. Thoughts?



It doesn't sound right to change the implementation decision to keep them
together, only because this allows to keep tests intact.

I don't believe there's a reason to verify undefined entries in IGT.
Sometimes we do want to verify our specific implementation instead of
pure specs compliance; but I don't see a reason for this should be the 
case here.


The existing MOCS entries are supposed to be unchangeable (in 

Re: [Intel-gfx] [PATCH v2 11/22] drm/i915/guc: Reset GuC ADS during sanitize

2019-04-16 Thread Lis, Tomasz


Only comment issues. Besides these:

Reviewed-by: Tomasz Lis 

On 2019-04-11 10:44, Michal Wajdeczko wrote:

GuC stores some data in there, which might be stale after a reset.
Reinitialize whole ADS in case any part of it was corrupted during
previous GuC run.

Signed-off-by: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Cc: Michał Winiarski 
Cc: Tomasz Lis 
---
  drivers/gpu/drm/i915/intel_guc.h |  2 +
  drivers/gpu/drm/i915/intel_guc_ads.c | 85 ++--
  drivers/gpu/drm/i915/intel_guc_ads.h |  1 +
  3 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 2c59ff8d9f39..4f3cf8eddfe6 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -26,6 +26,7 @@
  #define _INTEL_GUC_H_
  
  #include "intel_uncore.h"

+#include "intel_guc_ads.h"
  #include "intel_guc_fw.h"
  #include "intel_guc_fwif.h"
  #include "intel_guc_ct.h"
@@ -177,6 +178,7 @@ u32 intel_guc_reserved_gtt_size(struct intel_guc *guc);
  static inline int intel_guc_sanitize(struct intel_guc *guc)
  {
	intel_uc_fw_sanitize(&guc->fw);
+   intel_guc_ads_reset(guc);
return 0;
  }
  
diff --git a/drivers/gpu/drm/i915/intel_guc_ads.c b/drivers/gpu/drm/i915/intel_guc_ads.c

index abab5cb6909a..97926effb944 100644
--- a/drivers/gpu/drm/i915/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/intel_guc_ads.c
@@ -72,43 +72,28 @@ static void guc_ct_pool_entries_init(struct 
guc_ct_pool_entry *pool, u32 num)
   */
  #define LR_HW_CONTEXT_SIZE(80 * sizeof(u32))
  
-/**

- * intel_guc_ads_create() - creates GuC ADS
- * @guc: intel_guc struct
- *
- */
-int intel_guc_ads_create(struct intel_guc *guc)
+/* The ads obj includes the struct itself and buffers passed to GuC */
+struct __guc_ads_blob {
+   struct guc_ads ads;
+   struct guc_policies policies;
+   struct guc_mmio_reg_state reg_state;
+   struct guc_gt_system_info system_info;
+   struct guc_clients_info clients_info;
+   struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE];
+   u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
+} __packed;
+
+static int __guc_ads_reinit(struct intel_guc *guc)
  {
struct drm_i915_private *dev_priv = guc_to_i915(guc);
-   struct i915_vma *vma;
-   /* The ads obj includes the struct itself and buffers passed to GuC */
-   struct {
-   struct guc_ads ads;
-   struct guc_policies policies;
-   struct guc_mmio_reg_state reg_state;
-   struct guc_gt_system_info system_info;
-   struct guc_clients_info clients_info;
-   struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE];
-   u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
-   } __packed *blob;
+   struct __guc_ads_blob *blob;
const u32 skipped_size = LRC_PPHWSP_SZ * PAGE_SIZE + LR_HW_CONTEXT_SIZE;
u32 base;
u8 engine_class;
-   int ret;
-
-   GEM_BUG_ON(guc->ads_vma);
-
-   vma = intel_guc_allocate_vma(guc, PAGE_ALIGN(sizeof(*blob)));
-   if (IS_ERR(vma))
-   return PTR_ERR(vma);
-
-   guc->ads_vma = vma;
  
  	blob = i915_gem_object_pin_map(guc->ads_vma->obj, I915_MAP_WB);

-   if (IS_ERR(blob)) {
-   ret = PTR_ERR(blob);
-   goto err_vma;
-   }
+   if (IS_ERR(blob))
+   return PTR_ERR(blob);
  
  	/* GuC scheduling policies */

guc_policies_init(&blob->policies);
@@ -142,7 +127,7 @@ int intel_guc_ads_create(struct intel_guc *guc)
blob->system_info.vebox_enable_mask = VEBOX_MASK(dev_priv);
blob->system_info.vdbox_sfc_support_mask = 
RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
  
-	base = intel_guc_ggtt_offset(guc, vma);

+   base = intel_guc_ggtt_offset(guc, guc->ads_vma);
  
  	/* Clients info  */

guc_ct_pool_entries_init(blob->ct_pool, ARRAY_SIZE(blob->ct_pool));
@@ -161,6 +146,32 @@ int intel_guc_ads_create(struct intel_guc *guc)
i915_gem_object_unpin_map(guc->ads_vma->obj);
  
  	return 0;

+}
+
+/**
+ * intel_guc_ads_create() - creates GuC ADS
+ * @guc: intel_guc struct
Are we really ok with documentation which just reads names with spaces 
instead of underscores?
I believe the description should go deeper, or at least use different 
words to describe the thing.

ie. here:

intel_guc_ads_create() - allocates and initializes GuC Additional Data Struct

@guc: instance of intel_guc which will own the ADS

+ *
+ */
+int intel_guc_ads_create(struct intel_guc *guc)
+{
+   const u32 size = PAGE_ALIGN(sizeof(struct __guc_ads_blob));
+   struct i915_vma *vma;
+   int ret;
+
+   GEM_BUG_ON(guc->ads_vma);
+
+   vma = intel_guc_allocate_vma(guc, size);
+   if (IS_ERR(vma))
+   return PTR_ERR(vma);
+
+   guc->ads_vma = vma;
+
+   ret = __guc_ads_reinit(guc);
+   if (ret)
+   goto err_vma;
+
+   return 0;
  
  err_vma:


Re: [Intel-gfx] [PATCH v8 7/7] drm/i915/icl: Define MOCS table for Icelake

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

From: Tomasz Lis 

The table has been unified across OSes to minimize virtualization overhead.

The MOCS table is now published as part of bspec, and versioned. Entries
are supposed to never be modified, but new ones can be added. Adding
entries increases table version. The patch includes version 1 entries.

Meaning of each entry is now explained in bspec, and user mode clients
are expected to know what each entry means. The 3 entries used for previous
platforms are still compatible with their legacy definitions, but that is
not guaranteed to be true for future platforms.

v2: Fixed SCC values, improved commit comment (Daniele)
v3: Improved MOCS table comment (Daniele)
v4: Moved new entries below gen9 ones. Put common entries into
 definition to be used in multiple arrays. (Lucas)
v5: Made defines for or-ing flags. Renamed macros from MOCS_TABLE
 to MOCS_ENTRIES. Switched LE_CoS to upper case. (Joonas)
v6: Removed definitions of reserved entries. (Michal)
 Increased limit of entries sent to the hardware on gen11+.
v7: Simplify table as done for previous gens (Lucas)
v8: Rebase on cached number of entries per-platform and use new
 MOCS_ENTRY() macro (Lucas)

BSpec: 34007
BSpec: 560

Signed-off-by: Tomasz Lis 
Reviewed-by: Daniele Ceraolo Spurio 
Reviewed-by: Lucas De Marchi 
Signed-off-by: Lucas De Marchi 

Acked-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/intel_mocs.c | 107 +++---
  1 file changed, 98 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index 716f3f6f2966..3afd8c30cacc 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -46,6 +46,8 @@ struct drm_i915_mocs_table {
  #define LE_SCC(value) ((value) << 8)
  #define LE_PFM(value) ((value) << 11)
  #define LE_SCF(value) ((value) << 14)
+#define LE_COS(value)  ((value) << 15)
+#define LE_SSE(value)  ((value) << 17)
  
  /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */

  #define L3_ESC(value) ((value) << 0)
@@ -54,6 +56,7 @@ struct drm_i915_mocs_table {
  
  /* Helper defines */

  #define GEN9_NUM_MOCS_ENTRIES 62  /* 62 out of 64 - 63 & 64 are reserved. */
+#define GEN11_NUM_MOCS_ENTRIES 64  /* 63-64 are reserved, but configured. */
  
  /* (e)LLC caching options */

  #define LE_0_PAGETABLE_LE_CACHEABILITY(0)
@@ -89,18 +92,22 @@ struct drm_i915_mocs_table {
   * LNCFCMOCS0 - LNCFCMOCS32 registers.
   *
   * These tables are intended to be kept reasonably consistent across
- * platforms. However some of the fields are not applicable to all of
- * them.
+ * HW platforms, and for ICL+, be identical across OSes. To achieve
+ * that, for Icelake and above, list of entries is published as part
+ * of bspec.
   *
   * Entries not part of the following tables are undefined as far as
   * userspace is concerned and shouldn't be relied upon.  For the time
   * being they will be initialized to PTE.
   *
- * NOTE: These tables MUST start with being uncached and the length
- *   MUST be less than 63 as the last two registers are reserved
- *   by the hardware.  These tables are part of the kernel ABI and
- *   may only be updated incrementally by adding entries at the
- *   end.
+ * The last two entries are reserved by the hardware. For ICL+ they
+ * should be initialized according to bspec and never used, for older
+ * platforms they should never be written to.
+ *
+ * NOTE: These tables are part of bspec and defined as part of hardware
+ *   interface for ICL+. For older platforms, they are part of kernel
+ *   ABI. It is expected that existing entries will remain constant
+ *   and the tables will only be updated by adding new entries.
We have no guarantee that the entries will remain constant across gens.. 
so maybe:


+ *   ABI. It is expected that, for specific hardware platform, existing
+ *   entries will remain constant and the table will only be updated
+ *   by adding new entries, filling unused positions.

-Tomasz

   */
  #define GEN9_MOCS_ENTRIES \
MOCS_ENTRY(I915_MOCS_UNCACHED,  LE_1_UC | LE_TC_2_LLC_ELLC, \
@@ -121,6 +128,84 @@ static const struct drm_i915_mocs_entry 
broxton_mocs_table[] = {
L3_3_WB)
  };
  
+#define GEN11_MOCS_ENTRIES \

+   /* Base - Uncached (Deprecated) */ \
+   MOCS_ENTRY(I915_MOCS_UNCACHED,  LE_1_UC | LE_TC_1_LLC, \
+   L3_1_UC), \
+   /* Base - L3 + LeCC:PAT (Deprecated) */ \
+   MOCS_ENTRY(I915_MOCS_PTE,   LE_0_PAGETABLE | LE_TC_1_LLC, \
+   L3_3_WB), \
+   /* Base - L3 + LLC */ \
+   MOCS_ENTRY(2,   LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \
+   L3_3_WB), \
+   /* Base - Uncached */ \
+   MOCS_ENTRY(3,   LE_1_UC | LE_TC_1_LLC, \
+   

Re: [Intel-gfx] [PATCH v8 6/7] drm/i915: cache number of MOCS entries

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

Instead of checking the gen number every time we need to know the max
number of entries, just save it into the table struct so we don't need
extra branches throughout the code. This will be useful for Ice Lake
that has 64 rather than 62 defined entries. Ice Lake changes will be
added in a follow up.

v2: make size and n_entries `unsigned int` and introduce changes as a
 pre-work for the Ice Lake changes (Tvrtko)

Suggested-by: Tvrtko Ursulin 
Signed-off-by: Lucas De Marchi 
I would name it total_entries or n_hw_entries, not n_entries; but that's 
just a name, so:

Reviewed-by: Tomasz Lis 
-Tomasz

---
  drivers/gpu/drm/i915/intel_mocs.c | 27 ++-
  1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index af2ae2f396ae..716f3f6f2966 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -32,7 +32,8 @@ struct drm_i915_mocs_entry {
  };
  
  struct drm_i915_mocs_table {

-   u32 size;
+   unsigned int size;
+   unsigned int n_entries;
const struct drm_i915_mocs_entry *table;
  };
  
@@ -140,10 +141,12 @@ static bool get_mocs_settings(struct drm_i915_private *dev_priv,

if (IS_GEN9_BC(dev_priv) || IS_CANNONLAKE(dev_priv) ||
IS_ICELAKE(dev_priv)) {
table->size  = ARRAY_SIZE(skylake_mocs_table);
+   table->n_entries = GEN9_NUM_MOCS_ENTRIES;
table->table = skylake_mocs_table;
result = true;
} else if (IS_GEN9_LP(dev_priv)) {
table->size  = ARRAY_SIZE(broxton_mocs_table);
+   table->n_entries = GEN9_NUM_MOCS_ENTRIES;
table->table = broxton_mocs_table;
result = true;
} else {
@@ -202,8 +205,6 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine)
	if (!get_mocs_settings(dev_priv, &table))
return;
  
-	GEM_BUG_ON(table.size > GEN9_NUM_MOCS_ENTRIES);

-
/* Set unused values to PTE */
unused_value = table.table[I915_MOCS_PTE].control_value;
  
@@ -215,7 +216,7 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine)

}
  
  	/* All remaining entries are also unused */

-   for (; index < GEN9_NUM_MOCS_ENTRIES; index++)
+   for (; index < table.n_entries; index++)
I915_WRITE(mocs_register(engine->id, index), unused_value);
  }
  
@@ -237,17 +238,17 @@ static int emit_mocs_control_table(struct i915_request *rq,

u32 unused_value;
u32 *cs;
  
-	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))

+   if (GEM_WARN_ON(table->size > table->n_entries))
return -ENODEV;
  
  	/* Set unused values to PTE */

unused_value = table->table[I915_MOCS_PTE].control_value;
  
-	cs = intel_ring_begin(rq, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);

+   cs = intel_ring_begin(rq, 2 + 2 * table->n_entries);
if (IS_ERR(cs))
return PTR_ERR(cs);
  
-	*cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES);

+   *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries);
  
  	for (index = 0; index < table->size; index++) {

u32 value = table->table[index].used ?
@@ -258,7 +259,7 @@ static int emit_mocs_control_table(struct i915_request *rq,
}
  
  	/* All remaining entries are also unused */

-   for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
+   for (; index < table->n_entries; index++) {
*cs++ = i915_mmio_reg_offset(mocs_register(engine, index));
*cs++ = unused_value;
}
@@ -294,17 +295,17 @@ static int emit_mocs_l3cc_table(struct i915_request *rq,
unsigned int i, unused_index;
u32 *cs;
  
-	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))

+   if (GEM_WARN_ON(table->size > table->n_entries))
return -ENODEV;
  
  	/* Set unused values to PTE */

unused_index = I915_MOCS_PTE;
  
-	cs = intel_ring_begin(rq, 2 + GEN9_NUM_MOCS_ENTRIES);

+   cs = intel_ring_begin(rq, 2 + table->n_entries);
if (IS_ERR(cs))
return PTR_ERR(cs);
  
-	*cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2);

+   *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries / 2);
  
  	for (i = 0; i < table->size / 2; i++) {

u16 low = table->table[2 * i].used ?
@@ -327,7 +328,7 @@ static int emit_mocs_l3cc_table(struct i915_request *rq,
}
  
  	/* All remaining entries are also unused */

-   for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) {
+   for (; i < table->n_entries / 2; i++) {
*cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
*cs++ = l3cc_combine(table, unused_index, unused_index);
}
@@ -384,7 +385,7 @@ void intel_mocs_init_l3cc_table(struct drm_i915_private 
*dev_

Re: [Intel-gfx] [PATCH v8 5/7] drm/i915: keep track of used entries in MOCS table

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

Instead of considering we have defined entries for any index in the
table, let's keep track of the ones we explicitly defined. This will
allow Gen 11 to have it's new table defined in which we have holes of
undefined entries.

Repeated comments about the meaning of undefined entries were removed
since they are overly verbose and copy-pasted in several functions: now
the definition is in the top only.

Signed-off-by: Lucas De Marchi 

Reviewed-by: Tomasz Lis 
-Tomasz

---
  drivers/gpu/drm/i915/intel_mocs.c | 88 ---
  1 file changed, 57 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index faae2eefc5cc..af2ae2f396ae 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -28,6 +28,7 @@
  struct drm_i915_mocs_entry {
u32 control_value;
u16 l3cc_value;
+   u16 used;
  };
  
  struct drm_i915_mocs_table {

@@ -75,6 +76,7 @@ struct drm_i915_mocs_table {
[__idx] = { \
.control_value = __control_value, \
.l3cc_value = __l3cc_value, \
+   .used = 1, \
}
  
  /*

@@ -195,24 +197,26 @@ void intel_mocs_init_engine(struct intel_engine_cs 
*engine)
struct drm_i915_private *dev_priv = engine->i915;
struct drm_i915_mocs_table table;
unsigned int index;
+   u32 unused_value;
  
	if (!get_mocs_settings(dev_priv, &table))

return;
  
  	GEM_BUG_ON(table.size > GEN9_NUM_MOCS_ENTRIES);
  
-	for (index = 0; index < table.size; index++)

-   I915_WRITE(mocs_register(engine->id, index),
-  table.table[index].control_value);
+   /* Set unused values to PTE */
+   unused_value = table.table[I915_MOCS_PTE].control_value;
  
-	/*

-* Now set the unused entries to PTE. These entries are officially
-* undefined and no contract for the contents and settings is given
-* for these entries.
-*/
+   for (index = 0; index < table.size; index++) {
+   u32 value = table.table[index].used ?
+   table.table[index].control_value : unused_value;
+
+   I915_WRITE(mocs_register(engine->id, index), value);
+   }
+
+   /* All remaining entries are also unused */
for (; index < GEN9_NUM_MOCS_ENTRIES; index++)
-   I915_WRITE(mocs_register(engine->id, index),
-  table.table[I915_MOCS_PTE].control_value);
+   I915_WRITE(mocs_register(engine->id, index), unused_value);
  }
  
  /**

@@ -230,11 +234,15 @@ static int emit_mocs_control_table(struct i915_request 
*rq,
  {
enum intel_engine_id engine = rq->engine->id;
unsigned int index;
+   u32 unused_value;
u32 *cs;
  
  	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))

return -ENODEV;
  
+	/* Set unused values to PTE */

+   unused_value = table->table[I915_MOCS_PTE].control_value;
+
cs = intel_ring_begin(rq, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -242,18 +250,17 @@ static int emit_mocs_control_table(struct i915_request 
*rq,
*cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES);
  
  	for (index = 0; index < table->size; index++) {

+   u32 value = table->table[index].used ?
+   table->table[index].control_value : unused_value;
+
*cs++ = i915_mmio_reg_offset(mocs_register(engine, index));
-   *cs++ = table->table[index].control_value;
+   *cs++ = value;
}
  
-	/*

-* Now set the unused entries to PTE. These entries are officially
-* undefined and no contract for the contents and settings is given
-* for these entries.
-*/
+   /* All remaining entries are also unused */
for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
*cs++ = i915_mmio_reg_offset(mocs_register(engine, index));
-   *cs++ = table->table[I915_MOCS_PTE].control_value;
+   *cs++ = unused_value;
}
  
  	*cs++ = MI_NOOP;

@@ -284,12 +291,15 @@ static inline u32 l3cc_combine(const struct 
drm_i915_mocs_table *table,
  static int emit_mocs_l3cc_table(struct i915_request *rq,
const struct drm_i915_mocs_table *table)
  {
-   unsigned int i;
+   unsigned int i, unused_index;
u32 *cs;
  
  	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))

return -ENODEV;
  
+	/* Set unused values to PTE */

+   unused_index = I915_MOCS_PTE;
+
cs = intel_ring_begin(rq, 2 + GEN9_NUM_MOCS_ENTRIES);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -297,25 +307,29 @@ static int emit_mocs_l3cc_table(struct i915_request *rq,
*cs++ = MI_LOAD_REGIS

Re: [Intel-gfx] [PATCH v8 4/7] drm/i915: use a macro to define MOCS entries

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

Let's use a macro to make tables smaller and at the same time allow us
to add fields that apply to all entries in future.

For the sake of readability, I'm calling an exception on 80 chars limit.
Lines are aligned for easy comparison of the entry values.

Signed-off-by: Lucas De Marchi 

Reviewed-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/intel_mocs.c | 39 +++
  1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index c7a2a8d81d90..faae2eefc5cc 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -71,6 +71,12 @@ struct drm_i915_mocs_table {
  #define L3_2_RESERVED _L3_CACHEABILITY(2)
  #define L3_3_WB   _L3_CACHEABILITY(3)
  
+#define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \

+   [__idx] = { \
+   .control_value = __control_value, \
+   .l3cc_value = __l3cc_value, \
+   }
+
  /*
   * MOCS tables
   *
@@ -93,40 +99,23 @@ struct drm_i915_mocs_table {
   *   may only be updated incrementally by adding entries at the
   *   end.
   */
-
The idea behind this EOL was that the comment above relates to several 
statements below, not just the first one.
But I'm not really sure what our commenting rules are in this case - a 
comment which bundles several definitions.

-Tomasz

  #define GEN9_MOCS_ENTRIES \
-   [I915_MOCS_UNCACHED] = { \
-   /* 0x0009 */ \
-   .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, \
-   /* 0x0010 */ \
-   .l3cc_value = L3_1_UC, \
-   }, \
-   [I915_MOCS_PTE] = { \
-   /* 0x0038 */ \
-   .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | 
LE_LRUM(3), \
-   /* 0x0030 */ \
-   .l3cc_value = L3_3_WB, \
-   }
+   MOCS_ENTRY(I915_MOCS_UNCACHED,  LE_1_UC | LE_TC_2_LLC_ELLC, \
+   L3_1_UC), \
+   MOCS_ENTRY(I915_MOCS_PTE,   LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | 
LE_LRUM(3), \
+   L3_3_WB) \
  
  static const struct drm_i915_mocs_entry skylake_mocs_table[] = {

GEN9_MOCS_ENTRIES,
-   [I915_MOCS_CACHED] = {
-   /* 0x003b */
-   .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3),
-   /* 0x0030 */
-   .l3cc_value =   L3_3_WB,
-   },
+   MOCS_ENTRY(I915_MOCS_CACHED,LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3),
+   L3_3_WB)
  };
  
  /* NOTE: the LE_TGT_CACHE is not used on Broxton */

  static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
GEN9_MOCS_ENTRIES,
-   [I915_MOCS_CACHED] = {
-   /* 0x0039 */
-   .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3),
-   /* 0x0030 */
-   .l3cc_value = L3_3_WB,
-   },
+   MOCS_ENTRY(I915_MOCS_CACHED,LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3),
+   L3_3_WB)
  };
  
  /**


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v8 3/7] drm/i915/skl: Rework MOCS tables to keep common part in a define

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

From: Tomasz Lis 

The MOCS tables are going to be very similar across platforms.

To reduce the amount of copied code, this patch rips the common part and
puts it into a definition valid for all gen9 platforms.

v2: Made defines for or-ing flags. Renamed macros from MOCS_TABLE
 to MOCS_ENTRIES. (Joonas)
v3 (Lucas):
   - Fix indentation
   - Rebase on rework done by additional patch
   - Remove define for or-ing flags as it made the table more complex by
 requiring zeroed values to be passed
   - Do not embed comma in the macro, so to treat that just as another
 item and please source code formatting tools

Signed-off-by: Tomasz Lis 
Suggested-by: Lucas De Marchi 
Signed-off-by: Lucas De Marchi 

Acked-by: Tomasz Lis 
-Tomasz

---
  drivers/gpu/drm/i915/intel_mocs.c | 57 ++-
  1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index 4ea80bb7dcc8..c7a2a8d81d90 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -93,46 +93,39 @@ struct drm_i915_mocs_table {
   *   may only be updated incrementally by adding entries at the
   *   end.
   */
+
+#define GEN9_MOCS_ENTRIES \
+   [I915_MOCS_UNCACHED] = { \
+   /* 0x0009 */ \
+   .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, \
+   /* 0x0010 */ \
+   .l3cc_value = L3_1_UC, \
+   }, \
+   [I915_MOCS_PTE] = { \
+   /* 0x0038 */ \
+   .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | 
LE_LRUM(3), \
+   /* 0x0030 */ \
+   .l3cc_value = L3_3_WB, \
+   }
+
  static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
-   [I915_MOCS_UNCACHED] = {
- /* 0x0009 */
- .control_value = LE_1_UC | LE_TC_2_LLC_ELLC,
- /* 0x0010 */
- .l3cc_value =L3_1_UC,
-   },
-   [I915_MOCS_PTE] = {
- /* 0x0038 */
- .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3),
- /* 0x0030 */
- .l3cc_value =L3_3_WB,
-   },
+   GEN9_MOCS_ENTRIES,
[I915_MOCS_CACHED] = {
- /* 0x003b */
- .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3),
- /* 0x0030 */
- .l3cc_value =   L3_3_WB,
+   /* 0x003b */
+   .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3),
+   /* 0x0030 */
+   .l3cc_value =   L3_3_WB,
},
  };
  
  /* NOTE: the LE_TGT_CACHE is not used on Broxton */

  static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
-   [I915_MOCS_UNCACHED] = {
- /* 0x0009 */
- .control_value = LE_1_UC | LE_TC_2_LLC_ELLC,
- /* 0x0010 */
- .l3cc_value = L3_1_UC,
-   },
-   [I915_MOCS_PTE] = {
- /* 0x0038 */
- .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3),
- /* 0x0030 */
- .l3cc_value = L3_3_WB,
-   },
+   GEN9_MOCS_ENTRIES,
[I915_MOCS_CACHED] = {
- /* 0x0039 */
- .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3),
- /* 0x0030 */
- .l3cc_value = L3_3_WB,
+   /* 0x0039 */
+   .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3),
+   /* 0x0030 */
+   .l3cc_value = L3_3_WB,
},
  };
  


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v8 1/7] drm/i915: initialize unused MOCS entries to PTE

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

Instead of initializing them to uncached, let's set them to PTE for
kernel tracking. While at it do some minor adjustments to comments and
coding style.

Signed-off-by: Lucas De Marchi 

Reviewed-by: Tomasz Lis 
One comment (with no expectations for change) below.

---
  drivers/gpu/drm/i915/intel_mocs.c | 56 +--
  1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index e976c5ce5479..0d6b94a239d6 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -85,10 +85,7 @@ struct drm_i915_mocs_table {
   *
   * Entries not part of the following tables are undefined as far as
   * userspace is concerned and shouldn't be relied upon.  For the time
- * being they will be implicitly initialized to the strictest caching
- * configuration (uncached) to guarantee forwards compatibility with
- * userspace programs written against more recent kernels providing
- * additional MOCS entries.
+ * being they will be initialized to PTE.
   *
   * NOTE: These tables MUST start with being uncached and the length
   *   MUST be less than 63 as the last two registers are reserved
@@ -249,16 +246,13 @@ void intel_mocs_init_engine(struct intel_engine_cs 
*engine)
   table.table[index].control_value);
  
  	/*

-* Ok, now set the unused entries to uncached. These entries
-* are officially undefined and no contract for the contents
-* and settings is given for these entries.
-*
-* Entry 0 in the table is uncached - so we are just writing
-* that value to all the used entries.
+* Now set the unused entries to PTE. These entries are officially
+* undefined and no contract for the contents and settings is given
+* for these entries.
 */
for (; index < GEN9_NUM_MOCS_ENTRIES; index++)
I915_WRITE(mocs_register(engine->id, index),
-  table.table[0].control_value);
+  table.table[I915_MOCS_PTE].control_value);
  }
  
  /**

@@ -293,16 +287,13 @@ static int emit_mocs_control_table(struct i915_request 
*rq,
}
  
  	/*

-* Ok, now set the unused entries to uncached. These entries
-* are officially undefined and no contract for the contents
-* and settings is given for these entries.
-*
-* Entry 0 in the table is uncached - so we are just writing
-* that value to all the used entries.
+* Now set the unused entries to PTE. These entries are officially
+* undefined and no contract for the contents and settings is given
+* for these entries.
 */
for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
*cs++ = i915_mmio_reg_offset(mocs_register(engine, index));
-   *cs++ = table->table[0].control_value;
+   *cs++ = table->table[I915_MOCS_PTE].control_value;
Entries from enum i915_mocs_table_index are not guaranteed to mean the 
same thing in future gens;
but for the time being, that will work. And later it might still work, we 
don't know.

-Tomasz

}
  
  	*cs++ = MI_NOOP;

@@ -345,7 +336,7 @@ static int emit_mocs_l3cc_table(struct i915_request *rq,
  
  	*cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2);
  
-	for (i = 0; i < table->size/2; i++) {

+   for (i = 0; i < table->size / 2; i++) {
*cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
*cs++ = l3cc_combine(table, 2 * i, 2 * i + 1);
}
@@ -353,18 +344,18 @@ static int emit_mocs_l3cc_table(struct i915_request *rq,
if (table->size & 0x01) {
/* Odd table size - 1 left over */
*cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
-   *cs++ = l3cc_combine(table, 2 * i, 0);
+   *cs++ = l3cc_combine(table, 2 * i, I915_MOCS_PTE);
i++;
}
  
  	/*

-* Now set the rest of the table to uncached - use entry 0 as
-* this will be uncached. Leave the last pair uninitialised as
-* they are reserved by the hardware.
+* Now set the unused entries to PTE. These entries are officially
+* undefined and no contract for the contents and settings is given
+* for these entries.
 */
for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) {
*cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
-   *cs++ = l3cc_combine(table, 0, 0);
+   *cs++ = l3cc_combine(table, I915_MOCS_PTE, I915_MOCS_PTE);
}
  
  	*cs++ = MI_NOOP;

@@ -395,22 +386,21 @@ void intel_mocs_init_l3cc_table(struct drm_i915_private 
*dev_priv)
if (!get_mocs_settings(dev_priv, ))
return;
  
-	for (i = 0; i < table.size/2; i++)

-   I915_WRITE(GEN9_LNCFCMOCS(i), l3cc_combine(, 2*i, 2*i+1));
+   for (i = 

Re: [Intel-gfx] [PATCH v8 2/7] drm/i915: Simplify MOCS table definition

2019-01-23 Thread Lis, Tomasz



On 2019-01-22 06:12, Lucas De Marchi wrote:

Make the defines for LE and L3 caching options to contain the shifts and
remove the zeros from the tables as shifting zeros always result in
zero.

Starting from Ice Lake the MOCS table is defined in the spec and
contains all entries. So to simplify checking the table with the values
set in code, the value is now part of the macro name. This allows to
still give the most used option and sensible name, but also to easily
cross check the table from the spec for gen >= 11.

By removing the zeros we avoid maintaining a huge table since the one
from spec contains many more entries. The new table for Ice Lake will
be added by other patches, this only reformats the table.

While at it also fix the indentation.

Signed-off-by: Lucas De Marchi 

That is much cleaner.
Reviewed-by: Tomasz Lis 
-Tomasz

---
  drivers/gpu/drm/i915/intel_mocs.c | 80 +++
  1 file changed, 29 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c
index 0d6b94a239d6..4ea80bb7dcc8 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -36,8 +36,8 @@ struct drm_i915_mocs_table {
  };
  
  /* Defines for the tables (XXX_MOCS_0 - XXX_MOCS_63) */

-#define LE_CACHEABILITY(value) ((value) << 0)
-#define LE_TGT_CACHE(value)((value) << 2)
+#define _LE_CACHEABILITY(value)((value) << 0)
+#define _LE_TGT_CACHE(value)   ((value) << 2)
  #define LE_LRUM(value)((value) << 4)
  #define LE_AOM(value) ((value) << 6)
  #define LE_RSC(value) ((value) << 7)
@@ -48,28 +48,28 @@ struct drm_i915_mocs_table {
  /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */
  #define L3_ESC(value) ((value) << 0)
  #define L3_SCC(value) ((value) << 1)
-#define L3_CACHEABILITY(value) ((value) << 4)
+#define _L3_CACHEABILITY(value)((value) << 4)
  
  /* Helper defines */

  #define GEN9_NUM_MOCS_ENTRIES 62  /* 62 out of 64 - 63 & 64 are reserved. */
  
  /* (e)LLC caching options */

-#define LE_PAGETABLE   0
-#define LE_UC  1
-#define LE_WT  2
-#define LE_WB  3
-
-/* L3 caching options */
-#define L3_DIRECT  0
-#define L3_UC  1
-#define L3_RESERVED2
-#define L3_WB  3
+#define LE_0_PAGETABLE _LE_CACHEABILITY(0)
+#define LE_1_UC_LE_CACHEABILITY(1)
+#define LE_2_WT_LE_CACHEABILITY(2)
+#define LE_3_WB_LE_CACHEABILITY(3)
  
  /* Target cache */

-#define LE_TC_PAGETABLE0
-#define LE_TC_LLC  1
-#define LE_TC_LLC_ELLC 2
-#define LE_TC_LLC_ELLC_ALT 3
+#define LE_TC_0_PAGETABLE  _LE_TGT_CACHE(0)
+#define LE_TC_1_LLC_LE_TGT_CACHE(1)
+#define LE_TC_2_LLC_ELLC   _LE_TGT_CACHE(2)
+#define LE_TC_3_LLC_ELLC_ALT   _LE_TGT_CACHE(3)
+
+/* L3 caching options */
+#define L3_0_DIRECT_L3_CACHEABILITY(0)
+#define L3_1_UC_L3_CACHEABILITY(1)
+#define L3_2_RESERVED  _L3_CACHEABILITY(2)
+#define L3_3_WB_L3_CACHEABILITY(3)
  
  /*

   * MOCS tables
@@ -96,31 +96,21 @@ struct drm_i915_mocs_table {
  static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
[I915_MOCS_UNCACHED] = {
  /* 0x0009 */
- .control_value = LE_CACHEABILITY(LE_UC) |
-  LE_TGT_CACHE(LE_TC_LLC_ELLC) |
-  LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
-  LE_PFM(0) | LE_SCF(0),
-
+ .control_value = LE_1_UC | LE_TC_2_LLC_ELLC,
  /* 0x0010 */
- .l3cc_value =L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
+ .l3cc_value =L3_1_UC,
},
[I915_MOCS_PTE] = {
  /* 0x0038 */
- .control_value = LE_CACHEABILITY(LE_PAGETABLE) |
-  LE_TGT_CACHE(LE_TC_LLC_ELLC) |
-  LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
-  LE_PFM(0) | LE_SCF(0),
+ .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3),
  /* 0x0030 */
- .l3cc_value =L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+ .l3cc_value =L3_3_WB,
},
[I915_MOCS_CACHED] = {
  /* 0x003b */
- .control_value = LE_CACHEABILITY(LE_WB) |
-  LE_TGT_CACHE(LE_TC_LLC_ELLC) |
-  LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
-  LE_PFM(0) | LE_SCF(0),
+ .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3),
  /* 0x0030 */
- .l3cc_value =   L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+ .l3cc_value =   L3_3_WB,
   

Re: [Intel-gfx] [PATCH v7 3/4] drm/i915/icl: Define MOCS table for Icelake

2018-12-21 Thread Lis, Tomasz



On 2018-12-21 13:29, Tvrtko Ursulin wrote:


On 14/12/2018 18:20, Lucas De Marchi wrote:

From: Tomasz Lis 

The table has been unified across OSes to minimize virtualization 
overhead.


The MOCS table is now published as part of bspec, and versioned. Entries
are supposed to never be modified, but new ones can be added. Adding
entries increases table version. The patch includes version 1 entries.

Meaning of each entry is now explained in bspec, and user mode clients
are expected to know what each entry means. The 3 entries used for 
previous
platforms are still compatible with their legacy definitions, but 
that is

not guaranteed to be true for future platforms.

v2: Fixed SCC values, improved commit comment (Daniele)
v3: Improved MOCS table comment (Daniele)
v4: Moved new entries below gen9 ones. Put common entries into
 definition to be used in multiple arrays. (Lucas)
v5: Made defines for or-ing flags. Renamed macros from MOCS_TABLE
 to MOCS_ENTRIES. Switched LE_CoS to upper case. (Joonas)
v6: Removed definitions of reserved entries. (Michal)
 Increased limit of entries sent to the hardware on gen11+.
v7: Simplify table as done for previous gens (Lucas)

BSpec: 34007
BSpec: 560

Signed-off-by: Tomasz Lis 
Reviewed-by: Daniele Ceraolo Spurio 
Reviewed-by: Lucas De Marchi 
Signed-off-by: Lucas De Marchi 
---
  drivers/gpu/drm/i915/intel_mocs.c | 187 ++
  1 file changed, 162 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c

index 577633cefb8a..dfc4edea020f 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -44,6 +44,8 @@ struct drm_i915_mocs_table {
  #define LE_SCC(value)    ((value) << 8)
  #define LE_PFM(value)    ((value) << 11)
  #define LE_SCF(value)    ((value) << 14)
+#define LE_COS(value)    ((value) << 15)
+#define LE_SSE(value)    ((value) << 17)
    /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries 
per word */

  #define L3_ESC(value)    ((value) << 0)
@@ -52,6 +54,10 @@ struct drm_i915_mocs_table {
    /* Helper defines */
  #define GEN9_NUM_MOCS_ENTRIES    62  /* 62 out of 64 - 63 & 64 are 
reserved. */
+#define GEN11_NUM_MOCS_ENTRIES    64  /* 63-64 are reserved, but 
configured. */

+
+#define NUM_MOCS_ENTRIES(i915) \
+    (INTEL_GEN(i915) < 11 ? GEN9_NUM_MOCS_ENTRIES : 
GEN11_NUM_MOCS_ENTRIES)

    /* (e)LLC caching options */
  #define LE_0_PAGETABLE    _LE_CACHEABILITY(0)
@@ -80,21 +86,21 @@ struct drm_i915_mocs_table {
   * LNCFCMOCS0 - LNCFCMOCS32 registers.
   *
   * These tables are intended to be kept reasonably consistent across
- * platforms. However some of the fields are not applicable to all of
- * them.
+ * HW platforms, and for ICL+, be identical across OSes. To achieve
+ * that, for Icelake and above, list of entries is published as part
+ * of bspec.
   *
   * Entries not part of the following tables are undefined as far as
- * userspace is concerned and shouldn't be relied upon.  For the time
- * being they will be implicitly initialized to the strictest caching
- * configuration (uncached) to guarantee forwards compatibility with
- * userspace programs written against more recent kernels providing
- * additional MOCS entries.
+ * userspace is concerned and shouldn't be relied upon.
+ *
+ * The last two entries are reserved by the hardware. For ICL+ they
+ * should be initialized according to bspec and never used, for older
+ * platforms they should never be written to.
   *
- * NOTE: These tables MUST start with being uncached and the length
- *   MUST be less than 63 as the last two registers are reserved
- *   by the hardware.  These tables are part of the kernel ABI and
- *   may only be updated incrementally by adding entries at the
- *   end.
+ * NOTE: These tables are part of bspec and defined as part of hardware
+ *   interface for ICL+. For older platforms, they are part of 
kernel

+ *   ABI. It is expected that existing entries will remain constant
+ *   and the tables will only be updated by adding new entries.
   */
    #define GEN9_MOCS_ENTRIES \
@@ -132,6 +138,132 @@ static const struct drm_i915_mocs_entry 
broxton_mocs_table[] = {

  },
  };
  +#define GEN11_MOCS_ENTRIES \
+    [0] = { \
+    /* Base - Uncached (Deprecated) */ \
+    .control_value = LE_1_UC | LE_TC_1_LLC, \
+    .l3cc_value = L3_1_UC \
+    }, \
+    [1] = { \
+    /* Base - L3 + LeCC:PAT (Deprecated) */ \
+    .control_value = LE_0_PAGETABLE | LE_TC_1_LLC, \
+    .l3cc_value = L3_3_WB \
+    }, \
+    [2] = { \
+    /* Base - L3 + LLC */ \
+    .control_value = LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \
+    .l3cc_value = L3_3_WB \
+    }, \
+    [3] = { \
+    /* Base - Uncached */ \
+    .control_value = LE_1_UC | LE_TC_1_LLC, \
+    .l3cc_value = L3_1_UC \
+    }, \
+    [4] = { \
+    /* Base - L3 */ \
+    

Re: [Intel-gfx] [PATCH v6] drm/i915/icl: Preempt-to-idle support in execlists.

2018-12-17 Thread Lis, Tomasz



On 2018-12-14 12:10, Joonas Lahtinen wrote:

Quoting Tvrtko Ursulin (2018-12-10 17:40:34)

On 09/11/2018 17:18, Tomasz Lis wrote:

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result from
Context Status Buffer.

Preemption in previous gens required a special batch buffer to be executed,
so the Command Streamer never preempted to idle directly. In Icelake it is
possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

The advantage of this new preemption path is that one less context switch is
needed, and returning information about preemption being complete is received
earlier. This leads to significant improvement in our IGT latency test.

Test performed: `gem_exec_latency --run-subtest render-preemption`, executed
100 times, on the same platform, same kernel, without and with this patch.
Then taken average of the execution latency times:

subcase   old preempt.icl preempt.
render-render 853.2036840.1176
render-bsd2328.8708   2083.2576
render-blt2080.1501   1852.0792
render-vebox  1553.5134   1428.762

Improvement observed:

subcase   improvement
render-render  1.53%
render-bsd10.55%
render-blt10.96%
render-vebox   8.03%

Who can explain what do the parts other than render-render mean? At
least I can make sense of render-render - measure how long it takes for
one context to preempt another, but render-$other draws a blank for me.
How are engines pre-empting one another?
These cases submit low priority spin buffer to 'render' ring, and then 
on high priority context they submit two buffers which only write 
current timestamp to a known location: first one to 'render', and second 
to the other engine.
Submission to the other engine with the same context makes sure that 
'render' gets back to spin buffer after each iteration. The two high 
priority tasks are not parallelized, because they announce they write to 
the same object.


The time taken to do all operations in one iteration is then displayed. 
So the time includes:

- preempting spin buffer
- executing timestamp write from within 'render'
- executing timestamp write from within other ring (while at the same 
time, spin buffer gets back to execution on render)


If I'm not mistaken with the above, it looks like the 'render-render' 
case is the worst one to measure performance increase - if both high 
priority tasks are executed on the same engine, there's a considerable 
chance that the hardware will get straight to another pair of them, 
without going though spin buffer and preempting it.


-Tomasz



But anyway, even if only the 1.53% improvement is the real one, FWIW
that's I think good enough to justify the patch. It is sufficiently
small and contained that I don't see a problem. So:

Acked-by: Tvrtko Ursulin 

According to Chris, the baseline measurements are off by a decade or so
compared to where they should be. This might be attributed to execution
on frequency locked parts?

Would it be worthy to repeat the numbers with some unlocked parts?

Regards, Joonas


Regards,

Tvrtko


v2: Added needs_preempt_context() change so that it is not created when
  preempt-to-idle is supported. (Chris)
  Updated setting HWACK flag so that it is cleared after
  preempt-to-idle. (Chris, Daniele)
  Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
  Merged preemption trigger functions to one. (Chris)
  Fixed context state to not assume COMPLETED_MASK after preemption,
  since idle-to-idle case will not have it set.

v4: Simplified needs_preempt_context() change. (Daniele)
  Removed clearing HWACK flag in idle-to-idle preempt. (Daniele)

v5: Renamed inject_preempt_context(). (Daniele)
  Removed duplicated GEM_BUG_ON() on HWACK (Daniele)

v6: Added performance test results.

Bspec: 18922
Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Cc: Michal Winiarski 
Cc: Mika Kuoppala 
Reviewed-by: Daniele Ceraolo Spurio 
Signed-off-by: Tomasz Lis 
---
   drivers/gpu/drm/i915/i915_drv.h  |   2 +
   drivers/gpu/drm/i915/i915_gem_context.c  |   3 +-
   drivers/gpu/drm/i915/i915_pci.c  |   3 +-
   drivers/gpu/drm/i915/intel_device_info.h |   1 +
   drivers/gpu/drm/i915/intel_lrc.c | 109 
+--
   drivers/gpu/drm/i915/intel_lrc.h |   1 +
   6 files changed, 84 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 08d25aa..d2cc9f1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2579,6 +2579,8 @@ intel_info(const struct drm_i915_private *dev_priv)
   ((dev_priv)->info.has_logical_ring_elsq)
   #define 

Re: [Intel-gfx] [PATCH v2 1/2] drm/i915/icl: Define MOCS table for Icelake

2018-10-23 Thread Lis, Tomasz



On 2018-10-22 19:40, Daniele Ceraolo Spurio wrote:



On 22/10/18 10:13, Tomasz Lis wrote:
The table has been unified across OSes to minimize virtualization 
overhead.


The MOCS table is now published as part of bspec, and versioned. Entries
are supposed to never be modified, but new ones can be added. Adding
entries increases table version. The patch includes version 1 entries.

Meaning of each entry is now explained in bspec, and user mode clients
are expected to know what each entry means. The 3 entries used for 
previous
platforms are still compatible with their legacy definitions, but 
that is

not guaranteed to be true for future platforms.

BSpec: 34007
BSpec: 560
Signed-off-by: Tomasz Lis 
Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 
Cc: Zhenyu Wang 
Cc: Zhi A Wang 
Cc: Anuj Phogat 
---
  drivers/gpu/drm/i915/intel_mocs.c | 246 
+-

  1 file changed, 244 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c

index 77e9871..dc34e83 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -44,6 +44,8 @@ struct drm_i915_mocs_table {
  #define LE_SCC(value)    ((value) << 8)
  #define LE_PFM(value)    ((value) << 11)
  #define LE_SCF(value)    ((value) << 14)
+#define LE_CoS(value)    ((value) << 15)
+#define LE_SSE(value)    ((value) << 17)
    /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries 
per word */

  #define L3_ESC(value)    ((value) << 0)
@@ -96,6 +98,243 @@ struct drm_i915_mocs_table {
   *   may only be updated incrementally by adding entries at the
   *   end.
   */


This comment needs to be updated as some of the expectations are not 
true anymore, e.g. we're now defining entries 62 and 63 in SW, usage 
of mocs 0 is changing. Also could be useful to add some of the info 
you have in the commit message here, like the fact that the table is 
versioned in the specs for gen11+, so it is close to the table.


From a POV of following the specs, with the updated comments this is:

Reviewed-by: Daniele Ceraolo Spurio 

But please get acks from the relevant interested parties to make sure 
there are no concerns with the new approach.

I believe all parties are informed; will add some more cc's to v3.
Also having someone else double-check the table as well would be nice 
since I might have missed something and it's going to be hard to catch 
issues in testing if the only impact is a very small performance delta.


Thanks,
Daniele
The IGT test for MOCS settings explicitly compares the content of the 
registers with expected values hard-coded into that test; since someone 
other than me will be updating that test for correct Icelake values, we 
can count that as a second verification.

-Tomasz



+static const struct drm_i915_mocs_entry icelake_mocs_table[] = {
+    [0] = {
+  /* Base - Uncached (Deprecated) */
+  .control_value = LE_CACHEABILITY(LE_UC) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
+    },
+    [1] = {
+  /* Base - L3 + LeCC:PAT (Deprecated) */
+  .control_value = LE_CACHEABILITY(LE_PAGETABLE) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+    },
+    [2] = {
+  /* Base - L3 + LLC */
+  .control_value = LE_CACHEABILITY(LE_WB) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+    },
+    [3] = {
+  /* Base - Uncached */
+  .control_value = LE_CACHEABILITY(LE_UC) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
+    },
+    [4] = {
+  /* Base - L3 */
+  .control_value = LE_CACHEABILITY(LE_UC) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+    },
+    [5] = {
+  /* Base - LLC */
+  .control_value = LE_CACHEABILITY(LE_WB) |
+   LE_TGT_CACHE(LE_TC_LLC) |
+   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+   LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0),
+
+  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | 

Re: [Intel-gfx] [PATCH v2 2/2] drm/i915/icl: Add IOCTL for getting MOCS table version

2018-10-23 Thread Lis, Tomasz



On 2018-10-22 20:18, Daniele Ceraolo Spurio wrote:



On 22/10/18 10:13, Tomasz Lis wrote:

For Icelake and above, MOCS table for each platform is published
within bspec. The table is versioned, and new entries are assigned
a version number. Existing entries do not change and their version
is constant.

This introduces a parameter which allows getting max version number
of the MOCS entries currently supported, ie. value of 2 would mean
only version 1 and version 2 entries are initialized and can be used
by the user mode clients.

BSpec: 34007
Signed-off-by: Tomasz Lis 
Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 
Cc: Zhenyu Wang 
Cc: Zhi A Wang 
Cc: Anuj Phogat 
---
  drivers/gpu/drm/i915/i915_drv.c   |  6 ++
  drivers/gpu/drm/i915/intel_mocs.c | 12 
  drivers/gpu/drm/i915/intel_mocs.h |  1 +
  include/uapi/drm/i915_drm.h   | 11 +++
  4 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c 
b/drivers/gpu/drm/i915/i915_drv.c

index baac35f..92fa8fd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -53,6 +53,7 @@
  #include "i915_vgpu.h"
  #include "intel_drv.h"
  #include "intel_uc.h"
+#include "intel_mocs.h"
    static struct drm_driver driver;
  @@ -444,6 +445,11 @@ static int i915_getparam_ioctl(struct 
drm_device *dev, void *data,

  case I915_PARAM_MMAP_GTT_COHERENT:
  value = INTEL_INFO(dev_priv)->has_coherent_ggtt;
  break;
+    case I915_PARAM_MOCS_TABLE_VERSION:
+    value = intel_mocs_table_version(dev_priv);
+    if (!value)
+    return -ENODEV;


Do we really want to return -ENODEV for platforms that do have a MOCS 
table programmed, but the table is not one versioned in the specs 
(i.e. Gen9-10)? I think returning "0" for those to indicate 
"kernel-defined table" would be ok and we could limit -ENODEV for 
platforms that don't have a table at all. But what wins is what the 
callers of the ioctl would like to get from the kernel ;)



+    break;
  default:
  DRM_DEBUG("Unknown parameter %d\n", param->param);
  return -EINVAL;
diff --git a/drivers/gpu/drm/i915/intel_mocs.c 
b/drivers/gpu/drm/i915/intel_mocs.c

index dc34e83..fc1e98b 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -469,6 +469,18 @@ static i915_reg_t mocs_register(enum 
intel_engine_id engine_id, int index)

  }
    /**
+ * intel_mocs_table_version() - get version of mocs table 
implementation

+ * @i915: i915 device struct.
+ */
+int intel_mocs_table_version(struct drm_i915_private *i915)
+{
+    if (IS_ICELAKE(i915))
+    return 1;


Can we add this version value as a define above the table, to keep 
them close to each other?


If we agree on my suggestion above to differentiate between no table 
at all (< 0), kernel-defined table (= 0) and spec-defined versioned 
table (> 0), it might also be useful to store the version with the 
table info in drm_i915_mocs_table and then have 
intel_mocs_table_version call get_mocs_settings(), e.g:


int intel_mocs_table_version(struct drm_i915_private *i915)
{
struct drm_i915_mocs_table table;

if (!get_mocs_settings(i915, ))
    return -ENODEV;

return table->version;
}

Thanks,
Daniele
That does sound reasonable. And I agree we should really ask userland 
what exactly they want.
Right now there is no request from any UMD for such interface; we will 
get back to this patch when it is requested.
Joonas also mentioned there is no need for an interface which always 
returns "1", and is expected to return only that value for future 
platforms as well. That's a good argument.


I will remove this patch from the current series.

-Tomasz



+    else
+    return 0;
+}
+
+/**
   * intel_mocs_init_engine() - emit the mocs control table
   * @engine:    The engine for whom to emit the registers.
   *
diff --git a/drivers/gpu/drm/i915/intel_mocs.h 
b/drivers/gpu/drm/i915/intel_mocs.h

index d89080d..dc1d64a 100644
--- a/drivers/gpu/drm/i915/intel_mocs.h
+++ b/drivers/gpu/drm/i915/intel_mocs.h
@@ -55,5 +55,6 @@
  int intel_rcs_context_init_mocs(struct i915_request *rq);
  void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_priv);
  void intel_mocs_init_engine(struct intel_engine_cs *engine);
+int intel_mocs_table_version(struct drm_i915_private *i915);
    #endif
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e1..16aafc4 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -559,6 +559,17 @@ typedef struct drm_i915_irq_wait {
   */
  #define I915_PARAM_MMAP_GTT_COHERENT    52
  +/*
+ * Query MOCS table version used during hardware initialization.
+ *
+ * The MOCS table for each platform is published as part of bspec. 
Entries in
+ * the table are supposed to never be modified, but new entries are 
added, making
+ * more indexes in the table valid. This parameter informs which 

Re: [Intel-gfx] [PATCH v5] drm/i915/icl: Preempt-to-idle support in execlists.

2018-10-23 Thread Lis, Tomasz



On 2018-10-23 11:13, Joonas Lahtinen wrote:

Quoting Lis, Tomasz (2018-10-19 19:00:15)


On 2018-10-16 12:53, Joonas Lahtinen wrote:

Quoting Tomasz Lis (2018-10-15 20:29:18)

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result from
Context Status Buffer.

Preemption in previous gens required a special batch buffer to be executed,
so the Command Streamer never preempted to idle directly. In Icelake it is
possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

v2: Added needs_preempt_context() change so that it is not created when
  preempt-to-idle is supported. (Chris)
  Updated setting HWACK flag so that it is cleared after
  preempt-to-dle. (Chris, Daniele)
  Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
  Merged preemption trigger functions to one. (Chris)
  Fixed context state to not assume COMPLETED_MASK after preemption,
  since idle-to-idle case will not have it set.

v4: Simplified needs_preempt_context() change. (Daniele)
  Removed clearing HWACK flag in idle-to-idle preempt. (Daniele)

v5: Renamed inject_preempt_context(). (Daniele)
  Removed duplicated GEM_BUG_ON() on HWACK (Daniele)

Bspec: 18922
Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Cc: Michal Winiarski 
Cc: Mika Kuoppala 
Reviewed-by: Daniele Ceraolo Spurio 

This R-b was on v4, and should be indicated with # v4 comment.

The commit message doesn't say much about why preempting to idle is
beneficial? The pre-Gen11 codepath needs to be maintained anyway.

Regards, Joonas

The benefit is one less context switch - there is no "preempt context".

Yes.

But that still doesn't quite explain what material benefits there are? :)

Is there some actual workloads/microbenchmarks that get an improvement?

This alters the behavior between different platforms for a very delicate
feature, probably resulting in slightly different bugs. So there should
be some more reasoning than just because we can.

Regards, Joonas
Less context switching does imply perf improvement, though it would 
require measurement - it might be hardly detectable. We may even lose 
performance - without measurements, we don't know. So not a strong argument.


One more benefit I could think of is - GuC path will use 
preempt-to-idle, so this would make execlists use the same path as GuC. 
But that's not a strong argument as well.


I must agree - there doesn't seem to be any strong enough reason to go 
with this change.

We might consider it after we have performance data.

-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5] drm/i915/icl: Preempt-to-idle support in execlists.

2018-10-19 Thread Lis, Tomasz



On 2018-10-16 12:53, Joonas Lahtinen wrote:

Quoting Tomasz Lis (2018-10-15 20:29:18)

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result from
Context Status Buffer.

Preemption in previous gens required a special batch buffer to be executed,
so the Command Streamer never preempted to idle directly. In Icelake it is
possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

v2: Added needs_preempt_context() change so that it is not created when
 preempt-to-idle is supported. (Chris)
 Updated setting HWACK flag so that it is cleared after
 preempt-to-idle. (Chris, Daniele)
 Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
 Merged preemption trigger functions to one. (Chris)
 Fixed context state to not assume COMPLETED_MASK after preemption,
 since idle-to-idle case will not have it set.

v4: Simplified needs_preempt_context() change. (Daniele)
 Removed clearing HWACK flag in idle-to-idle preempt. (Daniele)

v5: Renamed inject_preempt_context(). (Daniele)
 Removed duplicated GEM_BUG_ON() on HWACK (Daniele)

Bspec: 18922
Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Cc: Michal Winiarski 
Cc: Mika Kuoppala 
Reviewed-by: Daniele Ceraolo Spurio 

This R-b was on v4, and should be indicated with # v4 comment.

The commit message doesn't say much about why preempting to idle is
beneficial? The pre-Gen11 codepath needs to be maintained anyway.

Regards, Joonas

The benefit is one less context switch - there is no "preempt context".
-Tomasz


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [RFC] drm/i915/guc: New GuC stage descriptors

2018-10-17 Thread Lis, Tomasz



On 2018-10-12 20:25, Daniele Ceraolo Spurio wrote:

With the new interface, GuC now requires every lrc to be registered in
one of the stage descriptors, which have been re-designed so that each
descriptor can store up to 64 lrc per class (i.e. equal to the possible
SW counter values).
Similarly to what happened with the previous legacy design, it is possible
to have a single "proxy" descriptor that owns the workqueue and the
doorbell and use it for all submission. To distinguish the proxy
descriptors from the one used for lrc storage, the latter have been
called "principal". A descriptor can't be both a proxy and a principal
at the same time; to enforce this, since we only use 1 proxy descriptor
per client, we reserve enough descriptor from the bottom of the pool to
be used as proxy and leave the others as principals. For simplicity, we
currently map context IDs 1:1 to principal descriptors, but we could
have more contexts in flight if needed by using the SW counter.
Does this have any influence on the concept of "default context" used by 
UMDs?

Or is this completely separate?

Note that the lrcs need to be mapped in the principal descriptor until
guc is done with them. This means that we can't release the HW id when
the user app closes the ctx because it might still be in flight with GuC
and that we need to be careful when unpinning because the fact that the

s/the//

a request on the next context has completed doesn't mean that GuC is
done processing the first one. See in-code comments for details.

NOTE: GuC is not going to look at lrcs that are not in flight, so we
could potentially skip the unpinning steps. However, the unpinning steps
perform extra correctness check so better keep them until we're sure
that the flow is solid.

Based on an initial patch by Oscar Mateo.

RFC: this will be sent as part of the updated series once we have
the gen9 FW with the new interface, but since the flow is
significantly different compared to the previous version I'd like
to gather some early feedback to make sure there aren't any major
issues.

Signed-off-by: Daniele Ceraolo Spurio 
Signed-off-by: Michal Wajdeczko 
Cc: Chris Wilson 
Cc: Michel Thierry 
Cc: Michal Winiarski 
Cc: Tomasz Lis 
---
  drivers/gpu/drm/i915/i915_debugfs.c |  30 +-
  drivers/gpu/drm/i915/i915_drv.h |   5 +-
  drivers/gpu/drm/i915/i915_gem_context.c |   9 +-
  drivers/gpu/drm/i915/intel_guc.h|  14 +-
  drivers/gpu/drm/i915/intel_guc_fwif.h   |  73 +++--
  drivers/gpu/drm/i915/intel_guc_submission.c | 346 +++-
  drivers/gpu/drm/i915/intel_lrc.c|  18 +-
  drivers/gpu/drm/i915/intel_lrc.h|   7 +
  drivers/gpu/drm/i915/selftests/intel_guc.c  |   4 +-
  9 files changed, 360 insertions(+), 146 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 00c551d3e409..04bbde4a38a6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2474,11 +2474,10 @@ static int i915_guc_stage_pool(struct seq_file *m, void 
*data)
  
  		seq_printf(m, "GuC stage descriptor %u:\n", index);

seq_printf(m, "\tIndex: %u\n", desc->stage_id);
+   seq_printf(m, "\tProxy Index: %u\n", desc->proxy_id);
seq_printf(m, "\tAttribute: 0x%x\n", desc->attribute);
seq_printf(m, "\tPriority: %d\n", desc->priority);
seq_printf(m, "\tDoorbell id: %d\n", desc->db_id);
-   seq_printf(m, "\tEngines used: 0x%x\n",
-  desc->engines_used);
seq_printf(m, "\tDoorbell trigger phy: 0x%llx, cpu: 0x%llx, uK: 
0x%x\n",
   desc->db_trigger_phy,
   desc->db_trigger_cpu,
@@ -2490,18 +2489,21 @@ static int i915_guc_stage_pool(struct seq_file *m, void 
*data)
seq_putc(m, '\n');
  
  		for_each_engine_masked(engine, dev_priv, client->engines, tmp) {

-   u32 guc_engine_id = engine->guc_id;
-   struct guc_execlist_context *lrc =
-   >lrc[guc_engine_id];
-
-   seq_printf(m, "\t%s LRC:\n", engine->name);
-   seq_printf(m, "\t\tContext desc: 0x%x\n",
-  lrc->context_desc);
-   seq_printf(m, "\t\tContext id: 0x%x\n", 
lrc->context_id);
-   seq_printf(m, "\t\tLRCA: 0x%x\n", lrc->ring_lrca);
-   seq_printf(m, "\t\tRing begin: 0x%x\n", 
lrc->ring_begin);
-   seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end);
-   seq_putc(m, '\n');
+   u8 class = engine->class;
+   u8 inst = engine->instance;
+
+   if (desc->lrc_alloc_map[class].bitmap & BIT(inst)) {
+   struct guc_execlist_context *lrc =
+ 

Re: [Intel-gfx] [PATCH v2] drm/i915: GEM_WARN_ON considered harmful

2018-10-17 Thread Lis, Tomasz



On 2018-10-12 08:31, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

GEM_WARN_ON currently has dangerous semantics where it is completely
compiled out on !GEM_DEBUG builds. This can leave users who expect it to
be more like a WARN_ON, just without a warning in non-debug builds, in
complete ignorance.

Another gotcha with it is that it cannot be used as a statement. Which is
again different from a standard kernel WARN_ON.

This patch fixes both problems by making it behave as one would expect.

It can now be used both as an expression and as statement, and also the
condition evaluates properly in all builds - code under the conditional
will therefore not unexpectedly disappear.

To satisfy call sites which really want the code under the conditional to
completely disappear, we add GEM_DEBUG_WARN_ON and convert some of the
callers to it. This one can also be used as both expression and statement.

 From the above it follows GEM_DEBUG_WARN_ON should be used in situations
where we are certain the condition will be hit during development, but at
a place in code where error can be handled to the benefit of not crashing
the machine.

GEM_WARN_ON on the other hand should be used where condition may happen in
production and we just want to distinguish the level of debugging output
emitted between the production and debug build.

v2:
  * Dropped BUG_ON hunk.
I have to agree with the need to "fix" GEM_WARN_ON() - ran into not 
realizing the difference to WARN_ON() myself. And while nobody likes 
multiplying variants of macros, I can't disagree with the need of having 
DEBUG-only version as well.


Reviewed-by: Tomasz Lis 


Signed-off-by: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Matthew Auld 
Cc: Mika Kuoppala 
Cc: Tomasz Lis 
---
  drivers/gpu/drm/i915/i915_gem.h  | 4 +++-
  drivers/gpu/drm/i915/i915_vma.c  | 8 
  drivers/gpu/drm/i915/intel_engine_cs.c   | 8 
  drivers/gpu/drm/i915/intel_lrc.c | 6 +++---
  drivers/gpu/drm/i915/intel_workarounds.c | 2 +-
  5 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 599c4f6eb1ea..b0e4b976880c 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -47,17 +47,19 @@ struct drm_i915_private;
  #define GEM_DEBUG_DECL(var) var
  #define GEM_DEBUG_EXEC(expr) expr
  #define GEM_DEBUG_BUG_ON(expr) GEM_BUG_ON(expr)
+#define GEM_DEBUG_WARN_ON(expr) GEM_WARN_ON(expr)
  
  #else
  
  #define GEM_SHOW_DEBUG() (0)
  
  #define GEM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)

-#define GEM_WARN_ON(expr) (BUILD_BUG_ON_INVALID(expr), 0)
+#define GEM_WARN_ON(expr) ({ unlikely(!!(expr)); })
  
  #define GEM_DEBUG_DECL(var)

  #define GEM_DEBUG_EXEC(expr) do { } while (0)
  #define GEM_DEBUG_BUG_ON(expr)
+#define GEM_DEBUG_WARN_ON(expr) ({ BUILD_BUG_ON_INVALID(expr); 0; })
  #endif
  
  #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 31efc971a3a8..82652c3d1bed 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -305,12 +305,12 @@ int i915_vma_bind(struct i915_vma *vma, enum 
i915_cache_level cache_level,
GEM_BUG_ON(!drm_mm_node_allocated(>node));
GEM_BUG_ON(vma->size > vma->node.size);
  
-	if (GEM_WARN_ON(range_overflows(vma->node.start,

-   vma->node.size,
-   vma->vm->total)))
+   if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start,
+ vma->node.size,
+ vma->vm->total)))
return -ENODEV;
  
-	if (GEM_WARN_ON(!flags))

+   if (GEM_DEBUG_WARN_ON(!flags))
return -EINVAL;
  
  	bind_flags = 0;

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index f27dbe26bcc1..78e42c0825d2 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -273,13 +273,13 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH));
BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH));
  
-	if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS))

+   if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS))
return -EINVAL;
  
-	if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))

+   if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
return -EINVAL;
  
-	if (GEM_WARN_ON(dev_priv->engine_class[info->class][info->instance]))

+   if 
(GEM_DEBUG_WARN_ON(dev_priv->engine_class[info->class][info->instance]))
return -EINVAL;
  
  	GEM_BUG_ON(dev_priv->engine[id]);

@@ -402,7 +402,7 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
err = -EINVAL;
err_id = 

Re: [Intel-gfx] [RFC] drm/i915: GEM_WARN_ON considered harmful

2018-10-11 Thread Lis, Tomasz

So I understand we agree on the change, just waiting for non-RFC version?

-Tomasz

On 2018-09-24 11:34, Jani Nikula wrote:

On Thu, 20 Sep 2018, Tvrtko Ursulin  wrote:

Ping!

Any comments here?

Main goal was to allow GEM_WARN_ON as a statement, plus also protect
uses in if statements, which there are some who I think don't expect the
branch to completely disappear.

I've said before I don't like the conditional early returns vanishing
depending on config options, but I've been shot down. I think this patch
is an improvement.


BR,
Jani.



Regards,

Tvrtko

On 07/09/2018 12:53, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

GEM_WARN_ON currently has dangerous semantics where it is completely
compiled out on !GEM_DEBUG builds. This can leave users who expect it to
be more like a WARN_ON, just without a warning in non-debug builds, in
complete ignorance.

Another gotcha with it is that it cannot be used as a statement. Which is
again different from a standard kernel WARN_ON.

This patch fixes both problems by making it behave as one would expect.

It can now be used both as an expression and as statement, and also the
condition evaluates properly in all builds - code under the conditional
will therefore not unexpectedly disappear.

To satisfy call sites which really want the code under the conditional to
completely disappear, we add GEM_DEBUG_WARN_ON and convert some of the
callers to it. This one can also be used as both expression and statement.

  From the above it follows GEM_DEBUG_WARN_ON should be used in situations
where we are certain the condition will be hit during development, but at
a place in code where error can be handled to the benefit of not crashing
the machine.

GEM_WARN_ON on the other hand should be used where condition may happen in
production and we just want to distinguish the level of debugging output
emitted between the production and debug build.

Signed-off-by: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Matthew Auld 
Cc: Mika Kuoppala 
---
Quickly put together and compile tested only!
---
   drivers/gpu/drm/i915/i915_gem.h  | 4 +++-
   drivers/gpu/drm/i915/i915_vma.c  | 8 
   drivers/gpu/drm/i915/intel_engine_cs.c   | 8 
   drivers/gpu/drm/i915/intel_lrc.c | 8 
   drivers/gpu/drm/i915/intel_workarounds.c | 2 +-
   5 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 599c4f6eb1ea..b0e4b976880c 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -47,17 +47,19 @@ struct drm_i915_private;
   #define GEM_DEBUG_DECL(var) var
   #define GEM_DEBUG_EXEC(expr) expr
   #define GEM_DEBUG_BUG_ON(expr) GEM_BUG_ON(expr)
+#define GEM_DEBUG_WARN_ON(expr) GEM_WARN_ON(expr)
   
   #else
   
   #define GEM_SHOW_DEBUG() (0)
   
   #define GEM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)

-#define GEM_WARN_ON(expr) (BUILD_BUG_ON_INVALID(expr), 0)
+#define GEM_WARN_ON(expr) ({ unlikely(!!(expr)); })
   
   #define GEM_DEBUG_DECL(var)

   #define GEM_DEBUG_EXEC(expr) do { } while (0)
   #define GEM_DEBUG_BUG_ON(expr)
+#define GEM_DEBUG_WARN_ON(expr) ({ BUILD_BUG_ON_INVALID(expr); 0; })
   #endif
   
   #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 31efc971a3a8..82652c3d1bed 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -305,12 +305,12 @@ int i915_vma_bind(struct i915_vma *vma, enum 
i915_cache_level cache_level,
GEM_BUG_ON(!drm_mm_node_allocated(>node));
GEM_BUG_ON(vma->size > vma->node.size);
   
-	if (GEM_WARN_ON(range_overflows(vma->node.start,

-   vma->node.size,
-   vma->vm->total)))
+   if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start,
+ vma->node.size,
+ vma->vm->total)))
return -ENODEV;
   
-	if (GEM_WARN_ON(!flags))

+   if (GEM_DEBUG_WARN_ON(!flags))
return -EINVAL;
   
   	bind_flags = 0;

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 10cd051ba29e..8dbdb18b2668 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -273,13 +273,13 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH));
BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH));
   
-	if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS))

+   if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS))
return -EINVAL;
   
-	if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))

+   if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
return -EINVAL;
   
-	if 

Re: [Intel-gfx] [PATCH 08/21] drm/i915/guc: Make use of the SW counter field in the context descriptor

2018-08-31 Thread Lis, Tomasz



On 2018-08-30 16:15, Lis, Tomasz wrote:



On 2018-08-30 02:08, Lionel Landwerlin wrote:

On 29/08/2018 20:16, Michal Wajdeczko wrote:

The new context descriptor format contains two assignable fields:
the SW Context ID (technically 11 bits, but practically limited to 2032
entries due to some being reserved for future use by the GuC) and the
SW Counter (6 bits).

We don't want to limit ourselves too much in the maximum number of
concurrent contexts we want to allow, so ideally we want to employ
every possible bit available. Unfortunately, a further limitation in
the interface with the GuC means the combination of SW Context ID +
SW Counter has to be unique within the same engine class (as we use
the SW Context ID to index in the GuC stage descriptor pool, and the
Engine Class + SW Counter to index in the 2-dimensional lrc array).
This essentially means we need to somehow encode the engine instance.

Since the BSpec allows 6 bits for engine instance, we use the whole
SW counter for this task. If the limitation of 2032 maximum 
simultaneous

contexts is too restrictive, we can always squeeze things a bit more
(3 extras bits for hw_id, 3 bits for instance) and things will still
work (Gen11 does not instance more than 8 engines of any class).

Another alternative would be to generate the hw_id per HW context
instead of per GEM context, but that has other problems (e.g. maximum
number of user-created contexts would be variable, no relationship
between a GuC principal descriptor and the proxy descriptor it uses, 
...)


Bspec: 12254

Signed-off-by: Oscar Mateo 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Michal Wajdeczko 
Cc: Joonas Lahtinen 
Cc: Daniele Ceraolo Spurio 
Cc: Michel Thierry 

Tested-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/i915_drv.h | 15 +++
  drivers/gpu/drm/i915/i915_gem_context.c |  5 -
  drivers/gpu/drm/i915/i915_gem_context.h |  2 ++
  drivers/gpu/drm/i915/i915_reg.h |  2 ++
  drivers/gpu/drm/i915/intel_lrc.c    | 12 +---
  5 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index e5b9d3c..34f5495 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1866,14 +1866,21 @@ struct drm_i915_private {
  struct llist_head free_list;
  struct work_struct free_work;
  -    /* The hw wants to have a stable context identifier for the
+    /*
+ * The HW wants to have a stable context identifier for the
   * lifetime of the context (for OA, PASID, faults, etc).
   * This is limited in execlists to 21 bits.
+ * In enhanced execlist (GEN11+) this is limited to 11 bits
+ * (the SW Context ID field) but GuC limits it a bit further
+ * (11 bits - 16) due to some entries being reserved for 
future

+ * use (so the firmware only supports a GuC stage descriptor
+ * pool of 2032 entries).
   */
  struct ida hw_ida;
-#define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */
-#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */
-#define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */
+#define MAX_CONTEXT_HW_ID    (1 << 21) /* exclusive */
+#define MAX_GUC_CONTEXT_HW_ID    (1 << 20) /* exclusive */
+#define GEN11_MAX_CONTEXT_HW_ID    (1 << 11) /* exclusive */
+#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC (GEN11_MAX_CONTEXT_HW_ID - 
16)

  } contexts;
    u32 fdi_rx_config;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c

index f15a039..e3b500c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -209,7 +209,10 @@ static int assign_hw_id(struct drm_i915_private 
*dev_priv, unsigned *out)

  unsigned int max;
    if (INTEL_GEN(dev_priv) >= 11) {
-    max = GEN11_MAX_CONTEXT_HW_ID;
+    if (USES_GUC_SUBMISSION(dev_priv))
+    max = GEN11_MAX_CONTEXT_HW_ID_WITH_GUC;
+    else
+    max = GEN11_MAX_CONTEXT_HW_ID;
  } else {
  /*
   * When using GuC in proxy submission, GuC consumes the
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
b/drivers/gpu/drm/i915/i915_gem_context.h

index 851dad6..4b87f5d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -154,6 +154,8 @@ struct i915_gem_context {
  struct intel_ring *ring;
  u32 *lrc_reg_state;
  u64 lrc_desc;
+    u32 sw_context_id;
+    u32 sw_counter;
  int pin_count;
    const struct intel_context_ops *ops;
diff --git a/drivers/gpu/drm/i915/i915_reg.h 
b/drivers/gpu/drm/i915/i915_reg.h

index f232178..ea65d7b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3900,6 +3900,8 @@ enum {
  #define GEN8_CTX_ID_WIDTH 21
  #define GEN11_SW_CTX_ID_SHIFT 37
  #d

Re: [Intel-gfx] [PATCH 06/21] drm/i915/guc: Use guc_class instead of engine_class in fw interface

2018-08-30 Thread Lis, Tomasz

Uhh, sorry - answered on wrong patch.

Please ignore this one.

-Tomasz

On 2018-08-30 15:29, Lis, Tomasz wrote:



On 2018-08-30 02:16, Lionel Landwerlin wrote:

On 29/08/2018 20:58, Michel Thierry wrote:

+Lionel
(please see below as this touches the lrca format & relates to OA 
reporting too)


On 8/29/2018 12:10 PM, Michal Wajdeczko wrote:

Until now the GuC and HW engine class has been the same, which allowed
us to use them interchangeable. But it is better to start doing the
right thing and use the GuC definitions for the firmware interface.

We also keep the same class id in the ctx descriptor to be able to 
have

the same values in the driver and firmware logs.

Signed-off-by: Michel Thierry 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Cc: Michel Thierry 
Cc: Lucas De Marchi 
Cc: Tomasz Lis 

Tested-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/intel_engine_cs.c  | 13 +
  drivers/gpu/drm/i915/intel_guc_fwif.h   |  7 +++
  drivers/gpu/drm/i915/intel_lrc.c    | 10 +-
  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
  4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c

index 1a34e8f..bc81354 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -85,6 +85,7 @@ struct engine_info {
  unsigned int hw_id;
  unsigned int uabi_id;
  u8 class;
+    u8 guc_class;
  u8 instance;
  /* mmio bases table *must* be sorted in reverse gen order */
  struct engine_mmio_base {
@@ -98,6 +99,7 @@ struct engine_info {
  .hw_id = RCS_HW,
  .uabi_id = I915_EXEC_RENDER,
  .class = RENDER_CLASS,
+    .guc_class = GUC_RENDER_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 1, .base = RENDER_RING_BASE }
@@ -107,6 +109,7 @@ struct engine_info {
  .hw_id = BCS_HW,
  .uabi_id = I915_EXEC_BLT,
  .class = COPY_ENGINE_CLASS,
+    .guc_class = GUC_BLITTER_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 6, .base = BLT_RING_BASE }
@@ -116,6 +119,7 @@ struct engine_info {
  .hw_id = VCS_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD_RING_BASE },
@@ -127,6 +131,7 @@ struct engine_info {
  .hw_id = VCS2_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 1,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD2_RING_BASE },
@@ -137,6 +142,7 @@ struct engine_info {
  .hw_id = VCS3_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 2,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD3_RING_BASE }
@@ -146,6 +152,7 @@ struct engine_info {
  .hw_id = VCS4_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 3,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD4_RING_BASE }
@@ -155,6 +162,7 @@ struct engine_info {
  .hw_id = VECS_HW,
  .uabi_id = I915_EXEC_VEBOX,
  .class = VIDEO_ENHANCEMENT_CLASS,
+    .guc_class = GUC_VIDEOENHANCE_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_VEBOX_RING_BASE },
@@ -165,6 +173,7 @@ struct engine_info {
  .hw_id = VECS2_HW,
  .uabi_id = I915_EXEC_VEBOX,
  .class = VIDEO_ENHANCEMENT_CLASS,
+    .guc_class = GUC_VIDEOENHANCE_CLASS,
  .instance = 1,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_VEBOX2_RING_BASE }
@@ -276,6 +285,9 @@ static void __sprint_engine_name(char *name, 
const struct engine_info *info)

  if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS))
  return -EINVAL;
  +    if (GEM_WARN_ON(info->guc_class >= GUC_MAX_ENGINE_CLASSES))
+    return -EINVAL;
+
  if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
  return -EINVAL;
  @@ -291,6 +303,7 @@ static void __sprint_engine_name(char *name, 
const struct engine_info *info)

  engine->i915 = dev_priv;
  __sprint_engine_name(engine->name, info);
  engine->hw_id = engine->guc_id = info->hw_id;
+    engine->guc_class = info->guc_class;
  engine->mmio_base = __engine_mmio_base(dev_priv, 
info->mmio_bases);

  engine->class = info->class;
  engine->instance = info->instance;
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h

index 963da91..5b7a05b 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.

Re: [Intel-gfx] [PATCH 08/21] drm/i915/guc: Make use of the SW counter field in the context descriptor

2018-08-30 Thread Lis, Tomasz



On 2018-08-30 02:08, Lionel Landwerlin wrote:

On 29/08/2018 20:16, Michal Wajdeczko wrote:

The new context descriptor format contains two assignable fields:
the SW Context ID (technically 11 bits, but practically limited to 2032
entries due to some being reserved for future use by the GuC) and the
SW Counter (6 bits).

We don't want to limit ourselves too much in the maximum number of
concurrent contexts we want to allow, so ideally we want to employ
every possible bit available. Unfortunately, a further limitation in
the interface with the GuC means the combination of SW Context ID +
SW Counter has to be unique within the same engine class (as we use
the SW Context ID to index in the GuC stage descriptor pool, and the
Engine Class + SW Counter to index in the 2-dimensional lrc array).
This essentially means we need to somehow encode the engine instance.

Since the BSpec allows 6 bits for engine instance, we use the whole
SW counter for this task. If the limitation of 2032 maximum simultaneous
contexts is too restrictive, we can always squeeze things a bit more
(3 extras bits for hw_id, 3 bits for instance) and things will still
work (Gen11 does not instance more than 8 engines of any class).

Another alternative would be to generate the hw_id per HW context
instead of per GEM context, but that has other problems (e.g. maximum
number of user-created contexts would be variable, no relationship
between a GuC principal descriptor and the proxy descriptor it uses, 
...)


Bspec: 12254

Signed-off-by: Oscar Mateo 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Michal Wajdeczko 
Cc: Joonas Lahtinen 
Cc: Daniele Ceraolo Spurio 
Cc: Michel Thierry 

Tested-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/i915_drv.h | 15 +++
  drivers/gpu/drm/i915/i915_gem_context.c |  5 -
  drivers/gpu/drm/i915/i915_gem_context.h |  2 ++
  drivers/gpu/drm/i915/i915_reg.h |  2 ++
  drivers/gpu/drm/i915/intel_lrc.c    | 12 +---
  5 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index e5b9d3c..34f5495 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1866,14 +1866,21 @@ struct drm_i915_private {
  struct llist_head free_list;
  struct work_struct free_work;
  -    /* The hw wants to have a stable context identifier for the
+    /*
+ * The HW wants to have a stable context identifier for the
   * lifetime of the context (for OA, PASID, faults, etc).
   * This is limited in execlists to 21 bits.
+ * In enhanced execlist (GEN11+) this is limited to 11 bits
+ * (the SW Context ID field) but GuC limits it a bit further
+ * (11 bits - 16) due to some entries being reserved for future
+ * use (so the firmware only supports a GuC stage descriptor
+ * pool of 2032 entries).
   */
  struct ida hw_ida;
-#define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */
-#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */
-#define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */
+#define MAX_CONTEXT_HW_ID    (1 << 21) /* exclusive */
+#define MAX_GUC_CONTEXT_HW_ID    (1 << 20) /* exclusive */
+#define GEN11_MAX_CONTEXT_HW_ID    (1 << 11) /* exclusive */
+#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC (GEN11_MAX_CONTEXT_HW_ID - 16)
  } contexts;
    u32 fdi_rx_config;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c

index f15a039..e3b500c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -209,7 +209,10 @@ static int assign_hw_id(struct drm_i915_private 
*dev_priv, unsigned *out)

  unsigned int max;
    if (INTEL_GEN(dev_priv) >= 11) {
-    max = GEN11_MAX_CONTEXT_HW_ID;
+    if (USES_GUC_SUBMISSION(dev_priv))
+    max = GEN11_MAX_CONTEXT_HW_ID_WITH_GUC;
+    else
+    max = GEN11_MAX_CONTEXT_HW_ID;
  } else {
  /*
   * When using GuC in proxy submission, GuC consumes the
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
b/drivers/gpu/drm/i915/i915_gem_context.h

index 851dad6..4b87f5d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -154,6 +154,8 @@ struct i915_gem_context {
  struct intel_ring *ring;
  u32 *lrc_reg_state;
  u64 lrc_desc;
+    u32 sw_context_id;
+    u32 sw_counter;
  int pin_count;
    const struct intel_context_ops *ops;
diff --git a/drivers/gpu/drm/i915/i915_reg.h 
b/drivers/gpu/drm/i915/i915_reg.h

index f232178..ea65d7b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3900,6 +3900,8 @@ enum {
  #define GEN8_CTX_ID_WIDTH 21
  #define GEN11_SW_CTX_ID_SHIFT 37
  #define GEN11_SW_CTX_ID_WIDTH 11
+#define GEN11_SW_COUNTER_SHIFT 55
+#define GEN11_SW_COUNTER_WIDTH 

Re: [Intel-gfx] [PATCH 06/21] drm/i915/guc: Use guc_class instead of engine_class in fw interface

2018-08-30 Thread Lis, Tomasz



On 2018-08-30 02:16, Lionel Landwerlin wrote:

On 29/08/2018 20:58, Michel Thierry wrote:

+Lionel
(please see below as this touches the lrca format & relates to OA 
reporting too)


On 8/29/2018 12:10 PM, Michal Wajdeczko wrote:

Until now the GuC and HW engine class has been the same, which allowed
us to use them interchangeably. But it is better to start doing the
right thing and use the GuC definitions for the firmware interface.

We also keep the same class id in the ctx descriptor to be able to have
the same values in the driver and firmware logs.

Signed-off-by: Michel Thierry 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Cc: Michel Thierry 
Cc: Lucas De Marchi 
Cc: Tomasz Lis 

Tested-by: Tomasz Lis 

---
  drivers/gpu/drm/i915/intel_engine_cs.c  | 13 +
  drivers/gpu/drm/i915/intel_guc_fwif.h   |  7 +++
  drivers/gpu/drm/i915/intel_lrc.c    | 10 +-
  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
  4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c

index 1a34e8f..bc81354 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -85,6 +85,7 @@ struct engine_info {
  unsigned int hw_id;
  unsigned int uabi_id;
  u8 class;
+    u8 guc_class;
  u8 instance;
  /* mmio bases table *must* be sorted in reverse gen order */
  struct engine_mmio_base {
@@ -98,6 +99,7 @@ struct engine_info {
  .hw_id = RCS_HW,
  .uabi_id = I915_EXEC_RENDER,
  .class = RENDER_CLASS,
+    .guc_class = GUC_RENDER_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 1, .base = RENDER_RING_BASE }
@@ -107,6 +109,7 @@ struct engine_info {
  .hw_id = BCS_HW,
  .uabi_id = I915_EXEC_BLT,
  .class = COPY_ENGINE_CLASS,
+    .guc_class = GUC_BLITTER_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 6, .base = BLT_RING_BASE }
@@ -116,6 +119,7 @@ struct engine_info {
  .hw_id = VCS_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD_RING_BASE },
@@ -127,6 +131,7 @@ struct engine_info {
  .hw_id = VCS2_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 1,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD2_RING_BASE },
@@ -137,6 +142,7 @@ struct engine_info {
  .hw_id = VCS3_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 2,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD3_RING_BASE }
@@ -146,6 +152,7 @@ struct engine_info {
  .hw_id = VCS4_HW,
  .uabi_id = I915_EXEC_BSD,
  .class = VIDEO_DECODE_CLASS,
+    .guc_class = GUC_VIDEO_CLASS,
  .instance = 3,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_BSD4_RING_BASE }
@@ -155,6 +162,7 @@ struct engine_info {
  .hw_id = VECS_HW,
  .uabi_id = I915_EXEC_VEBOX,
  .class = VIDEO_ENHANCEMENT_CLASS,
+    .guc_class = GUC_VIDEOENHANCE_CLASS,
  .instance = 0,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_VEBOX_RING_BASE },
@@ -165,6 +173,7 @@ struct engine_info {
  .hw_id = VECS2_HW,
  .uabi_id = I915_EXEC_VEBOX,
  .class = VIDEO_ENHANCEMENT_CLASS,
+    .guc_class = GUC_VIDEOENHANCE_CLASS,
  .instance = 1,
  .mmio_bases = {
  { .gen = 11, .base = GEN11_VEBOX2_RING_BASE }
@@ -276,6 +285,9 @@ static void __sprint_engine_name(char *name, 
const struct engine_info *info)

  if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS))
  return -EINVAL;
  +    if (GEM_WARN_ON(info->guc_class >= GUC_MAX_ENGINE_CLASSES))
+    return -EINVAL;
+
  if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
  return -EINVAL;
  @@ -291,6 +303,7 @@ static void __sprint_engine_name(char *name, 
const struct engine_info *info)

  engine->i915 = dev_priv;
  __sprint_engine_name(engine->name, info);
  engine->hw_id = engine->guc_id = info->hw_id;
+    engine->guc_class = info->guc_class;
  engine->mmio_base = __engine_mmio_base(dev_priv, 
info->mmio_bases);

  engine->class = info->class;
  engine->instance = info->instance;
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h

index 963da91..5b7a05b 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -39,6 +39,13 @@
  #define GUC_VIDEO_ENGINE2    4
  #define GUC_MAX_ENGINES_NUM    (GUC_VIDEO_ENGINE2 + 1)
  +#define 

Re: [Intel-gfx] [PATCH] drm/i915: Only force GGTT coherency w/a on required chipsets

2018-07-20 Thread Lis, Tomasz



On 2018-07-20 12:19, Chris Wilson wrote:

Not all chipsets have an internal buffer delaying the visibility of
writes via the GGTT being visible by other physical paths, but we use a
very heavy workaround for all. We only need to apply that workarounds to
the chipsets we know suffer from the delay and the resulting coherency
issue.

Similarly, the same inconsistent coherency fouls up our ABI promise that
a write into a mmap_gtt is immediately visible to others. Since the HW
has made that a lie, let userspace know when that contract is broken.
(Not that userspace would want to use mmap_gtt on those chipsets for
other performance reasons...)

Testcase: igt/drv_selftest/live_coherency
Testcase: igt/gem_mmap_gtt/coherency
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100587
Signed-off-by: Chris Wilson 
Cc: Joonas Lahtinen 
Is there any mention of this bug/feature in bspec? Would be nice to have 
a reference.


Reviewed-by: Tomasz Lis 

---
Resend with --function-context
-Chris
---
  drivers/gpu/drm/i915/i915_drv.c  |  3 +++
  drivers/gpu/drm/i915/i915_gem.c  |  5 +
  drivers/gpu/drm/i915/i915_pci.c  | 10 ++
  drivers/gpu/drm/i915/intel_device_info.h |  1 +
  include/uapi/drm/i915_drm.h  | 22 ++
  5 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f8cfd16be534..18a45e7a3d7c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -302,152 +302,155 @@ static void intel_detect_pch(struct drm_i915_private 
*dev_priv)
  static int i915_getparam_ioctl(struct drm_device *dev, void *data,
   struct drm_file *file_priv)
  {
struct drm_i915_private *dev_priv = to_i915(dev);
struct pci_dev *pdev = dev_priv->drm.pdev;
drm_i915_getparam_t *param = data;
int value;
  
  	switch (param->param) {

case I915_PARAM_IRQ_ACTIVE:
case I915_PARAM_ALLOW_BATCHBUFFER:
case I915_PARAM_LAST_DISPATCH:
case I915_PARAM_HAS_EXEC_CONSTANTS:
/* Reject all old ums/dri params. */
return -ENODEV;
case I915_PARAM_CHIPSET_ID:
value = pdev->device;
break;
case I915_PARAM_REVISION:
value = pdev->revision;
break;
case I915_PARAM_NUM_FENCES_AVAIL:
value = dev_priv->num_fence_regs;
break;
case I915_PARAM_HAS_OVERLAY:
value = dev_priv->overlay ? 1 : 0;
break;
case I915_PARAM_HAS_BSD:
value = !!dev_priv->engine[VCS];
break;
case I915_PARAM_HAS_BLT:
value = !!dev_priv->engine[BCS];
break;
case I915_PARAM_HAS_VEBOX:
value = !!dev_priv->engine[VECS];
break;
case I915_PARAM_HAS_BSD2:
value = !!dev_priv->engine[VCS2];
break;
case I915_PARAM_HAS_LLC:
value = HAS_LLC(dev_priv);
break;
case I915_PARAM_HAS_WT:
value = HAS_WT(dev_priv);
break;
case I915_PARAM_HAS_ALIASING_PPGTT:
value = USES_PPGTT(dev_priv);
break;
case I915_PARAM_HAS_SEMAPHORES:
value = HAS_LEGACY_SEMAPHORES(dev_priv);
break;
case I915_PARAM_HAS_SECURE_BATCHES:
value = capable(CAP_SYS_ADMIN);
break;
case I915_PARAM_CMD_PARSER_VERSION:
value = i915_cmd_parser_get_version(dev_priv);
break;
case I915_PARAM_SUBSLICE_TOTAL:
value = sseu_subslice_total(_INFO(dev_priv)->sseu);
if (!value)
return -ENODEV;
break;
case I915_PARAM_EU_TOTAL:
value = INTEL_INFO(dev_priv)->sseu.eu_total;
if (!value)
return -ENODEV;
break;
case I915_PARAM_HAS_GPU_RESET:
value = i915_modparams.enable_hangcheck &&
intel_has_gpu_reset(dev_priv);
if (value && intel_has_reset_engine(dev_priv))
value = 2;
break;
case I915_PARAM_HAS_RESOURCE_STREAMER:
value = HAS_RESOURCE_STREAMER(dev_priv);
break;
case I915_PARAM_HAS_POOLED_EU:
value = HAS_POOLED_EU(dev_priv);
break;
case I915_PARAM_MIN_EU_IN_POOL:
value = INTEL_INFO(dev_priv)->sseu.min_eu_in_pool;
break;
case I915_PARAM_HUC_STATUS:
value = intel_huc_check_status(_priv->huc);
if (value < 0)
return value;
break;
case I915_PARAM_MMAP_GTT_VERSION:
/* Though we've 

Re: [Intel-gfx] [PATCH v6] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-19 Thread Lis, Tomasz



On 2018-07-19 09:12, Joonas Lahtinen wrote:

Quoting Lis, Tomasz (2018-07-18 18:28:32)

On 2018-07-18 16:42, Tvrtko Ursulin wrote:

On 18/07/2018 14:24, Joonas Lahtinen wrote:

Quoting Tomasz Lis (2018-07-16 16:07:16)




+++ b/include/uapi/drm/i915_drm.h
@@ -1456,6 +1456,13 @@ struct drm_i915_gem_context_param {
   #define   I915_CONTEXT_MAX_USER_PRIORITY   1023 /* inclusive */
   #define   I915_CONTEXT_DEFAULT_PRIORITY    0
   #define   I915_CONTEXT_MIN_USER_PRIORITY   -1023 /* inclusive */
+/*
+ * When data port level coherency is enabled, the GPU will update
memory
+ * buffers shared with CPU, by forcing internal cache units to send
memory
+ * writes to higher level caches faster. Enabling data port
coherency has
+ * a performance cost.
+ */

I was under impression this is enabled by default and it can be disabled
for a performance optimization?

This is true, coherency is kept by default. We disable it as a
workaround: performance-related for gen11, and due to minor hardware
issue on previous platforms. See WaForceEnableNonCoherent.

Ok, then you definitely want to rephrase the comment to bake that
information in it. Now it sounds like it needs to be turned on to have
coherency.

I'm not sure if I understand what you're asking for.
Should I emphasize that the feature is disabled unless the flag is set? 
This seems obvious...
Or should I provide the reason why it is disabled on specific platforms? 
This should probably be done within workaround setup, not in user api 
definition. Or maybe it's enough to have it in Bspec? Bspec links are 
provided in the patch.

Or should I just mention the workaround name?
-Tomasz

Regards, Joonas


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-18 Thread Lis, Tomasz



On 2018-07-18 16:42, Tvrtko Ursulin wrote:


On 18/07/2018 14:24, Joonas Lahtinen wrote:

Quoting Tomasz Lis (2018-07-16 16:07:16)
+static int emit_set_data_port_coherency(struct i915_request *rq, 
bool enable)

+{
+   u32 *cs;
+   i915_reg_t reg;
+
+   GEM_BUG_ON(rq->engine->class != RENDER_CLASS);
+   GEM_BUG_ON(INTEL_GEN(rq->i915) < 9);
+
+   cs = intel_ring_begin(rq, 4);
+   if (IS_ERR(cs))
+   return PTR_ERR(cs);
+
+   if (INTEL_GEN(rq->i915) >= 11)
+   reg = ICL_HDC_MODE;
+   else if (INTEL_GEN(rq->i915) >= 10)
+   reg = CNL_HDC_CHICKEN0;
+   else
+   reg = HDC_CHICKEN0;
+
+   *cs++ = MI_LOAD_REGISTER_IMM(1);
+   *cs++ = i915_mmio_reg_offset(reg);
+   /* Enabling coherency means disabling the bit which forces 
it off */


This comment is still spurious, please get rid of the habit of writing
comments about "what" the code is doing, useful comments should be
limited to "why", which is quite self explanatory here, that's the way
the register is.

Ok, I will read the related doc:
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#commenting



+static int
+intel_lr_context_update_data_port_coherency(struct i915_request *rq)
+{
+   struct i915_gem_context *ctx = rq->gem_context;
+   bool enable = test_bit(CONTEXT_DATA_PORT_COHERENT_REQUESTED, 
>flags);

+   int ret;
+
+ lockdep_assert_held(>i915->drm.struct_mutex);
+
+   if (test_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags) 
== enable)

+   return 0;
+
+   ret = emit_set_data_port_coherency(rq, enable);
+
+   if (!ret) {
+   if (enable)
+ __set_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags);
+   else
+ __clear_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags);
+   }


Do we have indication that the hardware feature will be unreliable in
responding to the requests? I don't think you need the differentiation
of requested vs. active. If there is an error, we can just report 
back to
the user as a failed IOCTL. Now it adds unnecessary complication for 
no benefit.


Requested vs active is for implementing the lazy emit.

AFAIR it does propagate the error out of execbuf (although we never 
ever expect it to happen), and this is just to keep the internal 
house-keeping in sync.


Regards,

Tvrtko

@@ -2164,6 +2221,13 @@ static int gen8_emit_flush_render(struct 
i915_request *request,

 /* WaForGAMHang:kbl */
 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
 dc_flush_wa = true;
+
+   /* Emit the switch of data port coherency state if 
needed */


Ditto for spurious comment, just about what the code does.


+++ b/include/uapi/drm/i915_drm.h
@@ -1456,6 +1456,13 @@ struct drm_i915_gem_context_param {
  #define   I915_CONTEXT_MAX_USER_PRIORITY   1023 /* inclusive */
  #define   I915_CONTEXT_DEFAULT_PRIORITY    0
  #define   I915_CONTEXT_MIN_USER_PRIORITY   -1023 /* inclusive */
+/*
+ * When data port level coherency is enabled, the GPU will update 
memory
+ * buffers shared with CPU, by forcing internal cache units to send 
memory
+ * writes to higher level caches faster. Enabling data port 
coherency has

+ * a performance cost.
+ */


I was under impression this is enabled by default and it can be disabled
for a performance optimization?
This is true, coherency is kept by default. We disable it as a 
workaround: performance-related for gen11, and due to minor hardware 
issue on previous platforms. See WaForceEnableNonCoherent.

-Tomasz


Regards, Joonas
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-13 Thread Lis, Tomasz



On 2018-07-13 12:40, Tvrtko Ursulin wrote:


On 12/07/2018 16:10, Tomasz Lis wrote:
The patch adds a parameter to control the data port coherency 
functionality
on a per-context level. When the IOCTL is called, a command to switch 
data
port coherency state is added to the ordered list. All prior requests 
are
executed on old coherency settings, and all exec requests after the 
IOCTL

will use new settings.

Rationale:

The OpenCL driver developers requested a functionality to control cache
coherency at data port level. Keeping the coherency at that level is 
disabled

by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such functionality is
required. Below are answers to basic question explaining background
of the functionality and reasoning for the proposed implementation:

1. Why do we need a coherency enable/disable switch for memory that 
is shared

between CPU and GEN (GPU)?

Memory coherency between CPU and GEN, while being a great feature 
that enables
CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN 
architecture, adds
overhead related to tracking (snooping) memory inside different cache 
units

(L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL
applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence 
require
memory coherency between CPU and GPU). The goal of coherency 
enable/disable
switch is to remove overhead of memory coherency when memory 
coherency is not

needed.

2. Why do we need a global coherency switch?

In order to support I/O commands from within EUs (Execution Units), 
Intel GEN
ISA (GEN Instruction Set Assembly) contains dedicated "send" 
instructions.

These send instructions provide several addressing models. One of these
addressing models (named "stateless") provides most flexible I/O 
using plain
virtual addresses (as opposed to buffer_handle+offset models). This 
"stateless"
model is similar to regular memory load/store operations available on 
typical
CPUs. Since this model provides I/O using arbitrary virtual 
addresses, it
enables algorithmic designs that are based on pointer-to-pointer 
(e.g. buffer

of pointers) concepts. For instance, it allows creating tree-like data
structures such as:
    
   |  NODE1 |
   | uint64_t data  |
   +|
   | NODE*  |  NODE*|
   ++---+
 /  \
    /    \
   |  NODE2 |    |  NODE3 |
   | uint64_t data  |    | uint64_t data  |
   +|    +|
   | NODE*  |  NODE*|    | NODE*  |  NODE*|
   ++---+    ++---+

Please note that pointers inside such structures can point to memory 
locations
in different OCL allocations  - e.g. NODE1 and NODE2 can reside in 
one OCL

allocation while NODE3 resides in a completely separate OCL allocation.
Additionally, such pointers can be shared with CPU (i.e. using SVM - 
Shared
Virtual Memory feature). Using pointers from different allocations 
doesn't
affect the stateless addressing model which even allows scattered 
reading from
different allocations at the same time (i.e. by utilizing SIMD-nature 
of send

instructions).

When it comes to coherency programming, send instructions in 
stateless model
can be encoded (at ISA level) to either use or disable coherency. 
However, for
generic OCL applications (such as example with tree-like data 
structure), OCL
compiler is not able to determine origin of memory pointed to by an 
arbitrary

pointer - i.e. is not able to track given pointer back to a specific
allocation. As such, it's not able to decide whether coherency is 
needed or not
for specific pointer (or for specific I/O instruction). As a result, 
compiler

encodes all stateless sends as coherent (doing otherwise would lead to
functional issues resulting from data corruption). Please note that 
it would be
possible to workaround this (e.g. based on allocations map and 
pointer bounds

checking prior to each I/O instruction) but the performance cost of such
workaround would be many times greater than the cost of keeping 
coherency
always enabled. As such, enabling/disabling memory coherency at GEN 
ISA level

is not feasible and alternative method is needed.

Such alternative solution is to have a global coherency switch that 
allows

disabling coherency for single (though entire) GPU submission. This is
beneficial because this way we:
* can enable (and pay for) coherency only in submissions that 
actually need

coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources)
* don't care about coherency at GEN ISA granularity (no performance 
impact)


3. Will coherency switch be used frequently?

There are scenarios that will require frequent toggling 

Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-11 Thread Lis, Tomasz



On 2018-07-10 20:03, Lis, Tomasz wrote:



On 2018-07-09 18:28, Tvrtko Ursulin wrote:


On 09/07/2018 14:20, Tomasz Lis wrote:


diff --git a/drivers/gpu/drm/i915/intel_lrc.h 
b/drivers/gpu/drm/i915/intel_lrc.h

index 1593194..f6965ae 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
[...]
+/*
+ * When data port level coherency is enabled, the GPU will update 
memory
+ * buffers shared with CPU, by forcing internal cache units to send 
memory
+ * writes to real RAM faster. Keeping such coherency has 
performance cost.


Is this comment correct? Is it actually sending memory writes to 
_RAM_, or just the coherency mode enabled, even if only targetting 
CPU or shared cache, which adds a cost?
I'm not sure whether there are further coherency modes to choose how 
"deep" coherency goes. The use case of OCL Team is to see gradual 
changes in the buffers on CPU side while the execution progresses. 
Write to RAM is needed to achieve that. And that limits performance by 
using RAM bandwidth.


It was pointed out to me that last level cache is shared between CPU and 
GPU on non-atoms. Which means my argument was invalid, an most likely 
the coherency option does not enforce RAM write. I will update the comment.

-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-10 Thread Lis, Tomasz



On 2018-07-09 18:28, Tvrtko Ursulin wrote:


On 09/07/2018 14:20, Tomasz Lis wrote:
The patch adds a parameter to control the data port coherency 
functionality
on a per-context level. When the IOCTL is called, a command to switch 
data
port coherency state is added to the ordered list. All prior requests 
are
executed on old coherency settings, and all exec requests after the 
IOCTL

will use new settings.

Rationale:

The OpenCL driver developers requested a functionality to control cache
coherency at data port level. Keeping the coherency at that level is 
disabled

by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such functionality is
required. Below are answers to basic question explaining background
of the functionality and reasoning for the proposed implementation:

1. Why do we need a coherency enable/disable switch for memory that 
is shared

between CPU and GEN (GPU)?

Memory coherency between CPU and GEN, while being a great feature 
that enables
CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN 
architecture, adds
overhead related to tracking (snooping) memory inside different cache 
units

(L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL
applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence 
require
memory coherency between CPU and GPU). The goal of coherency 
enable/disable
switch is to remove overhead of memory coherency when memory 
coherency is not

needed.

2. Why do we need a global coherency switch?

In order to support I/O commands from within EUs (Execution Units), 
Intel GEN
ISA (GEN Instruction Set Assembly) contains dedicated "send" 
instructions.

These send instructions provide several addressing models. One of these
addressing models (named "stateless") provides most flexible I/O 
using plain
virtual addresses (as opposed to buffer_handle+offset models). This 
"stateless"
model is similar to regular memory load/store operations available on 
typical
CPUs. Since this model provides I/O using arbitrary virtual 
addresses, it
enables algorithmic designs that are based on pointer-to-pointer 
(e.g. buffer

of pointers) concepts. For instance, it allows creating tree-like data
structures such as:
    
   |  NODE1 |
   | uint64_t data  |
   +|
   | NODE*  |  NODE*|
   ++---+
 /  \
    /    \
   |  NODE2 |    |  NODE3 |
   | uint64_t data  |    | uint64_t data  |
   +|    +|
   | NODE*  |  NODE*|    | NODE*  |  NODE*|
   ++---+    ++---+

Please note that pointers inside such structures can point to memory 
locations
in different OCL allocations  - e.g. NODE1 and NODE2 can reside in 
one OCL

allocation while NODE3 resides in a completely separate OCL allocation.
Additionally, such pointers can be shared with CPU (i.e. using SVM - 
Shared
Virtual Memory feature). Using pointers from different allocations 
doesn't
affect the stateless addressing model which even allows scattered 
reading from
different allocations at the same time (i.e. by utilizing SIMD-nature 
of send

instructions).

When it comes to coherency programming, send instructions in 
stateless model
can be encoded (at ISA level) to either use or disable coherency. 
However, for
generic OCL applications (such as example with tree-like data 
structure), OCL
compiler is not able to determine origin of memory pointed to by an 
arbitrary

pointer - i.e. is not able to track given pointer back to a specific
allocation. As such, it's not able to decide whether coherency is 
needed or not
for specific pointer (or for specific I/O instruction). As a result, 
compiler

encodes all stateless sends as coherent (doing otherwise would lead to
functional issues resulting from data corruption). Please note that 
it would be
possible to workaround this (e.g. based on allocations map and 
pointer bounds

checking prior to each I/O instruction) but the performance cost of such
workaround would be many times greater than the cost of keeping 
coherency
always enabled. As such, enabling/disabling memory coherency at GEN 
ISA level

is not feasible and alternative method is needed.

Such alternative solution is to have a global coherency switch that 
allows

disabling coherency for single (though entire) GPU submission. This is
beneficial because this way we:
* can enable (and pay for) coherency only in submissions that 
actually need

coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources)
* don't care about coherency at GEN ISA granularity (no performance 
impact)


3. Will coherency switch be used frequently?

There are scenarios that will require frequent toggling 

Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-10 Thread Lis, Tomasz



On 2018-07-09 18:37, Chris Wilson wrote:

Quoting Tvrtko Ursulin (2018-07-09 17:28:02)

On 09/07/2018 14:20, Tomasz Lis wrote:

+static int i915_gem_context_clear_data_port_coherent(struct i915_gem_context 
*ctx)
+{
+ int ret;
+
+ ret = intel_lr_context_modify_data_port_coherency(ctx, false);
+ if (!ret)
+ __clear_bit(CONTEXT_DATA_PORT_COHERENT, >flags);
+ return ret;

Is there a good reason you allow userspace to keep emitting unlimited
number of commands which actually do not change the status? If not
please consider gating the command emission with
test_and_set_bit/test_and_clear_bit. Hm.. apart even with that they
could keep toggling ad infinitum with no real work in between. Has it
been considered to only save the desired state in set param and then
emit it, if needed, before next execbuf? Minor thing in any case, just
curious since I wasn't following the threads.

The first patch tried to add a bit to execbuf, and having been
mistakenly down that road before, we asked if there was any alternative.
(Now if you've also been following execbuf3 conversations, having a
packet for privileged LRI is definitely something we want.)

Setting the value in the context register is precisely what we want to
do, and trivially serialised with execbuf since we have to serialise
reservation of ring space, i.e. the normal rules of request generation.
(execbuf is just a client and nothing special). From that point of view,
we only care about frequency, if it is very frequent it should be
controlled by userspace inside the batch (but it can't due to there
being dangerous bits inside the reg aiui). At the other end of the
scale, is context_setparam for set-once. And there should be no
inbetween as that requires costly batch flushes.
-Chris

Joonas did brought that concern in his review; here it is, with my response:

On 2018-06-21 15:47, Lis, Tomasz wrote:

On 2018-06-21 08:39, Joonas Lahtinen wrote:
I'm thinking we should set this value when it has changed, when we 
insert the

requests into the command stream. So if you change back and forth, while
not emitting any requests, nothing really happens. If you change the 
value and

emit a request, we should emit a LRI before the jump to the commands.
Similary if you keep setting the value to the value it already was in,
nothing will happen, again.

When I considered that, my way of reasoning was:
If we execute the flag changing buffer right away, it may be sent to 
hardware faster if there is no job in progress.
If we use the lazy way, and trigger the change just before submission 
-  there will be additional conditions in submission code, plus the 
change will be made when there is another job pending (though it's not 
a considerable payload to just switch a flag).
If user space switches the flag back and forth without much sense, 
then there is something wrong with the user space driver, and it 
shouldn't be up to kernel to fix that.


This is why I chosen the current approach. But I can change it if you 
wish.


So while I think the current solution is better from performance 
standpoint, but I will change it if you request that.

-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-09 Thread Lis, Tomasz



On 2018-07-09 16:24, Lionel Landwerlin wrote:

On 09/07/18 15:03, Lis, Tomasz wrote:



On 2018-07-09 15:48, Lionel Landwerlin wrote:

On 09/07/18 14:20, Tomasz Lis wrote:
The patch adds a parameter to control the data port coherency 
functionality
on a per-context level. When the IOCTL is called, a command to 
switch data
port coherency state is added to the ordered list. All prior 
requests are
executed on old coherency settings, and all exec requests after the 
IOCTL

will use new settings.

Rationale:

The OpenCL driver developers requested a functionality to control cache
coherency at data port level. Keeping the coherency at that level 
is disabled

by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such 
functionality is

required. Below are answers to basic question explaining background
of the functionality and reasoning for the proposed implementation:

1. Why do we need a coherency enable/disable switch for memory that 
is shared

between CPU and GEN (GPU)?

Memory coherency between CPU and GEN, while being a great feature 
that enables
CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN 
architecture, adds
overhead related to tracking (snooping) memory inside different 
cache units

(L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL
applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence 
require
memory coherency between CPU and GPU). The goal of coherency 
enable/disable
switch is to remove overhead of memory coherency when memory 
coherency is not

needed.

2. Why do we need a global coherency switch?

In order to support I/O commands from within EUs (Execution Units), 
Intel GEN
ISA (GEN Instruction Set Assembly) contains dedicated "send" 
instructions.
These send instructions provide several addressing models. One of 
these
addressing models (named "stateless") provides most flexible I/O 
using plain
virtual addresses (as opposed to buffer_handle+offset models). This 
"stateless"
model is similar to regular memory load/store operations available 
on typical
CPUs. Since this model provides I/O using arbitrary virtual 
addresses, it
enables algorithmic designs that are based on pointer-to-pointer 
(e.g. buffer

of pointers) concepts. For instance, it allows creating tree-like data
structures such as:
    
   |  NODE1 |
   | uint64_t data  |
   +|
   | NODE*  |  NODE*|
   ++---+
 /  \
    /    \
   |  NODE2 |    |  NODE3 |
   | uint64_t data  |    | uint64_t data  |
   +|    +|
   | NODE*  |  NODE*|    | NODE*  |  NODE*|
   ++---+    ++---+

Please note that pointers inside such structures can point to 
memory locations
in different OCL allocations  - e.g. NODE1 and NODE2 can reside in 
one OCL
allocation while NODE3 resides in a completely separate OCL 
allocation.
Additionally, such pointers can be shared with CPU (i.e. using SVM 
- Shared
Virtual Memory feature). Using pointers from different allocations 
doesn't
affect the stateless addressing model which even allows scattered 
reading from
different allocations at the same time (i.e. by utilizing 
SIMD-nature of send

instructions).

When it comes to coherency programming, send instructions in 
stateless model
can be encoded (at ISA level) to either use or disable coherency. 
However, for
generic OCL applications (such as example with tree-like data 
structure), OCL
compiler is not able to determine origin of memory pointed to by an 
arbitrary

pointer - i.e. is not able to track given pointer back to a specific
allocation. As such, it's not able to decide whether coherency is 
needed or not
for specific pointer (or for specific I/O instruction). As a 
result, compiler

encodes all stateless sends as coherent (doing otherwise would lead to
functional issues resulting from data corruption). Please note that 
it would be
possible to workaround this (e.g. based on allocations map and 
pointer bounds
checking prior to each I/O instruction) but the performance cost of 
such
workaround would be many times greater than the cost of keeping 
coherency
always enabled. As such, enabling/disabling memory coherency at GEN 
ISA level

is not feasible and alternative method is needed.

Such alternative solution is to have a global coherency switch that 
allows

disabling coherency for single (though entire) GPU submission. This is
beneficial because this way we:
* can enable (and pay for) coherency only in submissions that 
actually need
coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER 
resources)
* don't care about coherency at GEN ISA granularity (no performance impact)

Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.

2018-07-09 Thread Lis, Tomasz



On 2018-07-09 15:48, Lionel Landwerlin wrote:

On 09/07/18 14:20, Tomasz Lis wrote:
The patch adds a parameter to control the data port coherency 
functionality
on a per-context level. When the IOCTL is called, a command to switch 
data
port coherency state is added to the ordered list. All prior requests 
are
executed on old coherency settings, and all exec requests after the 
IOCTL

will use new settings.

Rationale:

The OpenCL driver developers requested functionality to control cache
coherency at data port level. Keeping the coherency at that level is 
disabled

by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such functionality is
required. Below are answers to basic questions explaining the background
of the functionality and reasoning for the proposed implementation:

1. Why do we need a coherency enable/disable switch for memory that 
is shared

between CPU and GEN (GPU)?

Memory coherency between CPU and GEN, while being a great feature 
that enables
CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN 
architecture, adds
overhead related to tracking (snooping) memory inside different cache 
units

(L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL
applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence 
require
memory coherency between CPU and GPU). The goal of coherency 
enable/disable
switch is to remove overhead of memory coherency when memory 
coherency is not

needed.

2. Why do we need a global coherency switch?

In order to support I/O commands from within EUs (Execution Units), 
Intel GEN
ISA (GEN Instruction Set Assembly) contains dedicated "send" 
instructions.

These send instructions provide several addressing models. One of these
addressing models (named "stateless") provides most flexible I/O 
using plain
virtual addresses (as opposed to buffer_handle+offset models). This 
"stateless"
model is similar to regular memory load/store operations available on 
typical
CPUs. Since this model provides I/O using arbitrary virtual 
addresses, it
enables algorithmic designs that are based on pointer-to-pointer 
(e.g. buffer

of pointers) concepts. For instance, it allows creating tree-like data
structures such as:
    
   |  NODE1 |
   | uint64_t data  |
   +|
   | NODE*  |  NODE*|
   ++---+
 /  \
    /    \
   |  NODE2 |    |  NODE3 |
   | uint64_t data  |    | uint64_t data  |
   +|    +|
   | NODE*  |  NODE*|    | NODE*  |  NODE*|
   ++---+    ++---+

Please note that pointers inside such structures can point to memory 
locations
in different OCL allocations  - e.g. NODE1 and NODE2 can reside in 
one OCL

allocation while NODE3 resides in a completely separate OCL allocation.
Additionally, such pointers can be shared with CPU (i.e. using SVM - 
Shared
Virtual Memory feature). Using pointers from different allocations 
doesn't
affect the stateless addressing model which even allows scattered 
reading from
different allocations at the same time (i.e. by utilizing SIMD-nature 
of send

instructions).

When it comes to coherency programming, send instructions in 
stateless model
can be encoded (at ISA level) to either use or disable coherency. 
However, for
generic OCL applications (such as example with tree-like data 
structure), OCL
compiler is not able to determine origin of memory pointed to by an 
arbitrary

pointer - i.e. is not able to track given pointer back to a specific
allocation. As such, it's not able to decide whether coherency is 
needed or not
for specific pointer (or for specific I/O instruction). As a result, 
compiler

encodes all stateless sends as coherent (doing otherwise would lead to
functional issues resulting from data corruption). Please note that 
it would be
possible to workaround this (e.g. based on allocations map and 
pointer bounds

checking prior to each I/O instruction) but the performance cost of such
workaround would be many times greater than the cost of keeping 
coherency
always enabled. As such, enabling/disabling memory coherency at GEN 
ISA level

is not feasible and alternative method is needed.

Such alternative solution is to have a global coherency switch that 
allows

disabling coherency for single (though entire) GPU submission. This is
beneficial because this way we:
* can enable (and pay for) coherency only in submissions that 
actually need

coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources)
* don't care about coherency at GEN ISA granularity (no performance 
impact)


3. Will coherency switch be used frequently?

There are scenarios that will require frequent toggling 

Re: [Intel-gfx] [PATCH v4] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-06-29 Thread Lis, Tomasz



On 2018-06-11 18:37, Daniele Ceraolo Spurio wrote:



On 25/05/18 11:26, Tomasz Lis wrote:

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result 
from

Context Status Buffer.

Preemption in previous gens required a special batch buffer to be 
executed,
so the Command Streamer never preempted to idle directly. In Icelake 
it is

possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

v2: Added needs_preempt_context() change so that it is not created when
 preempt-to-idle is supported. (Chris)
 Updated setting HWACK flag so that it is cleared after
  preempt-to-idle. (Chris, Daniele)
 Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
 Merged preemption trigger functions to one. (Chris)
  Fixed context state to not assume COMPLETED_MASK after preemption,
 since idle-to-idle case will not have it set.

v4: Simplified needs_preempt_context() change. (Daniele)
 Removed clearing HWACK flag in idle-to-idle preempt. (Daniele)

Cc: Joonas Lahtinen 
Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Cc: Michal Winiarski 
Cc: Mika Kuoppala 
Bspec: 18922
Signed-off-by: Tomasz Lis 
---
  drivers/gpu/drm/i915/i915_drv.h  |   2 +
  drivers/gpu/drm/i915/i915_gem_context.c  |   3 +-
  drivers/gpu/drm/i915/i915_pci.c  |   3 +-
  drivers/gpu/drm/i915/intel_device_info.h |   1 +
  drivers/gpu/drm/i915/intel_lrc.c | 113 
+--

  drivers/gpu/drm/i915/intel_lrc.h |   1 +
  6 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 487922f..35eddf7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2534,6 +2534,8 @@ intel_info(const struct drm_i915_private 
*dev_priv)

  ((dev_priv)->info.has_logical_ring_elsq)
  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
  ((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+    ((dev_priv)->info.has_hw_preempt_to_idle)
    #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
  diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c

index 45393f6..341a5ff 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -455,7 +455,8 @@ destroy_kernel_context(struct i915_gem_context 
**ctxp)

    static bool needs_preempt_context(struct drm_i915_private *i915)
  {
-    return HAS_LOGICAL_RING_PREEMPTION(i915);
+    return HAS_LOGICAL_RING_PREEMPTION(i915) &&
+   !HAS_HW_PREEMPT_TO_IDLE(i915);
  }
    int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_pci.c 
b/drivers/gpu/drm/i915/i915_pci.c

index 97a91e6a..ee09926 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -593,7 +593,8 @@ static const struct intel_device_info 
intel_cannonlake_info = {

  GEN(11), \
  .ddb_size = 2048, \
  .has_csr = 0, \
-    .has_logical_ring_elsq = 1
+    .has_logical_ring_elsq = 1, \
+    .has_hw_preempt_to_idle = 1
    static const struct intel_device_info intel_icelake_11_info = {
  GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h

index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
  func(has_logical_ring_contexts); \
  func(has_logical_ring_elsq); \
  func(has_logical_ring_preemption); \
+    func(has_hw_preempt_to_idle); \
  func(has_overlay); \
  func(has_pooled_eu); \
  func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
b/drivers/gpu/drm/i915/intel_lrc.c

index 8a6058b..f95cb37 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -154,6 +154,7 @@
  #define GEN8_CTX_STATUS_ACTIVE_IDLE    (1 << 3)
  #define GEN8_CTX_STATUS_COMPLETE    (1 << 4)
  #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE    (1 << 29)
    #define GEN8_CTX_STATUS_COMPLETED_MASK \
   (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -522,31 +523,46 @@ static void port_assign(struct execlist_port 
*port, struct i915_request *rq)

  static void inject_preempt_context(struct intel_engine_cs *engine)


continuing the discussion from the previous patch, I still think that 
we should rename this function now that it doesn't inject a context on 
some gens. A new function name should be relatively trivial to handle 
from other patch series hitting the area (compared to having a second 
function).

Ok, will rename it then.

Re: [Intel-gfx] [PATCH v1] drm/i915: Add IOCTL Param to control data port coherency.

2018-06-21 Thread Lis, Tomasz



On 2018-06-21 08:39, Joonas Lahtinen wrote:

Changelog would be much appreciated. And this is not the first version
of the series. It helps to remind the reviewer that original
implementation was changed into IOCTl based on feedback. Please see the
git log in i915 for some examples.
Will add. I considered this a separate series, as it is a different 
implementation.


Quoting Tomasz Lis (2018-06-20 18:03:07)

The patch adds a parameter to control the data port coherency functionality
on a per-context level. When the IOCTL is called, a command to switch data
port coherency state is added to the ordered list. All prior requests are
executed on old coherency settings, and all exec requests after the IOCTL
will use new settings.

Rationale:

The OpenCL driver developers requested functionality to control cache
coherency at data port level. Keeping the coherency at that level is disabled
by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such functionality is
required. Below are answers to basic questions explaining the background
of the functionality and reasoning for the proposed implementation:

1. Why do we need a coherency enable/disable switch for memory that is shared
between CPU and GEN (GPU)?

Memory coherency between CPU and GEN, while being a great feature that enables
CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds
overhead related to tracking (snooping) memory inside different cache units
(L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL
applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require
memory coherency between CPU and GPU). The goal of coherency enable/disable
switch is to remove overhead of memory coherency when memory coherency is not
needed.

2. Why do we need a global coherency switch?

In order to support I/O commands from within EUs (Execution Units), Intel GEN
ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions.
These send instructions provide several addressing models. One of these
addressing models (named "stateless") provides most flexible I/O using plain
virtual addresses (as opposed to buffer_handle+offset models). This "stateless"
model is similar to regular memory load/store operations available on typical
CPUs. Since this model provides I/O using arbitrary virtual addresses, it
enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer
of pointers) concepts. For instance, it allows creating tree-like data
structures such as:

   |  NODE1 |
   | uint64_t data  |
   +|
   | NODE*  |  NODE*|
   ++---+
 /  \
/\
   |  NODE2 ||  NODE3 |
   | uint64_t data  || uint64_t data  |
   +|+|
   | NODE*  |  NODE*|| NODE*  |  NODE*|
   ++---+++---+

Please note that pointers inside such structures can point to memory locations
in different OCL allocations  - e.g. NODE1 and NODE2 can reside in one OCL
allocation while NODE3 resides in a completely separate OCL allocation.
Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared
Virtual Memory feature). Using pointers from different allocations doesn't
affect the stateless addressing model which even allows scattered reading from
different allocations at the same time (i.e. by utilizing SIMD-nature of send
instructions).

When it comes to coherency programming, send instructions in stateless model
can be encoded (at ISA level) to either use or disable coherency. However, for
generic OCL applications (such as example with tree-like data structure), OCL
compiler is not able to determine origin of memory pointed to by an arbitrary
pointer - i.e. is not able to track given pointer back to a specific
allocation. As such, it's not able to decide whether coherency is needed or not
for specific pointer (or for specific I/O instruction). As a result, compiler
encodes all stateless sends as coherent (doing otherwise would lead to
functional issues resulting from data corruption). Please note that it would be
possible to workaround this (e.g. based on allocations map and pointer bounds
checking prior to each I/O instruction) but the performance cost of such
workaround would be many times greater than the cost of keeping coherency
always enabled. As such, enabling/disabling memory coherency at GEN ISA level
is not feasible and alternative method is needed.

Such alternative solution is to have a global coherency switch that allows
disabling coherency for single (though entire) GPU submission. This is
beneficial because this way we:
* can enable (and pay for) coherency only in 

Re: [Intel-gfx] [PATCH v1] drm/i915: Add IOCTL Param to control data port coherency.

2018-06-21 Thread Lis, Tomasz



On 2018-06-21 09:05, Chris Wilson wrote:

Quoting Tomasz Lis (2018-06-20 16:03:07)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 33bc914..c69dc26 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -258,6 +258,57 @@ intel_lr_context_descriptor_update(struct i915_gem_context 
*ctx,
 ce->lrc_desc = desc;
  }
  
+static int emit_set_data_port_coherency(struct i915_request *req, bool enable)

+{
+   u32 *cs;
+   i915_reg_t reg;
+
+   GEM_BUG_ON(req->engine->class != RENDER_CLASS);
+   GEM_BUG_ON(INTEL_GEN(req->i915) < 9);
+
+   cs = intel_ring_begin(req, 4);
+   if (IS_ERR(cs))
+   return PTR_ERR(cs);
+
+   if (INTEL_GEN(req->i915) >= 10)
+   reg = CNL_HDC_CHICKEN0;
+   else
+   reg = HDC_CHICKEN0;
+
+   /* FIXME: this feature may be unuseable on CNL; If this checks to be
+*  true, we should enodev for CNL. */
+   *cs++ = MI_LOAD_REGISTER_IMM(1);
+   *cs++ = i915_mmio_reg_offset(reg);
+   /* Enabling coherency means disabling the bit which forces it off */
+   if (enable)
+   *cs++ = _MASKED_BIT_DISABLE(HDC_FORCE_NON_COHERENT);
+   else
+   *cs++ = _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT);
+   *cs++ = MI_NOOP;
+
+   intel_ring_advance(req, cs);
+
+   return 0;
+}

There's nothing specific to the logical ringbuffer context here afaics.
It could have just been done inside the single
i915_gem_context_set_data_port_coherency(). Also makes it clearer that
i915_gem_context_set_data_port_coherency needs struct_mutex.

cmd = HDC_FORCE_NON_COHERENT << 16;
if (!coherent)
cmd |= HDC_FORCE_NON_COHERENT;
*cs++ = cmd;

Does that read any clearer?

Sorry, I don't think I follow.
Should I move the code out of logical ringbuffer context (intel_lrc.c)?
Should I merge the emit_set_data_port_coherency() with 
intel_lr_context_modify_data_port_coherency()?

Should I lock a mutex while adding the request?
-Tomasz



diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 1593194..214e291 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -104,4 +104,8 @@ struct i915_gem_context;
  
  void intel_lr_context_resume(struct drm_i915_private *dev_priv);
  
+int

+intel_lr_context_modify_data_port_coherency(struct i915_gem_context *ctx,
+bool enable);
+
  #endif /* _INTEL_LRC_H_ */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7f5634c..fab072f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1453,6 +1453,7 @@ struct drm_i915_gem_context_param {
  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE0x4
  #define I915_CONTEXT_PARAM_BANNABLE0x5
  #define I915_CONTEXT_PARAM_PRIORITY0x6
+#define I915_CONTEXT_PARAM_COHERENCY   0x7

DATAPORT_COHERENCY
There are many different caches.

There should be some commentary around here telling userspace what the
contract is.

Will do.



  #define   I915_CONTEXT_MAX_USER_PRIORITY   1023 /* inclusive */
  #define   I915_CONTEXT_DEFAULT_PRIORITY0
  #define   I915_CONTEXT_MIN_USER_PRIORITY   -1023 /* inclusive */

COHERENCY has MAX/MIN_USER_PRIORITY, interesting. I thought it was just
a boolean.
-Chris

I did not notice the structure of defines here; will move the new define.
-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v3] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-05-22 Thread Lis, Tomasz



On 2018-05-22 16:39, Ceraolo Spurio, Daniele wrote:



On 5/21/2018 3:16 AM, Lis, Tomasz wrote:



On 2018-05-18 23:08, Daniele Ceraolo Spurio wrote:



On 11/05/18 08:45, Tomasz Lis wrote:
The patch adds support of preempt-to-idle requesting by setting a 
proper
bit within Execlist Control Register, and receiving preemption 
result from

Context Status Buffer.

Preemption in previous gens required a special batch buffer to be 
executed,
so the Command Streamer never preempted to idle directly. In 
Icelake it is

possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when 
GuC is

active.

v2: Added needs_preempt_context() change so that it is not created 
when

 preempt-to-idle is supported. (Chris)
 Updated setting HWACK flag so that it is cleared after
  preempt-to-idle. (Chris, Daniele)
 Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
 Merged preemption trigger functions to one. (Chris)
 Fixed context state to not assume COMPLETED_MASK after 
preemption,

 since idle-to-idle case will not have it set.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
  drivers/gpu/drm/i915/i915_drv.h  |   2 +
  drivers/gpu/drm/i915/i915_gem_context.c  |   5 +-
  drivers/gpu/drm/i915/i915_pci.c  |   3 +-
  drivers/gpu/drm/i915/intel_device_info.h |   1 +
  drivers/gpu/drm/i915/intel_lrc.c | 115 
++-

  drivers/gpu/drm/i915/intel_lrc.h |   1 +
  6 files changed, 92 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 57fb3aa..6e9647b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2535,6 +2535,8 @@ intel_info(const struct drm_i915_private 
*dev_priv)

  ((dev_priv)->info.has_logical_ring_elsq)
  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
  ((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+    ((dev_priv)->info.has_hw_preempt_to_idle)
    #define HAS_EXECLISTS(dev_priv) 
HAS_LOGICAL_RING_CONTEXTS(dev_priv)
  diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c

index 33f8a4b..bdac129 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -454,7 +454,10 @@ destroy_kernel_context(struct i915_gem_context 
**ctxp)

    static bool needs_preempt_context(struct drm_i915_private *i915)
  {
-    return HAS_LOGICAL_RING_PREEMPTION(i915);
+    return HAS_LOGICAL_RING_PREEMPTION(i915) &&
+   (!HAS_HW_PREEMPT_TO_IDLE(i915) ||
+    (HAS_HW_PREEMPT_TO_IDLE(i915) &&
+    !USES_GUC_SUBMISSION(i915)));


Why do we keep the preempt context for !USES_GUC_SUBMISSION(i915) 
even if HAS_HW_PREEMPT_TO_IDLE(i915)? After this patch we shouldn't 
need it anymore, right?
The patch only provides gen11 way for the non-GuC submission. This is 
why the condition is so convoluted - preempt_context is still needed 
if we use GuC.

This will be simplified after GuC patches are added.


mmm I think this check is the other way around because it returns true 
when HAS_HW_PREEMPT_TO_IDLE for !USES_GUC_SUBMISSION, so when GuC is 
not in use.

Yes, agreed. USES_GUC_SUBMISSION should not be negated.
BTW, GuC does not support using the preempt context on platforms that 
have HW supported preempt-to-idle, so there is no need to keep the 
preempt context around for GuC.
Oh, I did not know that. So the preemption is completely disabled on 
gen11 with GuC then? (because patches for gen11 preempt-to-idle are not 
upstreamed)?



  }
    int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_pci.c 
b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info 
intel_cannonlake_info = {

  GEN(11), \
  .ddb_size = 2048, \
  .has_csr = 0, \
-    .has_logical_ring_elsq = 1
+    .has_logical_ring_elsq = 1, \
+    .has_hw_preempt_to_idle = 1
    static const struct intel_device_info intel_icelake_11_info = {
  GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h

index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
  func(has_logical_ring_contexts); \
  func(has_logical_ring_elsq); \
  func(has_logical_ring_preemption); \
+    func(has_hw_preempt_to_idle); \
  func(has_overlay); \
  func(has_pooled_eu); \
  func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
b/drivers/gpu/drm/i915/intel_lrc.c

Re: [Intel-gfx] [PATCH v3] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-05-21 Thread Lis, Tomasz



On 2018-05-18 23:08, Daniele Ceraolo Spurio wrote:



On 11/05/18 08:45, Tomasz Lis wrote:

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result 
from

Context Status Buffer.

Preemption in previous gens required a special batch buffer to be 
executed,
so the Command Streamer never preempted to idle directly. In Icelake 
it is

possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

v2: Added needs_preempt_context() change so that it is not created when
 preempt-to-idle is supported. (Chris)
 Updated setting HWACK flag so that it is cleared after
  preempt-to-idle. (Chris, Daniele)
 Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)

v3: Fixed needs_preempt_context() change. (Chris)
 Merged preemption trigger functions to one. (Chris)
 Fixed context state to not assume COMPLETED_MASK after preemption,
 since idle-to-idle case will not have it set.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
  drivers/gpu/drm/i915/i915_drv.h  |   2 +
  drivers/gpu/drm/i915/i915_gem_context.c  |   5 +-
  drivers/gpu/drm/i915/i915_pci.c  |   3 +-
  drivers/gpu/drm/i915/intel_device_info.h |   1 +
  drivers/gpu/drm/i915/intel_lrc.c | 115 
++-

  drivers/gpu/drm/i915/intel_lrc.h |   1 +
  6 files changed, 92 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 57fb3aa..6e9647b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2535,6 +2535,8 @@ intel_info(const struct drm_i915_private 
*dev_priv)

  ((dev_priv)->info.has_logical_ring_elsq)
  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
  ((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+    ((dev_priv)->info.has_hw_preempt_to_idle)
    #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
  diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c

index 33f8a4b..bdac129 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -454,7 +454,10 @@ destroy_kernel_context(struct i915_gem_context 
**ctxp)

    static bool needs_preempt_context(struct drm_i915_private *i915)
  {
-    return HAS_LOGICAL_RING_PREEMPTION(i915);
+    return HAS_LOGICAL_RING_PREEMPTION(i915) &&
+   (!HAS_HW_PREEMPT_TO_IDLE(i915) ||
+    (HAS_HW_PREEMPT_TO_IDLE(i915) &&
+    !USES_GUC_SUBMISSION(i915)));


Why do we keep the preempt context for !USES_GUC_SUBMISSION(i915) even 
if HAS_HW_PREEMPT_TO_IDLE(i915)? After this patch we shouldn't need it 
anymore, right?
The patch only provides gen11 way for the non-GuC submission. This is 
why the condition is so convoluted - preempt_context is still needed if 
we use GuC.

This will be simplified after GuC patches are added.



  }
    int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_pci.c 
b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info 
intel_cannonlake_info = {

  GEN(11), \
  .ddb_size = 2048, \
  .has_csr = 0, \
-    .has_logical_ring_elsq = 1
+    .has_logical_ring_elsq = 1, \
+    .has_hw_preempt_to_idle = 1
    static const struct intel_device_info intel_icelake_11_info = {
  GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h

index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
  func(has_logical_ring_contexts); \
  func(has_logical_ring_elsq); \
  func(has_logical_ring_preemption); \
+    func(has_hw_preempt_to_idle); \
  func(has_overlay); \
  func(has_pooled_eu); \
  func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
b/drivers/gpu/drm/i915/intel_lrc.c

index 29dcf34..8fe6795 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -154,6 +154,7 @@
  #define GEN8_CTX_STATUS_ACTIVE_IDLE    (1 << 3)
  #define GEN8_CTX_STATUS_COMPLETE    (1 << 4)
  #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE    (1 << 29)
    #define GEN8_CTX_STATUS_COMPLETED_MASK \
   (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -526,31 +527,49 @@ static void port_assign(struct execlist_port 
*port, struct i915_request *rq)

  static void inject_preempt_context(struct intel_engine_cs *engine)


For gen11+ we don't

Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-04-26 Thread Lis, Tomasz



On 2018-03-30 21:45, Daniele Ceraolo Spurio wrote:



On 30/03/18 08:42, Lis, Tomasz wrote:



On 2018-03-29 00:28, Chris Wilson wrote:

Quoting Lis, Tomasz (2018-03-28 17:06:58)


On 2018-03-28 01:27, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-27 16:17:59)
The patch adds support of preempt-to-idle requesting by setting a 
proper
bit within Execlist Control Register, and receiving preemption 
result from

Context Status Buffer.

Preemption in previous gens required a special batch buffer to be 
executed,
so the Command Streamer never preempted to idle directly. In 
Icelake it is
possible, as there is a hardware mechanism to inform the kernel 
about

status of the preemption request.

This patch does not cover using the new preemption mechanism when 
GuC is

active.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
   drivers/gpu/drm/i915/i915_drv.h  |  2 ++
   drivers/gpu/drm/i915/i915_pci.c  |  3 ++-
   drivers/gpu/drm/i915/intel_device_info.h |  1 +
   drivers/gpu/drm/i915/intel_lrc.c | 45 
+++-

   drivers/gpu/drm/i915/intel_lrc.h |  1 +
   5 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 800230b..c32580b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private 
*dev_priv)

((dev_priv)->info.has_logical_ring_elsq)
   #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+ ((dev_priv)->info.has_hw_preempt_to_idle)
   #define HAS_EXECLISTS(dev_priv) 
HAS_LOGICAL_RING_CONTEXTS(dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_pci.c 
b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info 
intel_cannonlake_info = {

  GEN(11), \
  .ddb_size = 2048, \
  .has_csr = 0, \
-   .has_logical_ring_elsq = 1
+   .has_logical_ring_elsq = 1, \
+   .has_hw_preempt_to_idle = 1
   static const struct intel_device_info intel_icelake_11_info = {
  GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h

index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
  func(has_logical_ring_contexts); \
  func(has_logical_ring_elsq); \
  func(has_logical_ring_preemption); \
+   func(has_hw_preempt_to_idle); \
  func(has_overlay); \
  func(has_pooled_eu); \
  func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
b/drivers/gpu/drm/i915/intel_lrc.c

index ba7f783..1a22de4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -153,6 +153,7 @@
   #define GEN8_CTX_STATUS_ACTIVE_IDLE    (1 << 3)
   #define GEN8_CTX_STATUS_COMPLETE   (1 << 4)
   #define GEN8_CTX_STATUS_LITE_RESTORE   (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE  (1 << 29)
   #define GEN8_CTX_STATUS_COMPLETED_MASK \
   (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -183,7 +184,9 @@ static inline bool need_preempt(const struct 
intel_engine_cs *engine,

  const struct i915_request *last,
  int prio)
   {
-   return engine->i915->preempt_context && prio > 
max(rq_prio(last), 0);

+   return (engine->i915->preempt_context ||
+   HAS_HW_PREEMPT_TO_IDLE(engine->i915)) &&
Well, you haven't actually disabled allocating the preempt_context 
so...

Yes.. I had mixed feelings about changing needs_preempt_context() now,
as that would mean adding a temporary condition on GuC until the GuC
preemption is merged.
I will add the conditions and disable the allocation in v2 of the 
patch.
But at any rate, making this an engine->flag would eliminate one 
pointer

dance.

Could be an interesting idea for a separate patch.

To land first ;)

:)
Sure, I can do that.

+    prio > max(rq_prio(last), 0);
   }
   /**
@@ -535,6 +538,25 @@ static void inject_preempt_context(struct 
intel_engine_cs *engine)

execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT);
   }
+static void gen11_preempt_to_idle(struct intel_engine_cs *engine)
+{
+   struct intel_engine_execlists *execlists = 
>execlists;

+
+   GEM_TRACE("%s\n", engine->name);
+
+   /*
+    * hardware which HAS_HW_PREEMPT_TO_IDLE(), always also
+    * HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set
+    */
+   GEM_BUG_ON(execlists->ctrl_reg != NULL);
+
+   /* trigger preemption to idle */
+   w

Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-04-12 Thread Lis, Tomasz



On 2018-03-30 20:23, Daniele Ceraolo Spurio wrote:



On 27/03/18 16:27, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-27 16:17:59)
The patch adds support of preempt-to-idle requesting by setting a 
proper
bit within Execlist Control Register, and receiving preemption 
result from

Context Status Buffer.

Preemption in previous gens required a special batch buffer to be 
executed,
so the Command Streamer never preempted to idle directly. In Icelake 
it is

possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when 
GuC is

active.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
  drivers/gpu/drm/i915/i915_drv.h  |  2 ++
  drivers/gpu/drm/i915/i915_pci.c  |  3 ++-
  drivers/gpu/drm/i915/intel_device_info.h |  1 +
  drivers/gpu/drm/i915/intel_lrc.c | 45 
+++-

  drivers/gpu/drm/i915/intel_lrc.h |  1 +
  5 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 800230b..c32580b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private 
*dev_priv)

 ((dev_priv)->info.has_logical_ring_elsq)
  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+   ((dev_priv)->info.has_hw_preempt_to_idle)
    #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
  diff --git a/drivers/gpu/drm/i915/i915_pci.c 
b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info 
intel_cannonlake_info = {

 GEN(11), \
 .ddb_size = 2048, \
 .has_csr = 0, \
-   .has_logical_ring_elsq = 1
+   .has_logical_ring_elsq = 1, \
+   .has_hw_preempt_to_idle = 1
    static const struct intel_device_info intel_icelake_11_info = {
 GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h

index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
 func(has_logical_ring_contexts); \
 func(has_logical_ring_elsq); \
 func(has_logical_ring_preemption); \
+   func(has_hw_preempt_to_idle); \
 func(has_overlay); \
 func(has_pooled_eu); \
 func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
b/drivers/gpu/drm/i915/intel_lrc.c

index ba7f783..1a22de4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -153,6 +153,7 @@
  #define GEN8_CTX_STATUS_ACTIVE_IDLE    (1 << 3)
  #define GEN8_CTX_STATUS_COMPLETE   (1 << 4)
  #define GEN8_CTX_STATUS_LITE_RESTORE   (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE  (1 << 29)
    #define GEN8_CTX_STATUS_COMPLETED_MASK \
  (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -183,7 +184,9 @@ static inline bool need_preempt(const struct 
intel_engine_cs *engine,

 const struct i915_request *last,
 int prio)
  {
-   return engine->i915->preempt_context && prio > 
max(rq_prio(last), 0);

+   return (engine->i915->preempt_context ||
+   HAS_HW_PREEMPT_TO_IDLE(engine->i915)) &&


Well, you haven't actually disabled allocating the preempt_context so...

But at any rate, making this an engine->flag would eliminate one pointer
dance.



Can't we re-use I915_SCHEDULER_CAP_PREEMPTION in 
engine->i915->caps.scheduler? That btw like here to be set if 
i915->preempt_context || HAS_HW_PREEMPT_TO_IDLE(i915)
The engine->flag which Chris introduced is now used to set 
I915_SCHEDULER_CAP_PREEMPTION.



+    prio > max(rq_prio(last), 0);
  }
    /**
@@ -535,6 +538,25 @@ static void inject_preempt_context(struct 
intel_engine_cs *engine)
 execlists_set_active(>execlists, 
EXECLISTS_ACTIVE_PREEMPT);

  }
  +static void gen11_preempt_to_idle(struct intel_engine_cs *engine)
+{
+   struct intel_engine_execlists *execlists = >execlists;
+
+   GEM_TRACE("%s\n", engine->name);
+
+   /*
+    * hardware which HAS_HW_PREEMPT_TO_IDLE(), always also
+    * HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set
+    */
+   GEM_BUG_ON(execlists->ctrl_reg != NULL);


Shouldn't this check be the other way around?
Wow. I have no idea how I was able to test this patch and not trigger 
this. You are right.



+
+   /* trigger preemption to idle */
+   writel

Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-03-30 Thread Lis, Tomasz



On 2018-03-29 00:28, Chris Wilson wrote:

Quoting Lis, Tomasz (2018-03-28 17:06:58)


On 2018-03-28 01:27, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-27 16:17:59)

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result from
Context Status Buffer.

Preemption in previous gens required a special batch buffer to be executed,
so the Command Streamer never preempted to idle directly. In Icelake it is
possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
   drivers/gpu/drm/i915/i915_drv.h  |  2 ++
   drivers/gpu/drm/i915/i915_pci.c  |  3 ++-
   drivers/gpu/drm/i915/intel_device_info.h |  1 +
   drivers/gpu/drm/i915/intel_lrc.c | 45 
+++-
   drivers/gpu/drm/i915/intel_lrc.h |  1 +
   5 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 800230b..c32580b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv)
  ((dev_priv)->info.has_logical_ring_elsq)
   #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
  ((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+   ((dev_priv)->info.has_hw_preempt_to_idle)
   
   #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
   
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info 
= {
  GEN(11), \
  .ddb_size = 2048, \
  .has_csr = 0, \
-   .has_logical_ring_elsq = 1
+   .has_logical_ring_elsq = 1, \
+   .has_hw_preempt_to_idle = 1
   
   static const struct intel_device_info intel_icelake_11_info = {

  GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h
index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
  func(has_logical_ring_contexts); \
  func(has_logical_ring_elsq); \
  func(has_logical_ring_preemption); \
+   func(has_hw_preempt_to_idle); \
  func(has_overlay); \
  func(has_pooled_eu); \
  func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ba7f783..1a22de4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -153,6 +153,7 @@
   #define GEN8_CTX_STATUS_ACTIVE_IDLE(1 << 3)
   #define GEN8_CTX_STATUS_COMPLETE   (1 << 4)
   #define GEN8_CTX_STATUS_LITE_RESTORE   (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE  (1 << 29)
   
   #define GEN8_CTX_STATUS_COMPLETED_MASK \

   (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -183,7 +184,9 @@ static inline bool need_preempt(const struct 
intel_engine_cs *engine,
  const struct i915_request *last,
  int prio)
   {
-   return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+   return (engine->i915->preempt_context ||
+   HAS_HW_PREEMPT_TO_IDLE(engine->i915)) &&

Well, you haven't actually disabled allocating the preempt_context so...

Yes.. I had mixed feelings about changing needs_preempt_context() now,
as that would mean adding a temporary condition on GuC until the GuC
preemption is merged.
I will add the conditions and disable the allocation in v2 of the patch.

But at any rate, making this an engine->flag would eliminate one pointer
dance.

Could be an interesting idea for a separate patch.

To land first ;)

:)
Sure, I can do that.

+prio > max(rq_prio(last), 0);
   }
   
   /**

@@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs 
*engine)
  execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT);
   }
   
+static void gen11_preempt_to_idle(struct intel_engine_cs *engine)

+{
+   struct intel_engine_execlists *execlists = >execlists;
+
+   GEM_TRACE("%s\n", engine->name);
+
+   /*
+* hardware which HAS_HW_PREEMPT_TO_IDLE(), always also
+* HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set
+*/
+   GEM_BUG_ON(execlists->ctrl_reg != NULL);
+
+   /* trigger preemption to idle */
+   writel(EL_CTRL_PR

Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.

2018-03-28 Thread Lis, Tomasz



On 2018-03-28 01:27, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-27 16:17:59)

The patch adds support of preempt-to-idle requesting by setting a proper
bit within Execlist Control Register, and receiving preemption result from
Context Status Buffer.

Preemption in previous gens required a special batch buffer to be executed,
so the Command Streamer never preempted to idle directly. In Icelake it is
possible, as there is a hardware mechanism to inform the kernel about
status of the preemption request.

This patch does not cover using the new preemption mechanism when GuC is
active.

Bspec: 18922
Signed-off-by: Tomasz Lis <tomasz@intel.com>
---
  drivers/gpu/drm/i915/i915_drv.h  |  2 ++
  drivers/gpu/drm/i915/i915_pci.c  |  3 ++-
  drivers/gpu/drm/i915/intel_device_info.h |  1 +
  drivers/gpu/drm/i915/intel_lrc.c | 45 +++-
  drivers/gpu/drm/i915/intel_lrc.h |  1 +
  5 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 800230b..c32580b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv)
 ((dev_priv)->info.has_logical_ring_elsq)
  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
 ((dev_priv)->info.has_logical_ring_preemption)
+#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
+   ((dev_priv)->info.has_hw_preempt_to_idle)
  
  #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
  
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c

index 4364922..66b6700 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info 
= {
 GEN(11), \
 .ddb_size = 2048, \
 .has_csr = 0, \
-   .has_logical_ring_elsq = 1
+   .has_logical_ring_elsq = 1, \
+   .has_hw_preempt_to_idle = 1
  
  static const struct intel_device_info intel_icelake_11_info = {

 GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h
index 933e316..4eb97b5 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -98,6 +98,7 @@ enum intel_platform {
 func(has_logical_ring_contexts); \
 func(has_logical_ring_elsq); \
 func(has_logical_ring_preemption); \
+   func(has_hw_preempt_to_idle); \
 func(has_overlay); \
 func(has_pooled_eu); \
 func(has_psr); \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ba7f783..1a22de4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -153,6 +153,7 @@
  #define GEN8_CTX_STATUS_ACTIVE_IDLE(1 << 3)
  #define GEN8_CTX_STATUS_COMPLETE   (1 << 4)
  #define GEN8_CTX_STATUS_LITE_RESTORE   (1 << 15)
+#define GEN11_CTX_STATUS_PREEMPT_IDLE  (1 << 29)
  
  #define GEN8_CTX_STATUS_COMPLETED_MASK \

  (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
@@ -183,7 +184,9 @@ static inline bool need_preempt(const struct 
intel_engine_cs *engine,
 const struct i915_request *last,
 int prio)
  {
-   return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+   return (engine->i915->preempt_context ||
+   HAS_HW_PREEMPT_TO_IDLE(engine->i915)) &&

Well, you haven't actually disabled allocating the preempt_context so...
Yes.. I had mixed feelings about changing needs_preempt_context() now, 
as that would mean adding a temporary condition on GuC until the GuC 
preemption is merged.

I will add the conditions and disable the allocation in v2 of the patch.

But at any rate, making this an engine->flag would eliminate one pointer
dance.

Could be an interesting idea for a separate patch.



+prio > max(rq_prio(last), 0);
  }
  
  /**

@@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs 
*engine)
 execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT);
  }
  
+static void gen11_preempt_to_idle(struct intel_engine_cs *engine)

+{
+   struct intel_engine_execlists *execlists = >execlists;
+
+   GEM_TRACE("%s\n", engine->name);
+
+   /*
+* hardware which HAS_HW_PREEMPT_TO_IDLE(), always also
+* HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set
+*/
+   GEM_BUG_ON(execlists->ctrl_reg != NULL);
+
+   /* trigger preemption to idle */
+   writel(EL_CTRL_PREEMPT_TO_IDLE, execlists->ctrl_reg);

Future plans? Because just inserting the branch into the setter of
inject_preempt_context() resolves a lot of conflicts with other work.

M

Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.

2018-03-27 Thread Lis, Tomasz



On 2018-03-21 20:42, Oscar Mateo wrote:



On 3/21/2018 3:16 AM, Chris Wilson wrote:

Quoting Oscar Mateo (2018-03-20 18:43:45)


On 3/19/2018 7:14 AM, Lis, Tomasz wrote:


On 2018-03-19 13:43, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-19 12:37:35)

The patch adds a parameter to control the data port coherency
functionality
on a per-exec call basis. When data port coherency flag value is
different
than what it was in previous call for the context, a command to
switch data
port coherency state is added before the buffer to be executed.

So this is part of the context? Why do it at exec level?

It is part of the context, stored within HDC chicken bit register.
The exec level was requested by the OCL team, due to concerns about
performance cost of context setparam calls.


   If exec level
is desired, why not whitelist it?
-Chris

If we have no issue in whitelisting the register, I'm sure OCL will
agree to that.
I assumed the whitelisting will be unacceptable because of security
concerns with some options.
The register also changes its position and content between gens, which
makes whitelisting hard to manage.


I think a security analysis of this register was already done, and the
result was that it contains some other bits that could be dangerous. In
CNL those bits were moved out of the way and the HDC_CHICKEN0 register
can be whitelisted (WaAllowUMDToControlCoherency). In ICL the register
should already be non-privileged.

The previous alternative to whitelisting was running through a command
parser for validation. That's a very general mechanism suitable for a
wide range of sins.
-Chris


Are you suggesting that we enable the cmd parser for every Gen < CNL 
for this particular usage only? :P


It is a solution that would allow to do what we want without any 
additions to module interface.
It may be worth considering if we think the coherency setting will be 
temporary and removed in future gens, as we wouldn't want to have 
obsolete flags.


I think the setting will stay with us, as it is needed to support 
CL_MEM_SVM_FINE_GRAIN_BUFFER flag from OpenCL spec.
Keeping coherency will always cost performance, so we will likely always 
have a hardware setting to switch it.
But the bspec says coherency override control will be removed in future 
projects...



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.

2018-03-20 Thread Lis, Tomasz



On 2018-03-19 15:26, Chris Wilson wrote:

Quoting Lis, Tomasz (2018-03-19 14:14:19)


On 2018-03-19 13:43, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-19 12:37:35)

The patch adds a parameter to control the data port coherency functionality
on a per-exec call basis. When data port coherency flag value is different
than what it was in previous call for the context, a command to switch data
port coherency state is added before the buffer to be executed.

So this is part of the context? Why do it at exec level?

It is part of the context, stored within HDC chicken bit register.
The exec level was requested by the OCL team, due to concerns about
performance cost of context setparam calls.

What? Oh dear, oh dear, thrice oh dear. The context setparam would look
like:

if (arg != context->value) {
rq = request_alloc(context, RCS);
cs = ring_begin(rq, 4);
cs++ = MI_LRI;
cs++ = reg;
cs++ = magic;
cs++ = MI_NOOP;
request_add(rq);
context->value = arg
}

The argument is whether stuffing it into a crowded, v.frequently
executed execbuf is better than an irregular setparam. If they want to
flip it on every batch, use execbuf. If it's going to be very
infrequent, setparam.
Implementing the data port coherency switch as context setparam would 
not be a problem, I agree.
But this is not a solution OCL is willing to accept. Any additional 
IOCTL call is a concern for the OCL developers.


For more explanation on switch frequency - please look at the cover 
letter I provided; here's the related part of it:

(note: the data port coherency is called fine grain coherency within UMD)

   3. Will coherency switch be used frequently?

   There are scenarios that will require frequent toggling of the coherency
   switch.
   E.g. an application has two OCL compute kernels: kern_master and kern_worker.
   kern_master uses, concurrently with CPU, some fine grain SVM resources
   (CL_MEM_SVM_FINE_GRAIN_BUFFER). These resources contain descriptors of
   computational work that needs to be executed. kern_master analyzes incoming
   work descriptors and populates a plain OCL buffer (non-fine-grain) with 
payload
   for kern_worker. Once kern_master is done, kern_worker kicks-in and processes
   the payload that kern_master produced. These two kernels work in a loop, one
   after another. Since only kern_master requires coherency, kern_worker should
   not be forced to pay for it. This means that we need to have the ability to
   toggle coherency switch on or off per each GPU submission:
   (ENABLE COHERENCY) kern_master -> (DISABLE COHERENCY)kern_worker -> (ENABLE
   COHERENCY) kern_master -> (DISABLE COHERENCY)kern_worker -> ...


That discussion must be part of the rationale in the commitlog.

Will add.
Should I place the whole text from cover letter within the commit comment?

Otoh, execbuf3 would accept it as a command packet. Hmm.
I know we have execbuf2, but execbuf3? Are you proposing to add 
something like that?

   If exec level
is desired, why not whitelist it?

If we have no issue in whitelisting the register, I'm sure OCL will
agree to that.
I assumed the whitelisting will be unacceptable because of security
concerns with some options.
The register also changes its position and content between gens, which
makes whitelisting hard to manage.

Main purpose of chicken bit registers, in general, is to allow work
around for hardware features which could  be buggy or could have
unintended influence on the platform.
The data port coherency functionality landed there for the same reasons;
then it twisted itself in a way that we now need user space to switch it.
Is it really ok to whitelist chicken bit registers?

It all depends on whether it breaks segregation. If the only users
affected are themselves, fine. Otherwise, no.
-Chris
Chicken Bit registers are definitely not planned as safe for use. While 
meaning of bits within HDC_CHICKEN0 change between gens, I doubt any of 
the registers *can't* be used to cause GPU hung.

-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [RFC v1] Data port coherency control for UMDs.

2018-03-19 Thread Lis, Tomasz



On 2018-03-19 14:53, Joonas Lahtinen wrote:

+ Dave, as FYI

Quoting Tomasz Lis (2018-03-19 14:37:34)

The OpenCL driver develpers requested a functionality to control cache
coherency at data port level. Keeping the coherency at that level is disabled
by default due to its performance costs. OpenCL driver is planning to
enable it for a small subset of submissions, when such functionality is
required.

Can you please link to the corresponding OpenCL driver changes? I'm
assuming this relates to the new-driver-to-be-adopted, instead of
Beignet?

It is for the new driver; I will ask the OCL developers to provide a link.


How is the story/schedule looking for adopting the new driver to
distros?

I guess that's another question for OCL guys, I don't know.

Seeing the userspace counterpart and tests will help in assessing the
suggested changes.

Regards, Joonas
I prepared an IGT test for that, I will send it to a proper mailing list 
soon.

-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.

2018-03-19 Thread Lis, Tomasz



On 2018-03-19 13:43, Chris Wilson wrote:

Quoting Tomasz Lis (2018-03-19 12:37:35)

The patch adds a parameter to control the data port coherency functionality
on a per-exec call basis. When data port coherency flag value is different
than what it was in previous call for the context, a command to switch data
port coherency state is added before the buffer to be executed.

So this is part of the context? Why do it at exec level?


It is part of the context, stored within HDC chicken bit register.
The exec level was requested by the OCL team, due to concerns about 
performance cost of context setparam calls.



  If exec level
is desired, why not whitelist it?
-Chris


If we have no issue in whitelisting the register, I'm sure OCL will 
agree to that.
I assumed the whitelisting will be unacceptable because of security 
concerns with some options.
The register also changes its position and content between gens, which 
makes whitelisting hard to manage.


Main purpose of chicken bit registers, in general, is to allow work 
around for hardware features which could  be buggy or could have 
unintended influence on the platform.
The data port coherency functionality landed there for the same reasons; 
then it twisted itself in a way that we now need user space to switch it.

Is it really ok to whitelist chicken bit registers?
-Tomasz

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2] drm/i915: Move the scheduler feature bits into the purview of the engines

2018-02-02 Thread Lis, Tomasz
So the functional purpose of this patch is to provide capabilities 
(including preemption status) within error information. I agree this is 
required.



On 2018-02-01 20:02, Chris Wilson wrote:

Rather than having the high level ioctl interface guess the underlying
implementation details, having the implementation declare what
capabilities it exports. We define an intel_driver_caps, similar to the
intel_device_info, which instead of trying to describe the HW gives
details on what the driver itself supports. This is then populated by
the engine backend for the new scheduler capability field for use
elsewhere.

v2: Use caps.scheduler for validating CONTEXT_PARAM_SET_PRIORITY (Mika)
 One less assumption of engine[RCS] \o/

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Tomasz Lis <tomasz@intel.com>
Cc: Joonas Lahtinen <joonas.lahti...@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
Cc: Michal Wajdeczko <michal.wajdec...@intel.com>
Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>

Reviewed-by: Tomasz Lis <tomasz@intel.com>

---
  drivers/gpu/drm/i915/i915_debugfs.c  | 1 +
  drivers/gpu/drm/i915/i915_drv.c  | 8 +---
  drivers/gpu/drm/i915/i915_drv.h  | 2 ++
  drivers/gpu/drm/i915/i915_gem.c  | 3 +++
  drivers/gpu/drm/i915/i915_gem_context.c  | 2 +-
  drivers/gpu/drm/i915/i915_gpu_error.c| 7 +--
  drivers/gpu/drm/i915/intel_device_info.c | 6 ++
  drivers/gpu/drm/i915/intel_device_info.h | 7 +++
  drivers/gpu/drm/i915/intel_lrc.c | 6 ++
  9 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 9e44adef30f0..2bdce9fea671 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -49,6 +49,7 @@ static int i915_capabilities(struct seq_file *m, void *data)
  
  	intel_device_info_dump_flags(info, );

intel_device_info_dump_runtime(info, );
+   intel_driver_caps_print(_priv->caps, );
  
  	kernel_param_lock(THIS_MODULE);

i915_params_dump(_modparams, );
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1ec12add34b2..733f71637914 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -372,13 +372,7 @@ static int i915_getparam(struct drm_device *dev, void 
*data,
value = i915_gem_mmap_gtt_version();
break;
case I915_PARAM_HAS_SCHEDULER:
-   value = 0;
-   if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule) {
-   value |= I915_SCHEDULER_CAP_ENABLED;
-   value |= I915_SCHEDULER_CAP_PRIORITY;
-   if (HAS_LOGICAL_RING_PREEMPTION(dev_priv))
-   value |= I915_SCHEDULER_CAP_PREEMPTION;
-   }
+   value = dev_priv->caps.scheduler;
break;
  
  	case I915_PARAM_MMAP_VERSION:

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c676269ed843..24333042e1e9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -468,6 +468,7 @@ struct i915_gpu_state {
u32 reset_count;
u32 suspend_count;
struct intel_device_info device_info;
+   struct intel_driver_caps driver_caps;
struct i915_params params;
  
  	struct i915_error_uc {

@@ -1809,6 +1810,7 @@ struct drm_i915_private {
struct kmem_cache *priorities;
  
  	const struct intel_device_info info;

+   struct intel_driver_caps caps;
  
  	/**

 * Data Stolen Memory - aka "i915 stolen memory" gives us the start and
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c049496e8757..8d732f97e23e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3222,8 +3222,11 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 * start to complete all requests.
 */
engine->submit_request = nop_complete_submit_request;
+   engine->schedule = NULL;
}
  
+	i915->caps.scheduler = 0;

+
/*
 * Make sure no request can slip through without getting completed by
 * either this call here to intel_engine_init_global_seqno, or the one
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index 648e7536ff51..8337d15bb0e5 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -807,7 +807,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, 
void *data,
  
  			if (args->size)

ret = -EINVAL;
-   else if (!to_i915(dev)->engine[RCS]->schedule)
+   else if (!(to_i915(dev)-&

Re: [Intel-gfx] [PATCH] drm/i915: Only allocate preempt context when required

2018-01-30 Thread Lis, Tomasz



On 2018-01-27 21:28, Chris Wilson wrote:

If we remove some hardcoded assumptions about the preempt context having
a fixed id, reserved from use by normal user contexts, we may only
allocate the i915_gem_context when required. Then the subsequent
decisions on using preemption reduce to having the preempt context
available.

Signed-off-by: Chris Wilson 
Cc: Michal Winiarski 
Cc: Tvrtko Ursulin 
Cc: Arkadiusz Hiler 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 
---
  drivers/gpu/drm/i915/i915_drv.c  |  2 +-
  drivers/gpu/drm/i915/i915_gem_context.c  | 24 +---
  drivers/gpu/drm/i915/intel_engine_cs.c   |  6 +++---
  drivers/gpu/drm/i915/intel_guc_submission.c  | 24 +---
  drivers/gpu/drm/i915/intel_lrc.c | 15 ++-
  drivers/gpu/drm/i915/intel_ringbuffer.h  |  5 +
  drivers/gpu/drm/i915/selftests/mock_gem_device.c |  6 --
  7 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1ec12add34b2..a7fc87e87fcf 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -376,7 +376,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule) {
value |= I915_SCHEDULER_CAP_ENABLED;
value |= I915_SCHEDULER_CAP_PRIORITY;
-   if (HAS_LOGICAL_RING_PREEMPTION(dev_priv))
+   if (dev_priv->preempt_context)
value |= I915_SCHEDULER_CAP_PREEMPTION;
}
break;
We cannot treat the absence of a preempt_context as equivalent to
preemption being disabled. The preempt_context will not be required on
platforms which support preempting directly to idle.
There is no need to send anything to the ring for these platforms, and
the information about preemption finish is sent back to us via interrupt
(or GuC message in case it's enabled).
Checking whether preempt_context is null before it is to be accessed is 
a good idea, but for checking for feature availability, 
HAS_LOGICAL_RING_PREEMPTION() will be needed.
The inject_preempt_context() function will definitely not be called 
when preempting to idle is available, so any changes there do not 
collide with the new functionality.

-Tomasz

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index 648e7536ff51..ebb1d01b4f4e 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -452,7 +452,6 @@ destroy_kernel_context(struct i915_gem_context **ctxp)
  int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
  {
struct i915_gem_context *ctx;
-   int err;
  
  	GEM_BUG_ON(dev_priv->kernel_context);
  
@@ -468,8 +467,7 @@ int i915_gem_contexts_init(struct drm_i915_private *dev_priv)

ctx = i915_gem_context_create_kernel(dev_priv, I915_PRIORITY_MIN);
if (IS_ERR(ctx)) {
DRM_ERROR("Failed to create default global context\n");
-   err = PTR_ERR(ctx);
-   goto err;
+   return PTR_ERR(ctx);
}
/*
 * For easy recognisablity, we want the kernel context to be 0 and then
@@ -479,23 +477,18 @@ int i915_gem_contexts_init(struct drm_i915_private 
*dev_priv)
dev_priv->kernel_context = ctx;
  
  	/* highest priority; preempting task */

-   ctx = i915_gem_context_create_kernel(dev_priv, INT_MAX);
-   if (IS_ERR(ctx)) {
-   DRM_ERROR("Failed to create default preempt context\n");
-   err = PTR_ERR(ctx);
-   goto err_kernel_context;
+   if (HAS_LOGICAL_RING_PREEMPTION(dev_priv)) {
+   ctx = i915_gem_context_create_kernel(dev_priv, INT_MAX);
+   if (!IS_ERR(ctx))
+   dev_priv->preempt_context = ctx;
+   else
+   DRM_ERROR("Failed to create preempt context; disabling 
preemption\n");
}
-   dev_priv->preempt_context = ctx;
  
  	DRM_DEBUG_DRIVER("%s context support initialized\n",

 dev_priv->engine[RCS]->context_size ? "logical" :
 "fake");
return 0;
-
-err_kernel_context:
-   destroy_kernel_context(&dev_priv->kernel_context);
-err:
-   return err;
  }
  
  void i915_gem_contexts_lost(struct drm_i915_private *dev_priv)

@@ -521,7 +514,8 @@ void i915_gem_contexts_fini(struct drm_i915_private *i915)
  {
lockdep_assert_held(&i915->drm.struct_mutex);
  
-	destroy_kernel_context(&i915->preempt_context);

+   if (i915->preempt_context)
+   destroy_kernel_context(&i915->preempt_context);