Re: [Intel-gfx] [PATCH v3 2/2] drm/i915/tgl: MOCS table update
On 2019-11-13 02:09, Lucas De Marchi wrote: On Tue, Nov 12, 2019 at 02:47:57PM -0800, Matt Roper wrote: The bspec was just updated with a minor correction to entry 61 (it shouldn't have had the SCF bit set). v2: - Add a MOCS_ENTRY_UNUSED() and use it to declare the explicitly-reserved MOCS entries. (Lucas) - Move the warning suppression from the Makefile to a #pragma that only affects the TGL table. (Lucas) v3: - Entries 16 and 17 are identical to ICL now, so no need to explicitly adjust them (or mess with compiler warning overrides). Bspec: 45101 Fixes: 2ddf992179c4 ("drm/i915/tgl: Define MOCS entries for Tigerlake") Cc: Tomasz Lis Cc: Lucas De Marchi Cc: Francisco Jerez Cc: Jon Bloomfield Signed-off-by: Matt Roper Reviewed-by: Lucas De Marchi Lucas De Marchi Reviewed-by: Tomasz Lis Tomasz --- drivers/gpu/drm/i915/gt/intel_mocs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 06e2adbf27be..2b977991b785 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -263,7 +263,7 @@ static const struct drm_i915_mocs_entry tigerlake_mocs_table[] = { L3_1_UC), /* HW Special Case (Displayable) */ MOCS_ENTRY(61, - LE_1_UC | LE_TC_1_LLC | LE_SCF(1), + LE_1_UC | LE_TC_1_LLC, L3_3_WB), }; -- 2.21.0 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 3/4] drm/i915/tgl: Tigerlake only has global MOCS registers
On 2019-07-26 02:12, Lucas De Marchi wrote: From: Michel Thierry Until Icelake, each engine had its own set of 64 MOCS registers. In order to simplify, Tigerlake moves to only 64 Global MOCS registers, which are no longer part of the engine context. Since these registers are now global, they also only need to be initialized once. From Gen12 onwards, MOCS must specify the target cache (3:2) and LRU management (5:4) fields and cannot be programmed to 'use the value from Private PAT', because these fields are no longer part of the PPAT. Also cacheability control (1:0) field has changed, 00 no longer means 'use controls from page table', but uncacheable (UC). v2: Move the changes to the fault registers to a separate commit - the old ones overlap with the range used by the new global MOCS (requested by Daniele) Cc: Daniele Ceraolo Spurio Cc: Tomasz Lis Signed-off-by: Michel Thierry Signed-off-by: Tvrtko Ursulin Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/i915/gt/intel_mocs.c | 47 drivers/gpu/drm/i915/gt/intel_mocs.h | 1 + drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem.c | 1 + drivers/gpu/drm/i915/i915_gpu_error.c| 6 ++- drivers/gpu/drm/i915/i915_pci.c | 3 +- drivers/gpu/drm/i915/i915_reg.h | 2 + drivers/gpu/drm/i915/intel_device_info.h | 1 + 8 files changed, 60 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index ca370c7487f9..9399c0ec08f1 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -377,6 +377,10 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) unsigned int index; u32 unused_value; + /* Platforms with global MOCS do not need per-engine initialization. */ + if (HAS_GLOBAL_MOCS_REGISTERS(gt->i915)) + return; + /* Called under a blanket forcewake */ assert_forcewakes_active(uncore, FORCEWAKE_ALL); @@ -401,6 +405,46 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) unused_value); } +/** + * intel_mocs_init_global() - program the global mocs registers + * gt: pointer to struct intel_gt + * + * This function initializes the MOCS global registers. + */ +void intel_mocs_init_global(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + struct drm_i915_mocs_table table; + unsigned int index; + + if (!HAS_GLOBAL_MOCS_REGISTERS(gt->i915)) + return; + + if (!get_mocs_settings(gt, )) + return; + + if (GEM_DEBUG_WARN_ON(table.size > table.n_entries)) + return; + + for (index = 0; index < table.size; index++) + intel_uncore_write(uncore, + GEN12_GLOBAL_MOCS(index), + table.table[index].control_value); + + /* +* Ok, now set the unused entries to uncached. These entries +* are officially undefined and no contract for the contents +* and settings is given for these entries. +* +* Entry 0 in the table is uncached - so we are just writing +* that value to all the used entries. +*/ + for (; index < table.n_entries; index++) + intel_uncore_write(uncore, + GEN12_GLOBAL_MOCS(index), + table.table[0].control_value); While get_mocs_settings() can return a table with less than 64 entries, it will never be the case for platforms supporting global MOCS. So this for() is actually a dead code.. but removing it could cause harm in case this is forgotten and modifications are made, so I'd leave it as is. R-b: Tomasz Lis -Tomasz +} + /** * emit_mocs_control_table() - emit the mocs control table * @rq: Request to set up the MOCS table for. @@ -604,6 +648,9 @@ int intel_rcs_context_init_mocs(struct i915_request *rq) struct drm_i915_mocs_table t; int ret; + if (HAS_GLOBAL_MOCS_REGISTERS(rq->i915)) + return 0; + if (get_mocs_settings(rq->engine->gt, )) { /* Program the RCS control registers */ ret = emit_mocs_control_table(rq, ); diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.h b/drivers/gpu/drm/i915/gt/intel_mocs.h index 8b9813e6f9ac..aa3a2df07c82 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.h +++ b/drivers/gpu/drm/i915/gt/intel_mocs.h @@ -56,6 +56,7 @@ struct intel_gt; int intel_rcs_context_init_mocs(struct i915_request *rq); void intel_mocs_init_l3cc_table(struct intel_gt *gt); +void intel_mocs_init_global(struct intel_gt *gt); void intel_mocs_init_engine(struct intel_engine_cs *engine); #endif diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index
Re: [Intel-gfx] [PATCH 18/22] drm/i915/tgl: Define MOCS entries for Tigerlake
On 2019-07-25 00:32, Lucas De Marchi wrote: On Thu, Jul 18, 2019 at 10:09:27AM -0700, Daniele Ceraolo Spurio wrote: On 7/18/19 6:08 AM, Ville Syrjälä wrote: On Fri, Jul 12, 2019 at 06:09:36PM -0700, Lucas De Marchi wrote: From: Tomasz Lis The MOCS table is published as part of bspec, and versioned. Entries are supposed to never be modified, but new ones can be added. Adding entries increases table version. The patch includes version 1 entries. Two of the 3 legacy entries used for gen9 are no longer expected to work. Although we are changing the gen11 table, those changes are supposed to be backward compatible since we are only touching previously undefined entries. Cc: Joonas Lahtinen Cc: Mika Kuoppala Cc: Daniele Ceraolo Spurio Signed-off-by: Tomasz Lis Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/i915/gt/intel_mocs.c | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 290a5e9b90b9..259e7bec0a63 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -62,6 +62,10 @@ struct drm_i915_mocs_table { #define GEN11_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ /* (e)LLC caching options */ +/* + * Note: LE_0_PAGETABLE works only up to Gen11; for newer gens it means + * the same as LE_UC + */ #define LE_0_PAGETABLE _LE_CACHEABILITY(0) #define LE_1_UC _LE_CACHEABILITY(1) #define LE_2_WT _LE_CACHEABILITY(2) @@ -100,8 +104,9 @@ struct drm_i915_mocs_table { * of bspec. * * Entries not part of the following tables are undefined as far as - * userspace is concerned and shouldn't be relied upon. For the time - * being they will be initialized to PTE. + * userspace is concerned and shouldn't be relied upon. For Gen < 12 + * they will be initialized to PTE. Gen >= 12 onwards don't have a setting for + * PTE. We use the same value, but that actually means Uncached. * * The last two entries are reserved by the hardware. For ICL+ they * should be initialized according to bspec and never used, for older @@ -137,11 +142,13 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { }; #define GEN11_MOCS_ENTRIES \ - /* Base - Uncached (Deprecated) */ \ + /* Gen11: Base - Uncached (Deprecated) */ \ + /* Gen12+: Base - Error (Reserved for Non-Use) */ \ MOCS_ENTRY(I915_MOCS_UNCACHED, \ LE_1_UC | LE_TC_1_LLC, \ L3_1_UC), \ /* Base - L3 + LeCC:PAT (Deprecated) */ \ + /* Gen12+: Base - Reserved */ \ MOCS_ENTRY(I915_MOCS_PTE, \ LE_0_PAGETABLE | LE_TC_1_LLC, \ L3_3_WB), \ @@ -233,6 +240,18 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { MOCS_ENTRY(23, \ LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(7), \ L3_3_WB), \ + /* Gen12+: HW Reserved - HDC:L1 + L3 + LLC */ \ Why is this marked as reserved? From the looks of things 48-61 should just be normal entries that userspace can select to get HDC L1$. And looks like icl already has that stuff. So someone should probably figure out if Mesa/etc. can make use of the HDC L1$, and if so we should add the relevant MOCS entries for icl as well. Here the reserved terminology is indeed misleading. The 48-59 range is a "special" range where L1 usage is implicitly enabled by the HW, as there is no explicit L1 toggle in the MOCS registers. The reserved here means that the range shouldn't be used for "normal" MOCS settings, but SW can freely use these entries as needed. Similarly, MOCS 60 and 61 are reserved for other special purposes, but are still usable by SW. The only entries SW shouldn't touch are 62 and 63. Regarding ICL, Gen11 HW doesn't have the capability so no new entries are required there. It might be a good idea to find a better word for it than "reserved". Not only for this patch, but to be used everywhere, especially the spec. But that exceeds the scope of this review. + MOCS_ENTRY(48, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_3_WB), \ + /* Gen12+: HW Reserved - HW Special Case (CCS) */ \ The specs have MOCS 49-51 defined as well. humn... it seems they got added later. I'm not sure anymore if we should update igt so it doesn't expect those entries to be set to PTE or if we should stop reusing the same table for ICL and TGL. Spec doesn't mention the compatibility of this table with gen 11 anymore. Thoughts? It doesn't sound right to change the implementation decision to keep them together, only because this allows to keep tests intact. I don't believe there's a reason to verify undefined entries in IGT. Sometimes we do want to verify our specific implementation instead of pure specs compliance; but I don't see a reason for this should be the case here. The existing MOCS entries are supposed to be unchangeable (in
Re: [Intel-gfx] [PATCH v2 11/22] drm/i915/guc: Reset GuC ADS during sanitize
Only comment issues. Besides these: Reviewed-by: Tomasz Lis On 2019-04-11 10:44, Michal Wajdeczko wrote: GuC stores some data in there, which might be stale after a reset. Reinitialize whole ADS in case any part of it was corrupted during previous GuC run. Signed-off-by: Michal Wajdeczko Cc: Daniele Ceraolo Spurio Cc: MichaĹ Winiarski Cc: Tomasz Lis --- drivers/gpu/drm/i915/intel_guc.h | 2 + drivers/gpu/drm/i915/intel_guc_ads.c | 85 ++-- drivers/gpu/drm/i915/intel_guc_ads.h | 1 + 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index 2c59ff8d9f39..4f3cf8eddfe6 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -26,6 +26,7 @@ #define _INTEL_GUC_H_ #include "intel_uncore.h" +#include "intel_guc_ads.h" #include "intel_guc_fw.h" #include "intel_guc_fwif.h" #include "intel_guc_ct.h" @@ -177,6 +178,7 @@ u32 intel_guc_reserved_gtt_size(struct intel_guc *guc); static inline int intel_guc_sanitize(struct intel_guc *guc) { intel_uc_fw_sanitize(>fw); + intel_guc_ads_reset(guc); return 0; } diff --git a/drivers/gpu/drm/i915/intel_guc_ads.c b/drivers/gpu/drm/i915/intel_guc_ads.c index abab5cb6909a..97926effb944 100644 --- a/drivers/gpu/drm/i915/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/intel_guc_ads.c @@ -72,43 +72,28 @@ static void guc_ct_pool_entries_init(struct guc_ct_pool_entry *pool, u32 num) */ #define LR_HW_CONTEXT_SIZE(80 * sizeof(u32)) -/** - * intel_guc_ads_create() - creates GuC ADS - * @guc: intel_guc struct - * - */ -int intel_guc_ads_create(struct intel_guc *guc) +/* The ads obj includes the struct itself and buffers passed to GuC */ +struct __guc_ads_blob { + struct guc_ads ads; + struct guc_policies policies; + struct guc_mmio_reg_state reg_state; + struct guc_gt_system_info system_info; + struct guc_clients_info clients_info; + struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE]; + u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE]; +} __packed; + +static int __guc_ads_reinit(struct intel_guc *guc) { struct drm_i915_private *dev_priv = guc_to_i915(guc); - struct i915_vma *vma; - /* The ads obj includes the struct itself and buffers passed to GuC */ - struct { - struct guc_ads ads; - struct guc_policies policies; - struct guc_mmio_reg_state reg_state; - struct guc_gt_system_info system_info; - struct guc_clients_info clients_info; - struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE]; - u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE]; - } __packed *blob; + struct __guc_ads_blob *blob; const u32 skipped_size = LRC_PPHWSP_SZ * PAGE_SIZE + LR_HW_CONTEXT_SIZE; u32 base; u8 engine_class; - int ret; - - GEM_BUG_ON(guc->ads_vma); - - vma = intel_guc_allocate_vma(guc, PAGE_ALIGN(sizeof(*blob))); - if (IS_ERR(vma)) - return PTR_ERR(vma); - - guc->ads_vma = vma; blob = i915_gem_object_pin_map(guc->ads_vma->obj, I915_MAP_WB); - if (IS_ERR(blob)) { - ret = PTR_ERR(blob); - goto err_vma; - } + if (IS_ERR(blob)) + return PTR_ERR(blob); /* GuC scheduling policies */ guc_policies_init(>policies); @@ -142,7 +127,7 @@ int intel_guc_ads_create(struct intel_guc *guc) blob->system_info.vebox_enable_mask = VEBOX_MASK(dev_priv); blob->system_info.vdbox_sfc_support_mask = RUNTIME_INFO(dev_priv)->vdbox_sfc_access; - base = intel_guc_ggtt_offset(guc, vma); + base = intel_guc_ggtt_offset(guc, guc->ads_vma); /* Clients info */ guc_ct_pool_entries_init(blob->ct_pool, ARRAY_SIZE(blob->ct_pool)); @@ -161,6 +146,32 @@ int intel_guc_ads_create(struct intel_guc *guc) i915_gem_object_unpin_map(guc->ads_vma->obj); return 0; +} + +/** + * intel_guc_ads_create() - creates GuC ADS + * @guc: intel_guc struct Are we really ok with documentation which just reads names with spaces instead of underscores? I believe the description should go deeper, or at least use different words to describe the thing. ie. here: intel_guc_ads_create() - allocates and initializes GuC Additional Data Struct @guc: instance of intel_guc which will own the ADS + * + */ +int intel_guc_ads_create(struct intel_guc *guc) +{ + const u32 size = PAGE_ALIGN(sizeof(struct __guc_ads_blob)); + struct i915_vma *vma; + int ret; + + GEM_BUG_ON(guc->ads_vma); + + vma = intel_guc_allocate_vma(guc, size); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + guc->ads_vma = vma; + + ret = __guc_ads_reinit(guc); + if (ret) + goto err_vma; + + return 0; err_vma:
Re: [Intel-gfx] [PATCH v8 7/7] drm/i915/icl: Define MOCS table for Icelake
On 2019-01-22 06:12, Lucas De Marchi wrote: From: Tomasz Lis The table has been unified across OSes to minimize virtualization overhead. The MOCS table is now published as part of bspec, and versioned. Entries are supposed to never be modified, but new ones can be added. Adding entries increases table version. The patch includes version 1 entries. Meaning of each entry is now explained in bspec, and user mode clients are expected to know what each entry means. The 3 entries used for previous platforms are still compatible with their legacy definitions, but that is not guaranteed to be true for future platforms. v2: Fixed SCC values, improved commit comment (Daniele) v3: Improved MOCS table comment (Daniele) v4: Moved new entries below gen9 ones. Put common entries into definition to be used in multiple arrays. (Lucas) v5: Made defines for or-ing flags. Renamed macros from MOCS_TABLE to MOCS_ENTRIES. Switched LE_CoS to upper case. (Joonas) v6: Removed definitions of reserved entries. (Michal) Increased limit of entries sent to the hardware on gen11+. v7: Simplify table as done for previou gens (Lucas) v8: Rebase on cached number of entries per-platform and use new MOCS_ENTRY() macro (Lucas) BSpec: 34007 BSpec: 560 Signed-off-by: Tomasz Lis Reviewed-by: Daniele Ceraolo Spurio Reviewed-by: Lucas De Marchi Signed-off-by: Lucas De Marchi Acked-by: Tomasz Lis --- drivers/gpu/drm/i915/intel_mocs.c | 107 +++--- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index 716f3f6f2966..3afd8c30cacc 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -46,6 +46,8 @@ struct drm_i915_mocs_table { #define LE_SCC(value) ((value) << 8) #define LE_PFM(value) ((value) << 11) #define LE_SCF(value) ((value) << 14) +#define LE_COS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ #define L3_ESC(value) ((value) << 0) @@ -54,6 +56,7 @@ struct drm_i915_mocs_table { /* Helper defines */ #define GEN9_NUM_MOCS_ENTRIES 62 /* 62 out of 64 - 63 & 64 are reserved. */ +#define GEN11_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ /* (e)LLC caching options */ #define LE_0_PAGETABLE_LE_CACHEABILITY(0) @@ -89,18 +92,22 @@ struct drm_i915_mocs_table { * LNCFCMOCS0 - LNCFCMOCS32 registers. * * These tables are intended to be kept reasonably consistent across - * platforms. However some of the fields are not applicable to all of - * them. + * HW platforms, and for ICL+, be identical across OSes. To achieve + * that, for Icelake and above, list of entries is published as part + * of bspec. * * Entries not part of the following tables are undefined as far as * userspace is concerned and shouldn't be relied upon. For the time * being they will be initialized to PTE. * - * NOTE: These tables MUST start with being uncached and the length - * MUST be less than 63 as the last two registers are reserved - * by the hardware. These tables are part of the kernel ABI and - * may only be updated incrementally by adding entries at the - * end. + * The last two entries are reserved by the hardware. For ICL+ they + * should be initialized according to bspec and never used, for older + * platforms they should never be written to. + * + * NOTE: These tables are part of bspec and defined as part of hardware + * interface for ICL+. For older platforms, they are part of kernel + * ABI. It is expected that existing entries will remain constant + * and the tables will only be updated by adding new entries. We have no guarantee that the entries will remain constant across gens.. so maybe: + * ABI. It is expected that, for specific hardware platform, existing + * entries will remain constant and the table will only be updated + * by adding new entries, filling unused positions. -Tomasz */ #define GEN9_MOCS_ENTRIES \ MOCS_ENTRY(I915_MOCS_UNCACHED, LE_1_UC | LE_TC_2_LLC_ELLC, \ @@ -121,6 +128,84 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { L3_3_WB) }; +#define GEN11_MOCS_ENTRIES \ + /* Base - Uncached (Deprecated) */ \ + MOCS_ENTRY(I915_MOCS_UNCACHED, LE_1_UC | LE_TC_1_LLC, \ + L3_1_UC), \ + /* Base - L3 + LeCC:PAT (Deprecated) */ \ + MOCS_ENTRY(I915_MOCS_PTE, LE_0_PAGETABLE | LE_TC_1_LLC, \ + L3_3_WB), \ + /* Base - L3 + LLC */ \ + MOCS_ENTRY(2, LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_3_WB), \ + /* Base - Uncached */ \ + MOCS_ENTRY(3, LE_1_UC | LE_TC_1_LLC, \ +
Re: [Intel-gfx] [PATCH v8 6/7] drm/i915: cache number of MOCS entries
On 2019-01-22 06:12, Lucas De Marchi wrote: Instead of checking the gen number every time we need to know the max number of entries, just save it into the table struct so we don't need extra branches throughout the code. This will be useful for Ice Lake that has 64 rather than 62 defined entries. Ice Lake changes will be added in a follow up. v2: make size and n_entries `unsigned int` and introduce changes as a pre-work for the Ice Lake changes (Tvrtko) Suggested-by: Tvrtko Ursulin Signed-off-by: Lucas De Marchi I would name it total_entries or n_hw_entries, not n_entries; but that's just a name, so: Reviewed-by: Tomasz Lis -Tomasz --- drivers/gpu/drm/i915/intel_mocs.c | 27 ++- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index af2ae2f396ae..716f3f6f2966 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -32,7 +32,8 @@ struct drm_i915_mocs_entry { }; struct drm_i915_mocs_table { - u32 size; + unsigned int size; + unsigned int n_entries; const struct drm_i915_mocs_entry *table; }; @@ -140,10 +141,12 @@ static bool get_mocs_settings(struct drm_i915_private *dev_priv, if (IS_GEN9_BC(dev_priv) || IS_CANNONLAKE(dev_priv) || IS_ICELAKE(dev_priv)) { table->size = ARRAY_SIZE(skylake_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; table->table = skylake_mocs_table; result = true; } else if (IS_GEN9_LP(dev_priv)) { table->size = ARRAY_SIZE(broxton_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; table->table = broxton_mocs_table; result = true; } else { @@ -202,8 +205,6 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) if (!get_mocs_settings(dev_priv, )) return; - GEM_BUG_ON(table.size > GEN9_NUM_MOCS_ENTRIES); - /* Set unused values to PTE */ unused_value = table.table[I915_MOCS_PTE].control_value; @@ -215,7 +216,7 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) } /* All remaining entries are also unused */ - for (; index < GEN9_NUM_MOCS_ENTRIES; index++) + for (; index < table.n_entries; index++) I915_WRITE(mocs_register(engine->id, index), unused_value); } @@ -237,17 +238,17 @@ static int emit_mocs_control_table(struct i915_request *rq, u32 unused_value; u32 *cs; - if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES)) + if (GEM_WARN_ON(table->size > table->n_entries)) return -ENODEV; /* Set unused values to PTE */ unused_value = table->table[I915_MOCS_PTE].control_value; - cs = intel_ring_begin(rq, 2 + 2 * GEN9_NUM_MOCS_ENTRIES); + cs = intel_ring_begin(rq, 2 + 2 * table->n_entries); if (IS_ERR(cs)) return PTR_ERR(cs); - *cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES); + *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries); for (index = 0; index < table->size; index++) { u32 value = table->table[index].used ? @@ -258,7 +259,7 @@ static int emit_mocs_control_table(struct i915_request *rq, } /* All remaining entries are also unused */ - for (; index < GEN9_NUM_MOCS_ENTRIES; index++) { + for (; index < table->n_entries; index++) { *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); *cs++ = unused_value; } @@ -294,17 +295,17 @@ static int emit_mocs_l3cc_table(struct i915_request *rq, unsigned int i, unused_index; u32 *cs; - if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES)) + if (GEM_WARN_ON(table->size > table->n_entries)) return -ENODEV; /* Set unused values to PTE */ unused_index = I915_MOCS_PTE; - cs = intel_ring_begin(rq, 2 + GEN9_NUM_MOCS_ENTRIES); + cs = intel_ring_begin(rq, 2 + table->n_entries); if (IS_ERR(cs)) return PTR_ERR(cs); - *cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2); + *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries / 2); for (i = 0; i < table->size / 2; i++) { u16 low = table->table[2 * i].used ? @@ -327,7 +328,7 @@ static int emit_mocs_l3cc_table(struct i915_request *rq, } /* All remaining entries are also unused */ - for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) { + for (; i < table->n_entries / 2; i++) { *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); *cs++ = l3cc_combine(table, unused_index, unused_index); } @@ -384,7 +385,7 @@ void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_
Re: [Intel-gfx] [PATCH v8 5/7] drm/i915: keep track of used entries in MOCS table
On 2019-01-22 06:12, Lucas De Marchi wrote: Instead of considering we have defined entries for any index in the table, let's keep track of the ones we explicitly defined. This will allow Gen 11 to have it's new table defined in which we have holes of undefined entries. Repeated comments about the meaning of undefined entries were removed since they are overly verbose and copy-pasted in several functions: now the definition is in the top only. Signed-off-by: Lucas De Marchi Reviewed-by: Tomasz Lis -Tomasz --- drivers/gpu/drm/i915/intel_mocs.c | 88 --- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index faae2eefc5cc..af2ae2f396ae 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -28,6 +28,7 @@ struct drm_i915_mocs_entry { u32 control_value; u16 l3cc_value; + u16 used; }; struct drm_i915_mocs_table { @@ -75,6 +76,7 @@ struct drm_i915_mocs_table { [__idx] = { \ .control_value = __control_value, \ .l3cc_value = __l3cc_value, \ + .used = 1, \ } /* @@ -195,24 +197,26 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->i915; struct drm_i915_mocs_table table; unsigned int index; + u32 unused_value; if (!get_mocs_settings(dev_priv, )) return; GEM_BUG_ON(table.size > GEN9_NUM_MOCS_ENTRIES); - for (index = 0; index < table.size; index++) - I915_WRITE(mocs_register(engine->id, index), - table.table[index].control_value); + /* Set unused values to PTE */ + unused_value = table.table[I915_MOCS_PTE].control_value; - /* -* Now set the unused entries to PTE. These entries are officially -* undefined and no contract for the contents and settings is given -* for these entries. -*/ + for (index = 0; index < table.size; index++) { + u32 value = table.table[index].used ? + table.table[index].control_value : unused_value; + + I915_WRITE(mocs_register(engine->id, index), value); + } + + /* All remaining entries are also unused */ for (; index < GEN9_NUM_MOCS_ENTRIES; index++) - I915_WRITE(mocs_register(engine->id, index), - table.table[I915_MOCS_PTE].control_value); + I915_WRITE(mocs_register(engine->id, index), unused_value); } /** @@ -230,11 +234,15 @@ static int emit_mocs_control_table(struct i915_request *rq, { enum intel_engine_id engine = rq->engine->id; unsigned int index; + u32 unused_value; u32 *cs; if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES)) return -ENODEV; + /* Set unused values to PTE */ + unused_value = table->table[I915_MOCS_PTE].control_value; + cs = intel_ring_begin(rq, 2 + 2 * GEN9_NUM_MOCS_ENTRIES); if (IS_ERR(cs)) return PTR_ERR(cs); @@ -242,18 +250,17 @@ static int emit_mocs_control_table(struct i915_request *rq, *cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES); for (index = 0; index < table->size; index++) { + u32 value = table->table[index].used ? + table->table[index].control_value : unused_value; + *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); - *cs++ = table->table[index].control_value; + *cs++ = value; } - /* -* Now set the unused entries to PTE. These entries are officially -* undefined and no contract for the contents and settings is given -* for these entries. -*/ + /* All remaining entries are also unused */ for (; index < GEN9_NUM_MOCS_ENTRIES; index++) { *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); - *cs++ = table->table[I915_MOCS_PTE].control_value; + *cs++ = unused_value; } *cs++ = MI_NOOP; @@ -284,12 +291,15 @@ static inline u32 l3cc_combine(const struct drm_i915_mocs_table *table, static int emit_mocs_l3cc_table(struct i915_request *rq, const struct drm_i915_mocs_table *table) { - unsigned int i; + unsigned int i, unused_index; u32 *cs; if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES)) return -ENODEV; + /* Set unused values to PTE */ + unused_index = I915_MOCS_PTE; + cs = intel_ring_begin(rq, 2 + GEN9_NUM_MOCS_ENTRIES); if (IS_ERR(cs)) return PTR_ERR(cs); @@ -297,25 +307,29 @@ static int emit_mocs_l3cc_table(struct i915_request *rq, *cs++ = MI_LOAD_REGIS
Re: [Intel-gfx] [PATCH v8 4/7] drm/i915: use a macro to define MOCS entries
On 2019-01-22 06:12, Lucas De Marchi wrote: Let's use a macro to make tables smaller and at the same time allow us to add fields that apply to all entries in future. For the sake of readability, I'm calling an exception on 80 chars limit. Lines are aligned for easy comparison of the entry values. Signed-off-by: Lucas De Marchi Reviewed-by: Tomasz Lis --- drivers/gpu/drm/i915/intel_mocs.c | 39 +++ 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index c7a2a8d81d90..faae2eefc5cc 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -71,6 +71,12 @@ struct drm_i915_mocs_table { #define L3_2_RESERVED _L3_CACHEABILITY(2) #define L3_3_WB _L3_CACHEABILITY(3) +#define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \ + [__idx] = { \ + .control_value = __control_value, \ + .l3cc_value = __l3cc_value, \ + } + /* * MOCS tables * @@ -93,40 +99,23 @@ struct drm_i915_mocs_table { * may only be updated incrementally by adding entries at the * end. */ - The idea behind this EOL was that the comment above relates to several statements below, not just the first one. But I'm not really sure what our commenting rules are in this case - a comment which bundles several definitions. -Tomasz #define GEN9_MOCS_ENTRIES \ - [I915_MOCS_UNCACHED] = { \ - /* 0x0009 */ \ - .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, \ - /* 0x0010 */ \ - .l3cc_value = L3_1_UC, \ - }, \ - [I915_MOCS_PTE] = { \ - /* 0x0038 */ \ - .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ - /* 0x0030 */ \ - .l3cc_value = L3_3_WB, \ - } + MOCS_ENTRY(I915_MOCS_UNCACHED, LE_1_UC | LE_TC_2_LLC_ELLC, \ + L3_1_UC), \ + MOCS_ENTRY(I915_MOCS_PTE, LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ + L3_3_WB) \ static const struct drm_i915_mocs_entry skylake_mocs_table[] = { GEN9_MOCS_ENTRIES, - [I915_MOCS_CACHED] = { - /* 0x003b */ - .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value = L3_3_WB, - }, + MOCS_ENTRY(I915_MOCS_CACHED,LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB) }; /* NOTE: the LE_TGT_CACHE is not used on Broxton */ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { GEN9_MOCS_ENTRIES, - [I915_MOCS_CACHED] = { - /* 0x0039 */ - .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value = L3_3_WB, - }, + MOCS_ENTRY(I915_MOCS_CACHED,LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB) }; /** ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v8 3/7] drm/i915/skl: Rework MOCS tables to keep common part in a define
On 2019-01-22 06:12, Lucas De Marchi wrote: From: Tomasz Lis The MOCS tables are going to be very similar across platforms. To reduce the amount of copied code, this patch rips the common part and puts it into a definition valid for all gen9 platforms. v2: Made defines for or-ing flags. Renamed macros from MOCS_TABLE to MOCS_ENTRIES. (Joonas) v3 (Lucas): - Fix indentation - Rebase on rework done by additional patch - Remove define for or-ing flags as it made the table more complex by requiring zeroed values to be passed - Do not embed comma in the macro, so to treat that just as another item and please source code formatting tools Signed-off-by: Tomasz Lis Suggested-by: Lucas De Marchi Signed-off-by: Lucas De Marchi Acked-by: Tomasz Lis -Tomasz --- drivers/gpu/drm/i915/intel_mocs.c | 57 ++- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index 4ea80bb7dcc8..c7a2a8d81d90 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -93,46 +93,39 @@ struct drm_i915_mocs_table { * may only be updated incrementally by adding entries at the * end. */ + +#define GEN9_MOCS_ENTRIES \ + [I915_MOCS_UNCACHED] = { \ + /* 0x0009 */ \ + .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, \ + /* 0x0010 */ \ + .l3cc_value = L3_1_UC, \ + }, \ + [I915_MOCS_PTE] = { \ + /* 0x0038 */ \ + .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ + /* 0x0030 */ \ + .l3cc_value = L3_3_WB, \ + } + static const struct drm_i915_mocs_entry skylake_mocs_table[] = { - [I915_MOCS_UNCACHED] = { - /* 0x0009 */ - .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, - /* 0x0010 */ - .l3cc_value =L3_1_UC, - }, - [I915_MOCS_PTE] = { - /* 0x0038 */ - .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value =L3_3_WB, - }, + GEN9_MOCS_ENTRIES, [I915_MOCS_CACHED] = { - /* 0x003b */ - .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value = L3_3_WB, + /* 0x003b */ + .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), + /* 0x0030 */ + .l3cc_value = L3_3_WB, }, }; /* NOTE: the LE_TGT_CACHE is not used on Broxton */ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { - [I915_MOCS_UNCACHED] = { - /* 0x0009 */ - .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, - /* 0x0010 */ - .l3cc_value = L3_1_UC, - }, - [I915_MOCS_PTE] = { - /* 0x0038 */ - .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value = L3_3_WB, - }, + GEN9_MOCS_ENTRIES, [I915_MOCS_CACHED] = { - /* 0x0039 */ - .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), - /* 0x0030 */ - .l3cc_value = L3_3_WB, + /* 0x0039 */ + .control_value = LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), + /* 0x0030 */ + .l3cc_value = L3_3_WB, }, }; ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v8 1/7] drm/i915: initialize unused MOCS entries to PTE
On 2019-01-22 06:12, Lucas De Marchi wrote: Instead of initializing them to uncached, let's set them to PTE for kernel tracking. While at it do some minor adjustments to comments and coding style. Signed-off-by: Lucas De Marchi Reviewed-by: Tomasz Lis One comment (with no expectations for change) below. --- drivers/gpu/drm/i915/intel_mocs.c | 56 +-- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index e976c5ce5479..0d6b94a239d6 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -85,10 +85,7 @@ struct drm_i915_mocs_table { * * Entries not part of the following tables are undefined as far as * userspace is concerned and shouldn't be relied upon. For the time - * being they will be implicitly initialized to the strictest caching - * configuration (uncached) to guarantee forwards compatibility with - * userspace programs written against more recent kernels providing - * additional MOCS entries. + * being they will be initialized to PTE. * * NOTE: These tables MUST start with being uncached and the length * MUST be less than 63 as the last two registers are reserved @@ -249,16 +246,13 @@ void intel_mocs_init_engine(struct intel_engine_cs *engine) table.table[index].control_value); /* -* Ok, now set the unused entries to uncached. These entries -* are officially undefined and no contract for the contents -* and settings is given for these entries. -* -* Entry 0 in the table is uncached - so we are just writing -* that value to all the used entries. +* Now set the unused entries to PTE. These entries are officially +* undefined and no contract for the contents and settings is given +* for these entries. */ for (; index < GEN9_NUM_MOCS_ENTRIES; index++) I915_WRITE(mocs_register(engine->id, index), - table.table[0].control_value); + table.table[I915_MOCS_PTE].control_value); } /** @@ -293,16 +287,13 @@ static int emit_mocs_control_table(struct i915_request *rq, } /* -* Ok, now set the unused entries to uncached. These entries -* are officially undefined and no contract for the contents -* and settings is given for these entries. -* -* Entry 0 in the table is uncached - so we are just writing -* that value to all the used entries. +* Now set the unused entries to PTE. These entries are officially +* undefined and no contract for the contents and settings is given +* for these entries. */ for (; index < GEN9_NUM_MOCS_ENTRIES; index++) { *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); - *cs++ = table->table[0].control_value; + *cs++ = table->table[I915_MOCS_PTE].control_value; Entries from enum i915_mocs_table_index are not guaranteed to mean the same thing in future gens; but for the time, that will work. And later it might still work, we don't know. -Tomasz } *cs++ = MI_NOOP; @@ -345,7 +336,7 @@ static int emit_mocs_l3cc_table(struct i915_request *rq, *cs++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2); - for (i = 0; i < table->size/2; i++) { + for (i = 0; i < table->size / 2; i++) { *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); *cs++ = l3cc_combine(table, 2 * i, 2 * i + 1); } @@ -353,18 +344,18 @@ static int emit_mocs_l3cc_table(struct i915_request *rq, if (table->size & 0x01) { /* Odd table size - 1 left over */ *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); - *cs++ = l3cc_combine(table, 2 * i, 0); + *cs++ = l3cc_combine(table, 2 * i, I915_MOCS_PTE); i++; } /* -* Now set the rest of the table to uncached - use entry 0 as -* this will be uncached. Leave the last pair uninitialised as -* they are reserved by the hardware. +* Now set the unused entries to PTE. These entries are officially +* undefined and no contract for the contents and settings is given +* for these entries. */ for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) { *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); - *cs++ = l3cc_combine(table, 0, 0); + *cs++ = l3cc_combine(table, I915_MOCS_PTE, I915_MOCS_PTE); } *cs++ = MI_NOOP; @@ -395,22 +386,21 @@ void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_priv) if (!get_mocs_settings(dev_priv, )) return; - for (i = 0; i < table.size/2; i++) - I915_WRITE(GEN9_LNCFCMOCS(i), l3cc_combine(, 2*i, 2*i+1)); + for (i =
Re: [Intel-gfx] [PATCH v8 2/7] drm/i915: Simplify MOCS table definition
On 2019-01-22 06:12, Lucas De Marchi wrote: Make the defines for LE and L3 caching options to contain the shifts and remove the zeros from the tables as shifting zeros always result in zero. Starting from Ice Lake the MOCS table is defined in the spec and contains all entries. So to simplify checking the table with the values set in code, the value is now part of the macro name. This allows to still give the most used option and sensible name, but also to easily cross check the table from the spec for gen >= 11. By removing the zeros we avoid maintaining a huge table since the one from spec contains many more entries. The new table for Ice Lake will be added by other patches, this only reformats the table. While at it also fix the indentation. Signed-off-by: Lucas De Marchi That is much cleaner. Reviewed-by: Tomasz Lis -Tomasz --- drivers/gpu/drm/i915/intel_mocs.c | 80 +++ 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index 0d6b94a239d6..4ea80bb7dcc8 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -36,8 +36,8 @@ struct drm_i915_mocs_table { }; /* Defines for the tables (XXX_MOCS_0 - XXX_MOCS_63) */ -#define LE_CACHEABILITY(value) ((value) << 0) -#define LE_TGT_CACHE(value)((value) << 2) +#define _LE_CACHEABILITY(value)((value) << 0) +#define _LE_TGT_CACHE(value) ((value) << 2) #define LE_LRUM(value)((value) << 4) #define LE_AOM(value) ((value) << 6) #define LE_RSC(value) ((value) << 7) @@ -48,28 +48,28 @@ struct drm_i915_mocs_table { /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ #define L3_ESC(value) ((value) << 0) #define L3_SCC(value) ((value) << 1) -#define L3_CACHEABILITY(value) ((value) << 4) +#define _L3_CACHEABILITY(value)((value) << 4) /* Helper defines */ #define GEN9_NUM_MOCS_ENTRIES 62 /* 62 out of 64 - 63 & 64 are reserved. */ /* (e)LLC caching options */ -#define LE_PAGETABLE 0 -#define LE_UC 1 -#define LE_WT 2 -#define LE_WB 3 - -/* L3 caching options */ -#define L3_DIRECT 0 -#define L3_UC 1 -#define L3_RESERVED2 -#define L3_WB 3 +#define LE_0_PAGETABLE _LE_CACHEABILITY(0) +#define LE_1_UC_LE_CACHEABILITY(1) +#define LE_2_WT_LE_CACHEABILITY(2) +#define LE_3_WB_LE_CACHEABILITY(3) /* Target cache */ -#define LE_TC_PAGETABLE0 -#define LE_TC_LLC 1 -#define LE_TC_LLC_ELLC 2 -#define LE_TC_LLC_ELLC_ALT 3 +#define LE_TC_0_PAGETABLE _LE_TGT_CACHE(0) +#define LE_TC_1_LLC_LE_TGT_CACHE(1) +#define LE_TC_2_LLC_ELLC _LE_TGT_CACHE(2) +#define LE_TC_3_LLC_ELLC_ALT _LE_TGT_CACHE(3) + +/* L3 caching options */ +#define L3_0_DIRECT_L3_CACHEABILITY(0) +#define L3_1_UC_L3_CACHEABILITY(1) +#define L3_2_RESERVED _L3_CACHEABILITY(2) +#define L3_3_WB_L3_CACHEABILITY(3) /* * MOCS tables @@ -96,31 +96,21 @@ struct drm_i915_mocs_table { static const struct drm_i915_mocs_entry skylake_mocs_table[] = { [I915_MOCS_UNCACHED] = { /* 0x0009 */ - .control_value = LE_CACHEABILITY(LE_UC) | - LE_TGT_CACHE(LE_TC_LLC_ELLC) | - LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | - LE_PFM(0) | LE_SCF(0), - + .control_value = LE_1_UC | LE_TC_2_LLC_ELLC, /* 0x0010 */ - .l3cc_value =L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC), + .l3cc_value =L3_1_UC, }, [I915_MOCS_PTE] = { /* 0x0038 */ - .control_value = LE_CACHEABILITY(LE_PAGETABLE) | - LE_TGT_CACHE(LE_TC_LLC_ELLC) | - LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | - LE_PFM(0) | LE_SCF(0), + .control_value = LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), /* 0x0030 */ - .l3cc_value =L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB), + .l3cc_value =L3_3_WB, }, [I915_MOCS_CACHED] = { /* 0x003b */ - .control_value = LE_CACHEABILITY(LE_WB) | - LE_TGT_CACHE(LE_TC_LLC_ELLC) | - LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | - LE_PFM(0) | LE_SCF(0), + .control_value = LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), /* 0x0030 */ - .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB), + .l3cc_value = L3_3_WB,
Re: [Intel-gfx] [PATCH v7 3/4] drm/i915/icl: Define MOCS table for Icelake
On 2018-12-21 13:29, Tvrtko Ursulin wrote: On 14/12/2018 18:20, Lucas De Marchi wrote: From: Tomasz Lis The table has been unified across OSes to minimize virtualization overhead. The MOCS table is now published as part of bspec, and versioned. Entries are supposed to never be modified, but new ones can be added. Adding entries increases table version. The patch includes version 1 entries. Meaning of each entry is now explained in bspec, and user mode clients are expected to know what each entry means. The 3 entries used for previous platforms are still compatible with their legacy definitions, but that is not guaranteed to be true for future platforms. v2: Fixed SCC values, improved commit comment (Daniele) v3: Improved MOCS table comment (Daniele) v4: Moved new entries below gen9 ones. Put common entries into definition to be used in multiple arrays. (Lucas) v5: Made defines for or-ing flags. Renamed macros from MOCS_TABLE to MOCS_ENTRIES. Switched LE_CoS to upper case. (Joonas) v6: Removed definitions of reserved entries. (Michal) Increased limit of entries sent to the hardware on gen11+. v7: Simplify table as done for previou gens (Lucas) BSpec: 34007 BSpec: 560 Signed-off-by: Tomasz Lis Reviewed-by: Daniele Ceraolo Spurio Reviewed-by: Lucas De Marchi Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/i915/intel_mocs.c | 187 ++ 1 file changed, 162 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index 577633cefb8a..dfc4edea020f 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -44,6 +44,8 @@ struct drm_i915_mocs_table { #define LE_SCC(value) ((value) << 8) #define LE_PFM(value) ((value) << 11) #define LE_SCF(value) ((value) << 14) +#define LE_COS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ #define L3_ESC(value) ((value) << 0) @@ -52,6 +54,10 @@ struct drm_i915_mocs_table { /* Helper defines */ #define GEN9_NUM_MOCS_ENTRIES 62 /* 62 out of 64 - 63 & 64 are reserved. */ +#define GEN11_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ + +#define NUM_MOCS_ENTRIES(i915) \ + (INTEL_GEN(i915) < 11 ? GEN9_NUM_MOCS_ENTRIES : GEN11_NUM_MOCS_ENTRIES) /* (e)LLC caching options */ #define LE_0_PAGETABLE _LE_CACHEABILITY(0) @@ -80,21 +86,21 @@ struct drm_i915_mocs_table { * LNCFCMOCS0 - LNCFCMOCS32 registers. * * These tables are intended to be kept reasonably consistent across - * platforms. However some of the fields are not applicable to all of - * them. + * HW platforms, and for ICL+, be identical across OSes. To achieve + * that, for Icelake and above, list of entries is published as part + * of bspec. * * Entries not part of the following tables are undefined as far as - * userspace is concerned and shouldn't be relied upon. For the time - * being they will be implicitly initialized to the strictest caching - * configuration (uncached) to guarantee forwards compatibility with - * userspace programs written against more recent kernels providing - * additional MOCS entries. + * userspace is concerned and shouldn't be relied upon. + * + * The last two entries are reserved by the hardware. For ICL+ they + * should be initialized according to bspec and never used, for older + * platforms they should never be written to. * - * NOTE: These tables MUST start with being uncached and the length - * MUST be less than 63 as the last two registers are reserved - * by the hardware. These tables are part of the kernel ABI and - * may only be updated incrementally by adding entries at the - * end. + * NOTE: These tables are part of bspec and defined as part of hardware + * interface for ICL+. For older platforms, they are part of kernel + * ABI. It is expected that existing entries will remain constant + * and the tables will only be updated by adding new entries. */ #define GEN9_MOCS_ENTRIES \ @@ -132,6 +138,132 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { }, }; +#define GEN11_MOCS_ENTRIES \ + [0] = { \ + /* Base - Uncached (Deprecated) */ \ + .control_value = LE_1_UC | LE_TC_1_LLC, \ + .l3cc_value = L3_1_UC \ + }, \ + [1] = { \ + /* Base - L3 + LeCC:PAT (Deprecated) */ \ + .control_value = LE_0_PAGETABLE | LE_TC_1_LLC, \ + .l3cc_value = L3_3_WB \ + }, \ + [2] = { \ + /* Base - L3 + LLC */ \ + .control_value = LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + .l3cc_value = L3_3_WB \ + }, \ + [3] = { \ + /* Base - Uncached */ \ + .control_value = LE_1_UC | LE_TC_1_LLC, \ + .l3cc_value = L3_1_UC \ + }, \ + [4] = { \ + /* Base - L3 */ \ +
Re: [Intel-gfx] [PATCH v6] drm/i915/icl: Preempt-to-idle support in execlists.
On 2018-12-14 12:10, Joonas Lahtinen wrote: Quoting Tvrtko Ursulin (2018-12-10 17:40:34) On 09/11/2018 17:18, Tomasz Lis wrote: The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. The advantage of this new preemption path is that one less context switch is needed, and returning information about preempion being complete is received earlier. This leads to significant improvement in our IGT latency test. Test performed: `gem_exec_latency --run-subtest render-preemption`, executed 100 times, on the same platform, same kernel, without and with this patch. Then taken average of the execution latency times: subcase old preempt.icl preempt. render-render 853.2036840.1176 render-bsd2328.8708 2083.2576 render-blt2080.1501 1852.0792 render-vebox 1553.5134 1428.762 Improvement observed: subcase improvement render-render 1.53% render-bsd10.55% render-blt10.96% render-vebox 8.03% Who can explain what do the parts other than render-render mean? At least I can make sense of render-render - measure how long it takes for one context to preempt another, but render-$other draws a blank for me. How are engines pre-empting one another? These cases submit low priority spin buffer to 'render' ring, and then on high priority context they submit two buffers which only write current timestamp to a known location: first one to 'render', and second to the other engine. Submission to the other engine with the same context makes sure that 'render' gets back to spin buffer after each iteration. The two high priority tasks are not parallelized, because they announce they write to the same object. The time taken to do all operations in one iteration is then displayed. So the time includes: - preempting spin buffer - executing timestamp write from within 'render' - executing timestamp write from within other ring (while at the same time, spin buffer gets back to execution on render) If I'm not mistaken with the above, it looks like the 'render-render' case is the worst one to measure performance increase - if both high priority tasks are executed on the same engine, there's a considerable chance that the hardware will get straight to another pair of them, without going though spin buffer and preempting it. -Tomasz But anyway, even if only the 1.53% improvement is the real one, FWIW that's I think good enough to justify the patch. It is sufficiently small and contained that I don't see a problem. So: Acked-by: Tvrtko Ursulin According to Chris, the baseline measurements are off by a decade or so compared to where they should be. This might be attributed to execution on frequency locked parts? Would it be worthy to repeat the numbers with some unlocked parts? Regards, Joonas Regards, Tvrtko v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed conyext state tonot assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. v4: Simplified needs_preempt_context() change. (Daniele) Removed clearing HWACK flag in idle-to-idle preempt. (Daniele) v5: Renamed inject_preempt_context(). (Daniele) Removed duplicated GEM_BUG_ON() on HWACK (Daniele) v6: Added performance test results. Bspec: 18922 Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Daniele Ceraolo Spurio Cc: Michal Winiarski Cc: Mika Kuoppala Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Tomasz Lis --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem_context.c | 3 +- drivers/gpu/drm/i915/i915_pci.c | 3 +- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 109 +-- drivers/gpu/drm/i915/intel_lrc.h | 1 + 6 files changed, 84 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 08d25aa..d2cc9f1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2579,6 +2579,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define
Re: [Intel-gfx] [PATCH v2 1/2] drm/i915/icl: Define MOCS table for Icelake
On 2018-10-22 19:40, Daniele Ceraolo Spurio wrote: On 22/10/18 10:13, Tomasz Lis wrote: The table has been unified across OSes to minimize virtualization overhead. The MOCS table is now published as part of bspec, and versioned. Entries are supposed to never be modified, but new ones can be added. Adding entries increases table version. The patch includes version 1 entries. Meaning of each entry is now explained in bspec, and user mode clients are expected to know what each entry means. The 3 entries used for previous platforms are still compatible with their legacy definitions, but that is not guaranteed to be true for future platforms. BSpec: 34007 BSpec: 560 Signed-off-by: Tomasz Lis Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Mika Kuoppala Cc: Daniele Ceraolo Spurio Cc: Zhenyu Wang Cc: Zhi A Wang Cc: Anuj Phogat --- drivers/gpu/drm/i915/intel_mocs.c | 246 +- 1 file changed, 244 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index 77e9871..dc34e83 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -44,6 +44,8 @@ struct drm_i915_mocs_table { #define LE_SCC(value) ((value) << 8) #define LE_PFM(value) ((value) << 11) #define LE_SCF(value) ((value) << 14) +#define LE_CoS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ #define L3_ESC(value) ((value) << 0) @@ -96,6 +98,243 @@ struct drm_i915_mocs_table { * may only be updated incrementally by adding entries at the * end. */ This comment needs to be updated as some of the expectations are not true anymore, e.g. we're now defining entries 62 and 63 in SW, usage of mocs 0 is changing. Also could be useful to add some of the info you have in the commit message here, like the fact that the table is versioned in the specs for gen11+, so it is close to the table. From a POV of following the specs, with the updated comments this is: Reviewed-by: Daniele Ceraolo Spurio But please get acks from the relevant interested parties to make sure there are no concerns with the new approach. I believe all parties are informed; will add some more cc's to v3. Also having someone else double-check the table as well would be nice since I might have missed something and it's going to be hard to catch issues in testing if the only impact is a very small performance delta. Thanks, Daniele The IGT test for MOCS settings compares explicitly content of the registers with expected values hard-coded into that test; since someone else than me will be updating that test for correct Icelake values, we can count that as second verification. -Tomasz +static const struct drm_i915_mocs_entry icelake_mocs_table[] = { + [0] = { + /* Base - Uncached (Deprecated) */ + .control_value = LE_CACHEABILITY(LE_UC) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC), + }, + [1] = { + /* Base - L3 + LeCC:PAT (Deprecated) */ + .control_value = LE_CACHEABILITY(LE_PAGETABLE) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB), + }, + [2] = { + /* Base - L3 + LLC */ + .control_value = LE_CACHEABILITY(LE_WB) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB), + }, + [3] = { + /* Base - Uncached */ + .control_value = LE_CACHEABILITY(LE_UC) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC), + }, + [4] = { + /* Base - L3 */ + .control_value = LE_CACHEABILITY(LE_UC) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB), + }, + [5] = { + /* Base - LLC */ + .control_value = LE_CACHEABILITY(LE_WB) | + LE_TGT_CACHE(LE_TC_LLC) | + LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | + LE_PFM(0) | LE_SCF(0) | LE_CoS(0) | LE_SSE(0), + + .l3cc_value = L3_ESC(0) | L3_SCC(0) |
Re: [Intel-gfx] [PATCH v2 2/2] drm/i915/icl: Add IOCTL for getting MOCS table version
On 2018-10-22 20:18, Daniele Ceraolo Spurio wrote: On 22/10/18 10:13, Tomasz Lis wrote: For Icelake and above, MOCS table for each platform is published within bspec. The table is versioned, and new entries are assigned a version number. Existing entries do not change and their version is constant. This introduces a parameter which allows getting max version number of the MOCS entries currently supported, ie. value of 2 would mean only version 1 and version 2 entries are initialized and can be used by the user mode clients. BSpec: 34007 Signed-off-by: Tomasz Lis Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Mika Kuoppala Cc: Daniele Ceraolo Spurio Cc: Zhenyu Wang Cc: Zhi A Wang Cc: Anuj Phogat --- drivers/gpu/drm/i915/i915_drv.c | 6 ++ drivers/gpu/drm/i915/intel_mocs.c | 12 drivers/gpu/drm/i915/intel_mocs.h | 1 + include/uapi/drm/i915_drm.h | 11 +++ 4 files changed, 30 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index baac35f..92fa8fd 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -53,6 +53,7 @@ #include "i915_vgpu.h" #include "intel_drv.h" #include "intel_uc.h" +#include "intel_mocs.h" static struct drm_driver driver; @@ -444,6 +445,11 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data, case I915_PARAM_MMAP_GTT_COHERENT: value = INTEL_INFO(dev_priv)->has_coherent_ggtt; break; + case I915_PARAM_MOCS_TABLE_VERSION: + value = intel_mocs_table_version(dev_priv); + if (!value) + return -ENODEV; Do we really want to return -ENODEV for platforms that do have a MOCS table programmed, but the table is not one versioned in the specs (i.e. Gen9-10)? I think returning "0" for those to indicate "kernel-defined table" would be ok and we could limit -ENODEV for platforms that don't have a table at all. But what wins is what the callers of the ioctl would like to get from the kernel ;) + break; default: DRM_DEBUG("Unknown parameter %d\n", param->param); return -EINVAL; diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c index dc34e83..fc1e98b 100644 --- a/drivers/gpu/drm/i915/intel_mocs.c +++ b/drivers/gpu/drm/i915/intel_mocs.c @@ -469,6 +469,18 @@ static i915_reg_t mocs_register(enum intel_engine_id engine_id, int index) } /** + * intel_mocs_table_version() - get version of mocs table implementation + * @i915: i915 device struct. + */ +int intel_mocs_table_version(struct drm_i915_private *i915) +{ + if (IS_ICELAKE(i915)) + return 1; Can we add this version value as a define above the table, to keep them close to each other? If we agree on my suggestion above to differentiate between no table at all (< 0), kernel-defined table (= 0) and spec-defined versioned table (> 0), it might also be useful to store the version with the table info in drm_i915_mocs_table and then have intel_mocs_table_version call get_mocs_settings(), e.g: int intel_mocs_table_version(struct drm_i915_private *i915) { struct drm_i915_mocs_table table; if (!get_mocs_settings(i915, )) return -ENODEV; return table->version; } Thanks, Daniele That does sound reasonable. And I agree we should really ask userland what exactly they want. Right now there is no request from any UMD for such interface; we will get back to this patch when it is requested. Joonas also mentioned there is no need for an interface which always returns "1", and is expected to return only that value for future platforms as well. That's a good argument. I will remove this patch from the current series. -Tomasz + else + return 0; +} + +/** * intel_mocs_init_engine() - emit the mocs control table * @engine: The engine for whom to emit the registers. * diff --git a/drivers/gpu/drm/i915/intel_mocs.h b/drivers/gpu/drm/i915/intel_mocs.h index d89080d..dc1d64a 100644 --- a/drivers/gpu/drm/i915/intel_mocs.h +++ b/drivers/gpu/drm/i915/intel_mocs.h @@ -55,5 +55,6 @@ int intel_rcs_context_init_mocs(struct i915_request *rq); void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_priv); void intel_mocs_init_engine(struct intel_engine_cs *engine); +int intel_mocs_table_version(struct drm_i915_private *i915); #endif diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 298b2e1..16aafc4 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -559,6 +559,17 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_MMAP_GTT_COHERENT 52 +/* + * Query MOCS table version used during hardware initialization. + * + * The MOCS table for each platform is published as part of bspec. Entries in + * the table are supposed to never be modified, but new enties are added, making + * more indexes in the table valid. This parameter informs which
Re: [Intel-gfx] [PATCH v5] drm/i915/icl: Preempt-to-idle support in execlists.
On 2018-10-23 11:13, Joonas Lahtinen wrote: Quoting Lis, Tomasz (2018-10-19 19:00:15) On 2018-10-16 12:53, Joonas Lahtinen wrote: Quoting Tomasz Lis (2018-10-15 20:29:18) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed conyext state tonot assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. v4: Simplified needs_preempt_context() change. (Daniele) Removed clearing HWACK flag in idle-to-idle preempt. (Daniele) v5: Renamed inject_preempt_context(). (Daniele) Removed duplicated GEM_BUG_ON() on HWACK (Daniele) Bspec: 18922 Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Daniele Ceraolo Spurio Cc: Michal Winiarski Cc: Mika Kuoppala Reviewed-by: Daniele Ceraolo Spurio This R-b was on v4, and should be indicated with # v4 comment. The commit message doesn't say much about why preempting to idle is beneficial? The pre-Gen11 codepath needs to be maintained anyway. Regards, Joonas The benefit is one less context switch - there is no "preempt context". Yes. But that still doesn't quite explain what material benefits there are? :) Is there some actual workloads/microbenchmarks that get an improvement? This alters the behavior between different platforms for a very delicate feature, probably resulting in slightly different bugs. So there should be some more reasoning than just because we can. Regards, Joonas Less context switching does imply perf improvement, though it would require measurement - it might be hardly detectable. We may even lose performance - without measurements, we don't know. So not a strong argument. One more benefit I could think of is - GuC path will use preempt-to-idle, so this would make execlists use the same path as GuC. But that's not a strong argument as well. I must agree - there doesn't seem to be any strong enough reason to go with this change. We might consider it after we have performance data. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v5] drm/i915/icl: Preempt-to-idle support in execlists.
On 2018-10-16 12:53, Joonas Lahtinen wrote: Quoting Tomasz Lis (2018-10-15 20:29:18) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed conyext state tonot assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. v4: Simplified needs_preempt_context() change. (Daniele) Removed clearing HWACK flag in idle-to-idle preempt. (Daniele) v5: Renamed inject_preempt_context(). (Daniele) Removed duplicated GEM_BUG_ON() on HWACK (Daniele) Bspec: 18922 Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Daniele Ceraolo Spurio Cc: Michal Winiarski Cc: Mika Kuoppala Reviewed-by: Daniele Ceraolo Spurio This R-b was on v4, and should be indicated with # v4 comment. The commit message doesn't say much about why preempting to idle is beneficial? The pre-Gen11 codepath needs to be maintained anyway. Regards, Joonas The benefit is one less context switch - there is no "preempt context". -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC] drm/i915/guc: New GuC stage descriptors
On 2018-10-12 20:25, Daniele Ceraolo Spurio wrote: With the new interface, GuC now requires every lrc to be registered in one of the stage descriptors, which have been re-designed so that each descriptor can store up to 64 lrc per class (i.e. equal to the possible SW counter values). Similarly to what happened with the previous legacy design, it is possible to have a single "proxy" descriptor that owns the workqueue and the doorbell and use it for all submission. To distinguish the proxy descriptors from the one used for lrc storage, the latter have been called "principal". A descriptor can't be both a proxy and a principal at the same time; to enforce this, since we only use 1 proxy descriptor per client, we reserve enough descriptor from the bottom of the pool to be used as proxy and leave the others as principals. For simplicity, we currently map context IDs 1:1 to principal descriptors, but we could have more contexts in flight if needed by using the SW counter. Does this have any influence on the concept of "default context" used by UMDs? Or is this completely separate? Note that the lrcs need to be mapped in the principal descriptor until guc is done with them. This means that we can't release the HW id when the user app closes the ctx because it might still be in flight with GuC and that we need to be careful when unpinning because the fact that the s/the// a request on the next context has completed doesn't mean that GuC is done processing the first one. See in-code comments for details. NOTE: GuC is not going to look at lrcs that are not in flight, so we could potentially skip the unpinning steps. However, the unpinnig steps perform extra correctness check so better keep them until we're sure that the flow is solid. Based on an initial patch by Oscar Mateo. RFC: this will be sent as part of the updated series once we have the gen9 FW with the new interface, but since the flow is significantly different compared to the previous version I'd like to gather some early feedback to make sure there aren't any major issues. Signed-off-by: Daniele Ceraolo Spurio Signed-off-by: Michal Wajdeczko Cc: Chris Wilson Cc: Michel Thierry Cc: Michal Winiarski Cc: Tomasz Lis --- drivers/gpu/drm/i915/i915_debugfs.c | 30 +- drivers/gpu/drm/i915/i915_drv.h | 5 +- drivers/gpu/drm/i915/i915_gem_context.c | 9 +- drivers/gpu/drm/i915/intel_guc.h| 14 +- drivers/gpu/drm/i915/intel_guc_fwif.h | 73 +++-- drivers/gpu/drm/i915/intel_guc_submission.c | 346 +++- drivers/gpu/drm/i915/intel_lrc.c| 18 +- drivers/gpu/drm/i915/intel_lrc.h| 7 + drivers/gpu/drm/i915/selftests/intel_guc.c | 4 +- 9 files changed, 360 insertions(+), 146 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 00c551d3e409..04bbde4a38a6 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2474,11 +2474,10 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data) seq_printf(m, "GuC stage descriptor %u:\n", index); seq_printf(m, "\tIndex: %u\n", desc->stage_id); + seq_printf(m, "\tProxy Index: %u\n", desc->proxy_id); seq_printf(m, "\tAttribute: 0x%x\n", desc->attribute); seq_printf(m, "\tPriority: %d\n", desc->priority); seq_printf(m, "\tDoorbell id: %d\n", desc->db_id); - seq_printf(m, "\tEngines used: 0x%x\n", - desc->engines_used); seq_printf(m, "\tDoorbell trigger phy: 0x%llx, cpu: 0x%llx, uK: 0x%x\n", desc->db_trigger_phy, desc->db_trigger_cpu, @@ -2490,18 +2489,21 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data) seq_putc(m, '\n'); for_each_engine_masked(engine, dev_priv, client->engines, tmp) { - u32 guc_engine_id = engine->guc_id; - struct guc_execlist_context *lrc = - >lrc[guc_engine_id]; - - seq_printf(m, "\t%s LRC:\n", engine->name); - seq_printf(m, "\t\tContext desc: 0x%x\n", - lrc->context_desc); - seq_printf(m, "\t\tContext id: 0x%x\n", lrc->context_id); - seq_printf(m, "\t\tLRCA: 0x%x\n", lrc->ring_lrca); - seq_printf(m, "\t\tRing begin: 0x%x\n", lrc->ring_begin); - seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end); - seq_putc(m, '\n'); + u8 class = engine->class; + u8 inst = engine->instance; + + if (desc->lrc_alloc_map[class].bitmap & BIT(inst)) { + struct guc_execlist_context *lrc = +
Re: [Intel-gfx] [PATCH v2] drm/i915: GEM_WARN_ON considered harmful
On 2018-10-12 08:31, Tvrtko Ursulin wrote: From: Tvrtko Ursulin GEM_WARN_ON currently has dangerous semantics where it is completely compiled out on !GEM_DEBUG builds. This can leave users who expect it to be more like a WARN_ON, just without a warning in non-debug builds, in complete ignorance. Another gotcha with it is that it cannot be used as a statement. Which is again different from a standard kernel WARN_ON. This patch fixes both problems by making it behave as one would expect. It can now be used both as an expression and as statement, and also the condition evaluates properly in all builds - code under the conditional will therefore not unexpectedly disappear. To satisfy call sites which really want the code under the conditional to completely disappear, we add GEM_DEBUG_WARN_ON and convert some of the callers to it. This one can also be used as both expression and statement. From the above it follows GEM_DEBUG_WARN_ON should be used in situations where we are certain the condition will be hit during development, but at a place in code where error can be handled to the benefit of not crashing the machine. GEM_WARN_ON on the other hand should be used where condition may happen in production and we just want to distinguish the level of debugging output emitted between the production and debug build. v2: * Dropped BUG_ON hunk. I have to agree with the need to "fix" GEM_WARN_ON() - ran into not realizing the difference to WARN_ON() myself. And while nobody likes multiplying variants of macros, I can't disagree with the need of having DEBUG-only version as well. Reviewed-by: Tomasz Lis Signed-off-by: Tvrtko Ursulin Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Cc: Mika Kuoppala Cc: Tomasz Lis --- drivers/gpu/drm/i915/i915_gem.h | 4 +++- drivers/gpu/drm/i915/i915_vma.c | 8 drivers/gpu/drm/i915/intel_engine_cs.c | 8 drivers/gpu/drm/i915/intel_lrc.c | 6 +++--- drivers/gpu/drm/i915/intel_workarounds.c | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h index 599c4f6eb1ea..b0e4b976880c 100644 --- a/drivers/gpu/drm/i915/i915_gem.h +++ b/drivers/gpu/drm/i915/i915_gem.h @@ -47,17 +47,19 @@ struct drm_i915_private; #define GEM_DEBUG_DECL(var) var #define GEM_DEBUG_EXEC(expr) expr #define GEM_DEBUG_BUG_ON(expr) GEM_BUG_ON(expr) +#define GEM_DEBUG_WARN_ON(expr) GEM_WARN_ON(expr) #else #define GEM_SHOW_DEBUG() (0) #define GEM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) -#define GEM_WARN_ON(expr) (BUILD_BUG_ON_INVALID(expr), 0) +#define GEM_WARN_ON(expr) ({ unlikely(!!(expr)); }) #define GEM_DEBUG_DECL(var) #define GEM_DEBUG_EXEC(expr) do { } while (0) #define GEM_DEBUG_BUG_ON(expr) +#define GEM_DEBUG_WARN_ON(expr) ({ BUILD_BUG_ON_INVALID(expr); 0; }) #endif #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 31efc971a3a8..82652c3d1bed 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -305,12 +305,12 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level, GEM_BUG_ON(!drm_mm_node_allocated(>node)); GEM_BUG_ON(vma->size > vma->node.size); - if (GEM_WARN_ON(range_overflows(vma->node.start, - vma->node.size, - vma->vm->total))) + if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start, + vma->node.size, + vma->vm->total))) return -ENODEV; - if (GEM_WARN_ON(!flags)) + if (GEM_DEBUG_WARN_ON(!flags)) return -EINVAL; bind_flags = 0; diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index f27dbe26bcc1..78e42c0825d2 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -273,13 +273,13 @@ intel_engine_setup(struct drm_i915_private *dev_priv, BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH)); BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH)); - if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS)) + if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS)) return -EINVAL; - if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) + if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) return -EINVAL; - if (GEM_WARN_ON(dev_priv->engine_class[info->class][info->instance])) + if (GEM_DEBUG_WARN_ON(dev_priv->engine_class[info->class][info->instance])) return -EINVAL; GEM_BUG_ON(dev_priv->engine[id]); @@ -402,7 +402,7 @@ int intel_engines_init(struct drm_i915_private *dev_priv) err = -EINVAL; err_id =
Re: [Intel-gfx] [RFC] drm/i915: GEM_WARN_ON considered harmful
So I understand we agree on the change, just waiting for non-RFC version? -Tomasz On 2018-09-24 11:34, Jani Nikula wrote: On Thu, 20 Sep 2018, Tvrtko Ursulin wrote: Ping! Any comments here? Main goal was to allow GEM_WARN_ON as a statement, plus also protect uses in if statements, which there are some who I think don't expect the branch to completely disappear. I've said before I don't like the conditional early returns vanishing depending on config options, but I've been shot down. I think this patch is an improvement. BR, Jani. Regards, Tvrtko On 07/09/2018 12:53, Tvrtko Ursulin wrote: From: Tvrtko Ursulin GEM_WARN_ON currently has dangerous semantics where it is completely compiled out on !GEM_DEBUG builds. This can leave users who expect it to be more like a WARN_ON, just without a warning in non-debug builds, in complete ignorance. Another gotcha with it is that it cannot be used as a statement. Which is again different from a standard kernel WARN_ON. This patch fixes both problems by making it behave as one would expect. It can now be used both as an expression and as statement, and also the condition evaluates properly in all builds - code under the conditional will therefore not unexpectedly disappear. To satisfy call sites which really want the code under the conditional to completely disappear, we add GEM_DEBUG_WARN_ON and convert some of the callers to it. This one can also be used as both expression and statement. From the above it follows GEM_DEBUG_WARN_ON should be used in situations where we are certain the condition will be hit during development, but at a place in code where error can be handled to the benefit of not crashing the machine. GEM_WARN_ON on the other hand should be used where condition may happen in production and we just want to distinguish the level of debugging output emitted between the production and debug build. Signed-off-by: Tvrtko Ursulin Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Cc: Mika Kuoppala --- Quickly put together and compile tested only! --- drivers/gpu/drm/i915/i915_gem.h | 4 +++- drivers/gpu/drm/i915/i915_vma.c | 8 drivers/gpu/drm/i915/intel_engine_cs.c | 8 drivers/gpu/drm/i915/intel_lrc.c | 8 drivers/gpu/drm/i915/intel_workarounds.c | 2 +- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h index 599c4f6eb1ea..b0e4b976880c 100644 --- a/drivers/gpu/drm/i915/i915_gem.h +++ b/drivers/gpu/drm/i915/i915_gem.h @@ -47,17 +47,19 @@ struct drm_i915_private; #define GEM_DEBUG_DECL(var) var #define GEM_DEBUG_EXEC(expr) expr #define GEM_DEBUG_BUG_ON(expr) GEM_BUG_ON(expr) +#define GEM_DEBUG_WARN_ON(expr) GEM_WARN_ON(expr) #else #define GEM_SHOW_DEBUG() (0) #define GEM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) -#define GEM_WARN_ON(expr) (BUILD_BUG_ON_INVALID(expr), 0) +#define GEM_WARN_ON(expr) ({ unlikely(!!(expr)); }) #define GEM_DEBUG_DECL(var) #define GEM_DEBUG_EXEC(expr) do { } while (0) #define GEM_DEBUG_BUG_ON(expr) +#define GEM_DEBUG_WARN_ON(expr) ({ BUILD_BUG_ON_INVALID(expr); 0; }) #endif #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 31efc971a3a8..82652c3d1bed 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -305,12 +305,12 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level, GEM_BUG_ON(!drm_mm_node_allocated(>node)); GEM_BUG_ON(vma->size > vma->node.size); - if (GEM_WARN_ON(range_overflows(vma->node.start, - vma->node.size, - vma->vm->total))) + if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start, + vma->node.size, + vma->vm->total))) return -ENODEV; - if (GEM_WARN_ON(!flags)) + if (GEM_DEBUG_WARN_ON(!flags)) return -EINVAL; bind_flags = 0; diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 10cd051ba29e..8dbdb18b2668 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -273,13 +273,13 @@ intel_engine_setup(struct drm_i915_private *dev_priv, BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH)); BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH)); - if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS)) + if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS)) return -EINVAL; - if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) + if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) return -EINVAL; - if
Re: [Intel-gfx] [PATCH 08/21] drm/i915/guc: Make use of the SW counter field in the context descriptor
On 2018-08-30 16:15, Lis, Tomasz wrote: On 2018-08-30 02:08, Lionel Landwerlin wrote: On 29/08/2018 20:16, Michal Wajdeczko wrote: The new context descriptor format contains two assignable fields: the SW Context ID (technically 11 bits, but practically limited to 2032 entries due to some being reserved for future use by the GuC) and the SW Counter (6 bits). We don't want to limit ourselves too much in the maximum number of concurrent contexts we want to allow, so ideally we want to employ every possible bit available. Unfortunately, a further limitation in the interface with the GuC means the combination of SW Context ID + SW Counter has to be unique within the same engine class (as we use the SW Context ID to index in the GuC stage descriptor pool, and the Engine Class + SW Counter to index in the 2-dimensional lrc array). This essentially means we need to somehow encode the engine instance. Since the BSpec allows 6 bits for engine instance, we use the whole SW counter for this task. If the limitation of 2032 maximum simultaneous contexts is too restrictive, we can always squeeze things a bit more (3 extras bits for hw_id, 3 bits for instance) and things will still work (Gen11 does not instance more than 8 engines of any class). Another alternative would be to generate the hw_id per HW context instead of per GEM context, but that has other problems (e.g. maximum number of user-created contexts would be variable, no relationship between a GuC principal descriptor and the proxy descriptor it uses, ...) Bspec: 12254 Signed-off-by: Oscar Mateo Signed-off-by: Rodrigo Vivi Signed-off-by: Michal Wajdeczko Cc: Joonas Lahtinen Cc: Daniele Ceraolo Spurio Cc: Michel Thierry Tested-by: Tomasz Lis --- drivers/gpu/drm/i915/i915_drv.h | 15 +++ drivers/gpu/drm/i915/i915_gem_context.c | 5 - drivers/gpu/drm/i915/i915_gem_context.h | 2 ++ drivers/gpu/drm/i915/i915_reg.h | 2 ++ drivers/gpu/drm/i915/intel_lrc.c | 12 +--- 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e5b9d3c..34f5495 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1866,14 +1866,21 @@ struct drm_i915_private { struct llist_head free_list; struct work_struct free_work; - /* The hw wants to have a stable context identifier for the + /* + * The HW wants to have a stable context identifier for the * lifetime of the context (for OA, PASID, faults, etc). * This is limited in execlists to 21 bits. + * In enhanced execlist (GEN11+) this is limited to 11 bits + * (the SW Context ID field) but GuC limits it a bit further + * (11 bits - 16) due to some entries being reserved for future + * use (so the firmware only supports a GuC stage descriptor + * pool of 2032 entries). */ struct ida hw_ida; -#define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */ -#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */ -#define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */ +#define MAX_CONTEXT_HW_ID (1 << 21) /* exclusive */ +#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID (1 << 11) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC (GEN11_MAX_CONTEXT_HW_ID - 16) } contexts; u32 fdi_rx_config; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index f15a039..e3b500c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -209,7 +209,10 @@ static int assign_hw_id(struct drm_i915_private *dev_priv, unsigned *out) unsigned int max; if (INTEL_GEN(dev_priv) >= 11) { - max = GEN11_MAX_CONTEXT_HW_ID; + if (USES_GUC_SUBMISSION(dev_priv)) + max = GEN11_MAX_CONTEXT_HW_ID_WITH_GUC; + else + max = GEN11_MAX_CONTEXT_HW_ID; } else { /* * When using GuC in proxy submission, GuC consumes the diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 851dad6..4b87f5d 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -154,6 +154,8 @@ struct i915_gem_context { struct intel_ring *ring; u32 *lrc_reg_state; u64 lrc_desc; + u32 sw_context_id; + u32 sw_counter; int pin_count; const struct intel_context_ops *ops; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index f232178..ea65d7b 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3900,6 +3900,8 @@ enum { #define GEN8_CTX_ID_WIDTH 21 #define GEN11_SW_CTX_ID_SHIFT 37 #d
Re: [Intel-gfx] [PATCH 06/21] drm/i915/guc: Use guc_class instead of engine_class in fw interface
Uhh, sorry - answered on wrong patch. Please ignore this one. -Tomasz On 2018-08-30 15:29, Lis, Tomasz wrote: On 2018-08-30 02:16, Lionel Landwerlin wrote: On 29/08/2018 20:58, Michel Thierry wrote: +Lionel (please see below as this touches the lrca format & relates to OA reporting too) On 8/29/2018 12:10 PM, Michal Wajdeczko wrote: Until now the GuC and HW engine class has been the same, which allowed us to use them interchangeable. But it is better to start doing the right thing and use the GuC definitions for the firmware interface. We also keep the same class id in the ctx descriptor to be able to have the same values in the driver and firmware logs. Signed-off-by: Michel Thierry Signed-off-by: Rodrigo Vivi Signed-off-by: Michal Wajdeczko Cc: Daniele Ceraolo Spurio Cc: Michel Thierry Cc: Lucas De Marchi Cc: Tomasz Lis Tested-by: Tomasz Lis --- drivers/gpu/drm/i915/intel_engine_cs.c | 13 + drivers/gpu/drm/i915/intel_guc_fwif.h | 7 +++ drivers/gpu/drm/i915/intel_lrc.c | 10 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 1a34e8f..bc81354 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -85,6 +85,7 @@ struct engine_info { unsigned int hw_id; unsigned int uabi_id; u8 class; + u8 guc_class; u8 instance; /* mmio bases table *must* be sorted in reverse gen order */ struct engine_mmio_base { @@ -98,6 +99,7 @@ struct engine_info { .hw_id = RCS_HW, .uabi_id = I915_EXEC_RENDER, .class = RENDER_CLASS, + .guc_class = GUC_RENDER_CLASS, .instance = 0, .mmio_bases = { { .gen = 1, .base = RENDER_RING_BASE } @@ -107,6 +109,7 @@ struct engine_info { .hw_id = BCS_HW, .uabi_id = I915_EXEC_BLT, .class = COPY_ENGINE_CLASS, + .guc_class = GUC_BLITTER_CLASS, .instance = 0, .mmio_bases = { { .gen = 6, .base = BLT_RING_BASE } @@ -116,6 +119,7 @@ struct engine_info { .hw_id = VCS_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 0, .mmio_bases = { { .gen = 11, .base = GEN11_BSD_RING_BASE }, @@ -127,6 +131,7 @@ struct engine_info { .hw_id = VCS2_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 1, .mmio_bases = { { .gen = 11, .base = GEN11_BSD2_RING_BASE }, @@ -137,6 +142,7 @@ struct engine_info { .hw_id = VCS3_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 2, .mmio_bases = { { .gen = 11, .base = GEN11_BSD3_RING_BASE } @@ -146,6 +152,7 @@ struct engine_info { .hw_id = VCS4_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 3, .mmio_bases = { { .gen = 11, .base = GEN11_BSD4_RING_BASE } @@ -155,6 +162,7 @@ struct engine_info { .hw_id = VECS_HW, .uabi_id = I915_EXEC_VEBOX, .class = VIDEO_ENHANCEMENT_CLASS, + .guc_class = GUC_VIDEOENHANCE_CLASS, .instance = 0, .mmio_bases = { { .gen = 11, .base = GEN11_VEBOX_RING_BASE }, @@ -165,6 +173,7 @@ struct engine_info { .hw_id = VECS2_HW, .uabi_id = I915_EXEC_VEBOX, .class = VIDEO_ENHANCEMENT_CLASS, + .guc_class = GUC_VIDEOENHANCE_CLASS, .instance = 1, .mmio_bases = { { .gen = 11, .base = GEN11_VEBOX2_RING_BASE } @@ -276,6 +285,9 @@ static void __sprint_engine_name(char *name, const struct engine_info *info) if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS)) return -EINVAL; + if (GEM_WARN_ON(info->guc_class >= GUC_MAX_ENGINE_CLASSES)) + return -EINVAL; + if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) return -EINVAL; @@ -291,6 +303,7 @@ static void __sprint_engine_name(char *name, const struct engine_info *info) engine->i915 = dev_priv; __sprint_engine_name(engine->name, info); engine->hw_id = engine->guc_id = info->hw_id; + engine->guc_class = info->guc_class; engine->mmio_base = __engine_mmio_base(dev_priv, info->mmio_bases); engine->class = info->class; engine->instance = info->instance; diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h index 963da91..5b7a05b 100644 --- a/drivers/gpu/drm/i915/intel_guc_fwif.
Re: [Intel-gfx] [PATCH 08/21] drm/i915/guc: Make use of the SW counter field in the context descriptor
On 2018-08-30 02:08, Lionel Landwerlin wrote: On 29/08/2018 20:16, Michal Wajdeczko wrote: The new context descriptor format contains two assignable fields: the SW Context ID (technically 11 bits, but practically limited to 2032 entries due to some being reserved for future use by the GuC) and the SW Counter (6 bits). We don't want to limit ourselves too much in the maximum number of concurrent contexts we want to allow, so ideally we want to employ every possible bit available. Unfortunately, a further limitation in the interface with the GuC means the combination of SW Context ID + SW Counter has to be unique within the same engine class (as we use the SW Context ID to index in the GuC stage descriptor pool, and the Engine Class + SW Counter to index in the 2-dimensional lrc array). This essentially means we need to somehow encode the engine instance. Since the BSpec allows 6 bits for engine instance, we use the whole SW counter for this task. If the limitation of 2032 maximum simultaneous contexts is too restrictive, we can always squeeze things a bit more (3 extras bits for hw_id, 3 bits for instance) and things will still work (Gen11 does not instance more than 8 engines of any class). Another alternative would be to generate the hw_id per HW context instead of per GEM context, but that has other problems (e.g. maximum number of user-created contexts would be variable, no relationship between a GuC principal descriptor and the proxy descriptor it uses, ...) Bspec: 12254 Signed-off-by: Oscar Mateo Signed-off-by: Rodrigo Vivi Signed-off-by: Michal Wajdeczko Cc: Joonas Lahtinen Cc: Daniele Ceraolo Spurio Cc: Michel Thierry Tested-by: Tomasz Lis --- drivers/gpu/drm/i915/i915_drv.h | 15 +++ drivers/gpu/drm/i915/i915_gem_context.c | 5 - drivers/gpu/drm/i915/i915_gem_context.h | 2 ++ drivers/gpu/drm/i915/i915_reg.h | 2 ++ drivers/gpu/drm/i915/intel_lrc.c | 12 +--- 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e5b9d3c..34f5495 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1866,14 +1866,21 @@ struct drm_i915_private { struct llist_head free_list; struct work_struct free_work; - /* The hw wants to have a stable context identifier for the + /* + * The HW wants to have a stable context identifier for the * lifetime of the context (for OA, PASID, faults, etc). * This is limited in execlists to 21 bits. + * In enhanced execlist (GEN11+) this is limited to 11 bits + * (the SW Context ID field) but GuC limits it a bit further + * (11 bits - 16) due to some entries being reserved for future + * use (so the firmware only supports a GuC stage descriptor + * pool of 2032 entries). */ struct ida hw_ida; -#define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */ -#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */ -#define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */ +#define MAX_CONTEXT_HW_ID (1 << 21) /* exclusive */ +#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID (1 << 11) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC (GEN11_MAX_CONTEXT_HW_ID - 16) } contexts; u32 fdi_rx_config; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index f15a039..e3b500c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -209,7 +209,10 @@ static int assign_hw_id(struct drm_i915_private *dev_priv, unsigned *out) unsigned int max; if (INTEL_GEN(dev_priv) >= 11) { - max = GEN11_MAX_CONTEXT_HW_ID; + if (USES_GUC_SUBMISSION(dev_priv)) + max = GEN11_MAX_CONTEXT_HW_ID_WITH_GUC; + else + max = GEN11_MAX_CONTEXT_HW_ID; } else { /* * When using GuC in proxy submission, GuC consumes the diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 851dad6..4b87f5d 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -154,6 +154,8 @@ struct i915_gem_context { struct intel_ring *ring; u32 *lrc_reg_state; u64 lrc_desc; + u32 sw_context_id; + u32 sw_counter; int pin_count; const struct intel_context_ops *ops; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index f232178..ea65d7b 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3900,6 +3900,8 @@ enum { #define GEN8_CTX_ID_WIDTH 21 #define GEN11_SW_CTX_ID_SHIFT 37 #define GEN11_SW_CTX_ID_WIDTH 11 +#define GEN11_SW_COUNTER_SHIFT 55 +#define GEN11_SW_COUNTER_WIDTH
Re: [Intel-gfx] [PATCH 06/21] drm/i915/guc: Use guc_class instead of engine_class in fw interface
On 2018-08-30 02:16, Lionel Landwerlin wrote: On 29/08/2018 20:58, Michel Thierry wrote: +Lionel (please see below as this touches the lrca format & relates to OA reporting too) On 8/29/2018 12:10 PM, Michal Wajdeczko wrote: Until now the GuC and HW engine class has been the same, which allowed us to use them interchangeable. But it is better to start doing the right thing and use the GuC definitions for the firmware interface. We also keep the same class id in the ctx descriptor to be able to have the same values in the driver and firmware logs. Signed-off-by: Michel Thierry Signed-off-by: Rodrigo Vivi Signed-off-by: Michal Wajdeczko Cc: Daniele Ceraolo Spurio Cc: Michel Thierry Cc: Lucas De Marchi Cc: Tomasz Lis Tested-by: Tomasz Lis --- drivers/gpu/drm/i915/intel_engine_cs.c | 13 + drivers/gpu/drm/i915/intel_guc_fwif.h | 7 +++ drivers/gpu/drm/i915/intel_lrc.c | 10 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 1a34e8f..bc81354 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -85,6 +85,7 @@ struct engine_info { unsigned int hw_id; unsigned int uabi_id; u8 class; + u8 guc_class; u8 instance; /* mmio bases table *must* be sorted in reverse gen order */ struct engine_mmio_base { @@ -98,6 +99,7 @@ struct engine_info { .hw_id = RCS_HW, .uabi_id = I915_EXEC_RENDER, .class = RENDER_CLASS, + .guc_class = GUC_RENDER_CLASS, .instance = 0, .mmio_bases = { { .gen = 1, .base = RENDER_RING_BASE } @@ -107,6 +109,7 @@ struct engine_info { .hw_id = BCS_HW, .uabi_id = I915_EXEC_BLT, .class = COPY_ENGINE_CLASS, + .guc_class = GUC_BLITTER_CLASS, .instance = 0, .mmio_bases = { { .gen = 6, .base = BLT_RING_BASE } @@ -116,6 +119,7 @@ struct engine_info { .hw_id = VCS_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 0, .mmio_bases = { { .gen = 11, .base = GEN11_BSD_RING_BASE }, @@ -127,6 +131,7 @@ struct engine_info { .hw_id = VCS2_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 1, .mmio_bases = { { .gen = 11, .base = GEN11_BSD2_RING_BASE }, @@ -137,6 +142,7 @@ struct engine_info { .hw_id = VCS3_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 2, .mmio_bases = { { .gen = 11, .base = GEN11_BSD3_RING_BASE } @@ -146,6 +152,7 @@ struct engine_info { .hw_id = VCS4_HW, .uabi_id = I915_EXEC_BSD, .class = VIDEO_DECODE_CLASS, + .guc_class = GUC_VIDEO_CLASS, .instance = 3, .mmio_bases = { { .gen = 11, .base = GEN11_BSD4_RING_BASE } @@ -155,6 +162,7 @@ struct engine_info { .hw_id = VECS_HW, .uabi_id = I915_EXEC_VEBOX, .class = VIDEO_ENHANCEMENT_CLASS, + .guc_class = GUC_VIDEOENHANCE_CLASS, .instance = 0, .mmio_bases = { { .gen = 11, .base = GEN11_VEBOX_RING_BASE }, @@ -165,6 +173,7 @@ struct engine_info { .hw_id = VECS2_HW, .uabi_id = I915_EXEC_VEBOX, .class = VIDEO_ENHANCEMENT_CLASS, + .guc_class = GUC_VIDEOENHANCE_CLASS, .instance = 1, .mmio_bases = { { .gen = 11, .base = GEN11_VEBOX2_RING_BASE } @@ -276,6 +285,9 @@ static void __sprint_engine_name(char *name, const struct engine_info *info) if (GEM_WARN_ON(info->class > MAX_ENGINE_CLASS)) return -EINVAL; + if (GEM_WARN_ON(info->guc_class >= GUC_MAX_ENGINE_CLASSES)) + return -EINVAL; + if (GEM_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) return -EINVAL; @@ -291,6 +303,7 @@ static void __sprint_engine_name(char *name, const struct engine_info *info) engine->i915 = dev_priv; __sprint_engine_name(engine->name, info); engine->hw_id = engine->guc_id = info->hw_id; + engine->guc_class = info->guc_class; engine->mmio_base = __engine_mmio_base(dev_priv, info->mmio_bases); engine->class = info->class; engine->instance = info->instance; diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h index 963da91..5b7a05b 100644 --- a/drivers/gpu/drm/i915/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/intel_guc_fwif.h @@ -39,6 +39,13 @@ #define GUC_VIDEO_ENGINE2 4 #define GUC_MAX_ENGINES_NUM (GUC_VIDEO_ENGINE2 + 1) +#define
Re: [Intel-gfx] [PATCH] drm/i915: Only force GGTT coherency w/a on required chipsets
On 2018-07-20 12:19, Chris Wilson wrote: Not all chipsets have an internal buffer delaying the visibility of writes via the GGTT being visible by other physical paths, but we use a very heavy workaround for all. We only need to apply that workarounds to the chipsets we know suffer from the delay and the resulting coherency issue. Similarly, the same inconsistent coherency fouls up our ABI promise that a write into a mmap_gtt is immediately visible to others. Since the HW has made that a lie, let userspace know when that contract is broken. (Not that userspace would want to use mmap_gtt on those chipsets for other performance reasons...) Testcase: igt/drv_selftest/live_coherency Testcase: igt/gem_mmap_gtt/coherency Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100587 Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Is there any mention of this bug/feature in bspec? Would be nice to have a reference. Reviewed-by: Tomasz Lis --- Resend with --function-context -Chris --- drivers/gpu/drm/i915/i915_drv.c | 3 +++ drivers/gpu/drm/i915/i915_gem.c | 5 + drivers/gpu/drm/i915/i915_pci.c | 10 ++ drivers/gpu/drm/i915/intel_device_info.h | 1 + include/uapi/drm/i915_drm.h | 22 ++ 5 files changed, 41 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index f8cfd16be534..18a45e7a3d7c 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -302,152 +302,155 @@ static void intel_detect_pch(struct drm_i915_private *dev_priv) static int i915_getparam_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_i915_private *dev_priv = to_i915(dev); struct pci_dev *pdev = dev_priv->drm.pdev; drm_i915_getparam_t *param = data; int value; switch (param->param) { case I915_PARAM_IRQ_ACTIVE: case I915_PARAM_ALLOW_BATCHBUFFER: case I915_PARAM_LAST_DISPATCH: case I915_PARAM_HAS_EXEC_CONSTANTS: /* Reject all old ums/dri params. */ return -ENODEV; case I915_PARAM_CHIPSET_ID: value = pdev->device; break; case I915_PARAM_REVISION: value = pdev->revision; break; case I915_PARAM_NUM_FENCES_AVAIL: value = dev_priv->num_fence_regs; break; case I915_PARAM_HAS_OVERLAY: value = dev_priv->overlay ? 1 : 0; break; case I915_PARAM_HAS_BSD: value = !!dev_priv->engine[VCS]; break; case I915_PARAM_HAS_BLT: value = !!dev_priv->engine[BCS]; break; case I915_PARAM_HAS_VEBOX: value = !!dev_priv->engine[VECS]; break; case I915_PARAM_HAS_BSD2: value = !!dev_priv->engine[VCS2]; break; case I915_PARAM_HAS_LLC: value = HAS_LLC(dev_priv); break; case I915_PARAM_HAS_WT: value = HAS_WT(dev_priv); break; case I915_PARAM_HAS_ALIASING_PPGTT: value = USES_PPGTT(dev_priv); break; case I915_PARAM_HAS_SEMAPHORES: value = HAS_LEGACY_SEMAPHORES(dev_priv); break; case I915_PARAM_HAS_SECURE_BATCHES: value = capable(CAP_SYS_ADMIN); break; case I915_PARAM_CMD_PARSER_VERSION: value = i915_cmd_parser_get_version(dev_priv); break; case I915_PARAM_SUBSLICE_TOTAL: value = sseu_subslice_total(_INFO(dev_priv)->sseu); if (!value) return -ENODEV; break; case I915_PARAM_EU_TOTAL: value = INTEL_INFO(dev_priv)->sseu.eu_total; if (!value) return -ENODEV; break; case I915_PARAM_HAS_GPU_RESET: value = i915_modparams.enable_hangcheck && intel_has_gpu_reset(dev_priv); if (value && intel_has_reset_engine(dev_priv)) value = 2; break; case I915_PARAM_HAS_RESOURCE_STREAMER: value = HAS_RESOURCE_STREAMER(dev_priv); break; case I915_PARAM_HAS_POOLED_EU: value = HAS_POOLED_EU(dev_priv); break; case I915_PARAM_MIN_EU_IN_POOL: value = INTEL_INFO(dev_priv)->sseu.min_eu_in_pool; break; case I915_PARAM_HUC_STATUS: value = intel_huc_check_status(_priv->huc); if (value < 0) return value; break; case I915_PARAM_MMAP_GTT_VERSION: /* Though we've
Re: [Intel-gfx] [PATCH v6] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-19 09:12, Joonas Lahtinen wrote: Quoting Lis, Tomasz (2018-07-18 18:28:32) On 2018-07-18 16:42, Tvrtko Ursulin wrote: On 18/07/2018 14:24, Joonas Lahtinen wrote: Quoting Tomasz Lis (2018-07-16 16:07:16) +++ b/include/uapi/drm/i915_drm.h @@ -1456,6 +1456,13 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ #define I915_CONTEXT_DEFAULT_PRIORITY 0 #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ +/* + * When data port level coherency is enabled, the GPU will update memory + * buffers shared with CPU, by forcing internal cache units to send memory + * writes to higher level caches faster. Enabling data port coherency has + * a performance cost. + */ I was under impression this is enabled by default and it can be disabled for a performance optimization? This is true, coherency is kept by default. We disable it as a workaround: performance-related for gen11, and due to minor hardware issue on previous platforms. See WaForceEnableNonCoherent. Ok, then you definitely want to rephrase the comment to bake that information in it. Now it sounds like it needs to be turned on to have coherency. I'm not sure if I understand what you're asking for. Should I emphasize that the feature is disabled unless the flag is set? This seem obvious... Or should I provide the reason why it is disabled on specific platforms? This should probably be done within workaround setup, not in user api definition. Or maybe it's enough to have it in Bspec? Bspec links are provided in the patch. Or should I just mention the workaround name? -Tomasz Regards, Joonas ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v6] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-18 16:42, Tvrtko Ursulin wrote: On 18/07/2018 14:24, Joonas Lahtinen wrote: Quoting Tomasz Lis (2018-07-16 16:07:16) +static int emit_set_data_port_coherency(struct i915_request *rq, bool enable) +{ + u32 *cs; + i915_reg_t reg; + + GEM_BUG_ON(rq->engine->class != RENDER_CLASS); + GEM_BUG_ON(INTEL_GEN(rq->i915) < 9); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (INTEL_GEN(rq->i915) >= 11) + reg = ICL_HDC_MODE; + else if (INTEL_GEN(rq->i915) >= 10) + reg = CNL_HDC_CHICKEN0; + else + reg = HDC_CHICKEN0; + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(reg); + /* Enabling coherency means disabling the bit which forces it off */ This comment is still spurious, please get rid of the habit of writing comments about "what" the code is doing, useful comments should be limited to "why", which is quite self explanatory here, that's the way the register is. Ok, I will read the related doc: https://www.kernel.org/doc/html/v4.10/process/coding-style.html#commenting +static int +intel_lr_context_update_data_port_coherency(struct i915_request *rq) +{ + struct i915_gem_context *ctx = rq->gem_context; + bool enable = test_bit(CONTEXT_DATA_PORT_COHERENT_REQUESTED, >flags); + int ret; + + lockdep_assert_held(>i915->drm.struct_mutex); + + if (test_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags) == enable) + return 0; + + ret = emit_set_data_port_coherency(rq, enable); + + if (!ret) { + if (enable) + __set_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags); + else + __clear_bit(CONTEXT_DATA_PORT_COHERENT_ACTIVE, >flags); + } Do we have indication that the hardware feature will be unreliable in responding to the requests? I don't think you need the differentiation of requested vs. active. If there is an error, we can just report back to the user as a failed IOCTL. Now it adds unnecessary complication for no benefit. Requested vs active is for implementing the lazy emit. AFAIR it does propagate the error out of execbuf (although we never ever expect it to happen), and this is just to keep the internal house-keeping in sync. Regards, Tvrtko @@ -2164,6 +2221,13 @@ static int gen8_emit_flush_render(struct i915_request *request, /* WaForGAMHang:kbl */ if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) dc_flush_wa = true; + + /* Emit the switch of data port coherency state if needed */ Ditto for spurious comment, just about what the code does. +++ b/include/uapi/drm/i915_drm.h @@ -1456,6 +1456,13 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ #define I915_CONTEXT_DEFAULT_PRIORITY 0 #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ +/* + * When data port level coherency is enabled, the GPU will update memory + * buffers shared with CPU, by forcing internal cache units to send memory + * writes to higher level caches faster. Enabling data port coherency has + * a performance cost. + */ I was under impression this is enabled by default and it can be disabled for a performance optimization? This is true, coherency is kept by default. We disable it as a workaround: performance-related for gen11, and due to minor hardware issue on previous platforms. See WaForceEnableNonCoherent. -Tomasz Regards, Joonas ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v5] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-13 12:40, Tvrtko Ursulin wrote: On 12/07/2018 16:10, Tomasz Lis wrote: The patch adds a parameter to control the data port coherency functionality on a per-context level. When the IOCTL is called, a command to switch data port coherency state is added to the ordered list. All prior requests are executed on old coherency settings, and all exec requests after the IOCTL will use new settings. Rationale: The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Below are answers to basic question explaining background of the functionality and reasoning for the proposed implementation: 1. Why do we need a coherency enable/disable switch for memory that is shared between CPU and GEN (GPU)? Memory coherency between CPU and GEN, while being a great feature that enables CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds overhead related to tracking (snooping) memory inside different cache units (L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require memory coherency between CPU and GPU). The goal of coherency enable/disable switch is to remove overhead of memory coherency when memory coherency is not needed. 2. Why do we need a global coherency switch? In order to support I/O commands from within EUs (Execution Units), Intel GEN ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions. These send instructions provide several addressing models. One of these addressing models (named "stateless") provides most flexible I/O using plain virtual addresses (as opposed to buffer_handle+offset models). This "stateless" model is similar to regular memory load/store operations available on typical CPUs. Since this model provides I/O using arbitrary virtual addresses, it enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer of pointers) concepts. For instance, it allows creating tree-like data structures such as: | NODE1 | | uint64_t data | +| | NODE* | NODE*| ++---+ / \ / \ | NODE2 | | NODE3 | | uint64_t data | | uint64_t data | +| +| | NODE* | NODE*| | NODE* | NODE*| ++---+ ++---+ Please note that pointers inside such structures can point to memory locations in different OCL allocations - e.g. NODE1 and NODE2 can reside in one OCL allocation while NODE3 resides in a completely separate OCL allocation. Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared Virtual Memory feature). Using pointers from different allocations doesn't affect the stateless addressing model which even allows scattered reading from different allocations at the same time (i.e. by utilizing SIMD-nature of send instructions). When it comes to coherency programming, send instructions in stateless model can be encoded (at ISA level) to either use or disable coherency. However, for generic OCL applications (such as example with tree-like data structure), OCL compiler is not able to determine origin of memory pointed to by an arbitrary pointer - i.e. is not able to track given pointer back to a specific allocation. As such, it's not able to decide whether coherency is needed or not for specific pointer (or for specific I/O instruction). As a result, compiler encodes all stateless sends as coherent (doing otherwise would lead to functional issues resulting from data corruption). Please note that it would be possible to workaround this (e.g. based on allocations map and pointer bounds checking prior to each I/O instruction) but the performance cost of such workaround would be many times greater than the cost of keeping coherency always enabled. As such, enabling/disabling memory coherency at GEN ISA level is not feasible and alternative method is needed. Such alternative solution is to have a global coherency switch that allows disabling coherency for single (though entire) GPU submission. This is beneficial because this way we: * can enable (and pay for) coherency only in submissions that actually need coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources) * don't care about coherency at GEN ISA granularity (no performance impact) 3. Will coherency switch be used frequently? There are scenarios that will require frequent toggling
Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-10 20:03, Lis, Tomasz wrote: On 2018-07-09 18:28, Tvrtko Ursulin wrote: On 09/07/2018 14:20, Tomasz Lis wrote: diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 1593194..f6965ae 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h [...] +/* + * When data port level coherency is enabled, the GPU will update memory + * buffers shared with CPU, by forcing internal cache units to send memory + * writes to real RAM faster. Keeping such coherency has performance cost. Is this comment correct? Is it actually sending memory writes to _RAM_, or just the coherency mode enabled, even if only targetting CPU or shared cache, which adds a cost? I'm not sure whether there are further coherency modes to choose how "deep" coherency goes. The use case of OCL Team is to see gradual changes in the buffers on CPU side while the execution progresses. Write to RAM is needed to achieve that. And that limits performance by using RAM bandwidth. It was pointed out to me that last level cache is shared between CPU and GPU on non-atoms. Which means my argument was invalid, an most likely the coherency option does not enforce RAM write. I will update the comment. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-09 18:28, Tvrtko Ursulin wrote: On 09/07/2018 14:20, Tomasz Lis wrote: The patch adds a parameter to control the data port coherency functionality on a per-context level. When the IOCTL is called, a command to switch data port coherency state is added to the ordered list. All prior requests are executed on old coherency settings, and all exec requests after the IOCTL will use new settings. Rationale: The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Below are answers to basic question explaining background of the functionality and reasoning for the proposed implementation: 1. Why do we need a coherency enable/disable switch for memory that is shared between CPU and GEN (GPU)? Memory coherency between CPU and GEN, while being a great feature that enables CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds overhead related to tracking (snooping) memory inside different cache units (L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require memory coherency between CPU and GPU). The goal of coherency enable/disable switch is to remove overhead of memory coherency when memory coherency is not needed. 2. Why do we need a global coherency switch? In order to support I/O commands from within EUs (Execution Units), Intel GEN ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions. These send instructions provide several addressing models. One of these addressing models (named "stateless") provides most flexible I/O using plain virtual addresses (as opposed to buffer_handle+offset models). This "stateless" model is similar to regular memory load/store operations available on typical CPUs. Since this model provides I/O using arbitrary virtual addresses, it enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer of pointers) concepts. For instance, it allows creating tree-like data structures such as: | NODE1 | | uint64_t data | +| | NODE* | NODE*| ++---+ / \ / \ | NODE2 | | NODE3 | | uint64_t data | | uint64_t data | +| +| | NODE* | NODE*| | NODE* | NODE*| ++---+ ++---+ Please note that pointers inside such structures can point to memory locations in different OCL allocations - e.g. NODE1 and NODE2 can reside in one OCL allocation while NODE3 resides in a completely separate OCL allocation. Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared Virtual Memory feature). Using pointers from different allocations doesn't affect the stateless addressing model which even allows scattered reading from different allocations at the same time (i.e. by utilizing SIMD-nature of send instructions). When it comes to coherency programming, send instructions in stateless model can be encoded (at ISA level) to either use or disable coherency. However, for generic OCL applications (such as example with tree-like data structure), OCL compiler is not able to determine origin of memory pointed to by an arbitrary pointer - i.e. is not able to track given pointer back to a specific allocation. As such, it's not able to decide whether coherency is needed or not for specific pointer (or for specific I/O instruction). As a result, compiler encodes all stateless sends as coherent (doing otherwise would lead to functional issues resulting from data corruption). Please note that it would be possible to workaround this (e.g. based on allocations map and pointer bounds checking prior to each I/O instruction) but the performance cost of such workaround would be many times greater than the cost of keeping coherency always enabled. As such, enabling/disabling memory coherency at GEN ISA level is not feasible and alternative method is needed. Such alternative solution is to have a global coherency switch that allows disabling coherency for single (though entire) GPU submission. This is beneficial because this way we: * can enable (and pay for) coherency only in submissions that actually need coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources) * don't care about coherency at GEN ISA granularity (no performance impact) 3. Will coherency switch be used frequently? There are scenarios that will require frequent toggling
Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-09 18:37, Chris Wilson wrote: Quoting Tvrtko Ursulin (2018-07-09 17:28:02) On 09/07/2018 14:20, Tomasz Lis wrote: +static int i915_gem_context_clear_data_port_coherent(struct i915_gem_context *ctx) +{ + int ret; + + ret = intel_lr_context_modify_data_port_coherency(ctx, false); + if (!ret) + __clear_bit(CONTEXT_DATA_PORT_COHERENT, >flags); + return ret; Is there a good reason you allow userspace to keep emitting unlimited number of commands which actually do not change the status? If not please consider gating the command emission with test_and_set_bit/test_and_clear_bit. Hm.. apart even with that they could keep toggling ad infinitum with no real work in between. Has it been considered to only save the desired state in set param and then emit it, if needed, before next execbuf? Minor thing in any case, just curious since I wasn't following the threads. The first patch tried to add a bit to execbuf, and having been mistakenly down that road before, we asked if there was any alternative. (Now if you've also been following execbuf3 conversations, having a packet for privileged LRI is definitely something we want.) Setting the value in the context register is precisely what we want to do, and trivially serialised with execbuf since we have to serialise reservation of ring space, i.e. the normal rules of request generation. (execbuf is just a client and nothing special). From that point of view, we only care about frequency, if it is very frequent it should be controlled by userspace inside the batch (but it can't due to there being dangerous bits inside the reg aiui). At the other end of the scale, is context_setparam for set-once. And there should be no inbetween as that requires costly batch flushes. -Chris Joonas did brought that concern in his review; here it is, with my response: On 2018-06-21 15:47, Lis, Tomasz wrote: On 2018-06-21 08:39, Joonas Lahtinen wrote: I'm thinking we should set this value when it has changed, when we insert the requests into the command stream. So if you change back and forth, while not emitting any requests, nothing really happens. If you change the value and emit a request, we should emit a LRI before the jump to the commands. Similary if you keep setting the value to the value it already was in, nothing will happen, again. When I considered that, my way of reasoning was: If we execute the flag changing buffer right away, it may be sent to hardware faster if there is no job in progress. If we use the lazy way, and trigger the change just before submission - there will be additional conditions in submission code, plus the change will be made when there is another job pending (though it's not a considerable payload to just switch a flag). If user space switches the flag back and forth without much sense, then there is something wrong with the user space driver, and it shouldn't be up to kernel to fix that. This is why I chosen the current approach. But I can change it if you wish. So while I think the current solution is better from performance standpoint, but I will change it if you request that. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-09 16:24, Lionel Landwerlin wrote: On 09/07/18 15:03, Lis, Tomasz wrote: On 2018-07-09 15:48, Lionel Landwerlin wrote: On 09/07/18 14:20, Tomasz Lis wrote: The patch adds a parameter to control the data port coherency functionality on a per-context level. When the IOCTL is called, a command to switch data port coherency state is added to the ordered list. All prior requests are executed on old coherency settings, and all exec requests after the IOCTL will use new settings. Rationale: The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Below are answers to basic question explaining background of the functionality and reasoning for the proposed implementation: 1. Why do we need a coherency enable/disable switch for memory that is shared between CPU and GEN (GPU)? Memory coherency between CPU and GEN, while being a great feature that enables CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds overhead related to tracking (snooping) memory inside different cache units (L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require memory coherency between CPU and GPU). The goal of coherency enable/disable switch is to remove overhead of memory coherency when memory coherency is not needed. 2. Why do we need a global coherency switch? In order to support I/O commands from within EUs (Execution Units), Intel GEN ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions. These send instructions provide several addressing models. One of these addressing models (named "stateless") provides most flexible I/O using plain virtual addresses (as opposed to buffer_handle+offset models). This "stateless" model is similar to regular memory load/store operations available on typical CPUs. Since this model provides I/O using arbitrary virtual addresses, it enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer of pointers) concepts. For instance, it allows creating tree-like data structures such as: | NODE1 | | uint64_t data | +| | NODE* | NODE*| ++---+ / \ / \ | NODE2 | | NODE3 | | uint64_t data | | uint64_t data | +| +| | NODE* | NODE*| | NODE* | NODE*| ++---+ ++---+ Please note that pointers inside such structures can point to memory locations in different OCL allocations - e.g. NODE1 and NODE2 can reside in one OCL allocation while NODE3 resides in a completely separate OCL allocation. Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared Virtual Memory feature). Using pointers from different allocations doesn't affect the stateless addressing model which even allows scattered reading from different allocations at the same time (i.e. by utilizing SIMD-nature of send instructions). When it comes to coherency programming, send instructions in stateless model can be encoded (at ISA level) to either use or disable coherency. However, for generic OCL applications (such as example with tree-like data structure), OCL compiler is not able to determine origin of memory pointed to by an arbitrary pointer - i.e. is not able to track given pointer back to a specific allocation. As such, it's not able to decide whether coherency is needed or not for specific pointer (or for specific I/O instruction). As a result, compiler encodes all stateless sends as coherent (doing otherwise would lead to functional issues resulting from data corruption). Please note that it would be possible to workaround this (e.g. based on allocations map and pointer bounds checking prior to each I/O instruction) but the performance cost of such workaround would be many times greater than the cost of keeping coherency always enabled. As such, enabling/disabling memory coherency at GEN ISA level is not feasible and alternative method is needed. Such alternative solution is to have a global coherency switch that allows disabling coherency for single (though entire) GPU submission. This is beneficial because this way we: * can enable (and pay for) coherency only in submissions that actually need coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources) * don't care about coherency at GEN ISA granularit
Re: [Intel-gfx] [PATCH v4] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-07-09 15:48, Lionel Landwerlin wrote: On 09/07/18 14:20, Tomasz Lis wrote: The patch adds a parameter to control the data port coherency functionality on a per-context level. When the IOCTL is called, a command to switch data port coherency state is added to the ordered list. All prior requests are executed on old coherency settings, and all exec requests after the IOCTL will use new settings. Rationale: The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Below are answers to basic question explaining background of the functionality and reasoning for the proposed implementation: 1. Why do we need a coherency enable/disable switch for memory that is shared between CPU and GEN (GPU)? Memory coherency between CPU and GEN, while being a great feature that enables CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds overhead related to tracking (snooping) memory inside different cache units (L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require memory coherency between CPU and GPU). The goal of coherency enable/disable switch is to remove overhead of memory coherency when memory coherency is not needed. 2. Why do we need a global coherency switch? In order to support I/O commands from within EUs (Execution Units), Intel GEN ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions. These send instructions provide several addressing models. One of these addressing models (named "stateless") provides most flexible I/O using plain virtual addresses (as opposed to buffer_handle+offset models). This "stateless" model is similar to regular memory load/store operations available on typical CPUs. Since this model provides I/O using arbitrary virtual addresses, it enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer of pointers) concepts. For instance, it allows creating tree-like data structures such as: | NODE1 | | uint64_t data | +| | NODE* | NODE*| ++---+ / \ / \ | NODE2 | | NODE3 | | uint64_t data | | uint64_t data | +| +| | NODE* | NODE*| | NODE* | NODE*| ++---+ ++---+ Please note that pointers inside such structures can point to memory locations in different OCL allocations - e.g. NODE1 and NODE2 can reside in one OCL allocation while NODE3 resides in a completely separate OCL allocation. Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared Virtual Memory feature). Using pointers from different allocations doesn't affect the stateless addressing model which even allows scattered reading from different allocations at the same time (i.e. by utilizing SIMD-nature of send instructions). When it comes to coherency programming, send instructions in stateless model can be encoded (at ISA level) to either use or disable coherency. However, for generic OCL applications (such as example with tree-like data structure), OCL compiler is not able to determine origin of memory pointed to by an arbitrary pointer - i.e. is not able to track given pointer back to a specific allocation. As such, it's not able to decide whether coherency is needed or not for specific pointer (or for specific I/O instruction). As a result, compiler encodes all stateless sends as coherent (doing otherwise would lead to functional issues resulting from data corruption). Please note that it would be possible to workaround this (e.g. based on allocations map and pointer bounds checking prior to each I/O instruction) but the performance cost of such workaround would be many times greater than the cost of keeping coherency always enabled. As such, enabling/disabling memory coherency at GEN ISA level is not feasible and alternative method is needed. Such alternative solution is to have a global coherency switch that allows disabling coherency for single (though entire) GPU submission. This is beneficial because this way we: * can enable (and pay for) coherency only in submissions that actually need coherency (submissions that use CL_MEM_SVM_FINE_GRAIN_BUFFER resources) * don't care about coherency at GEN ISA granularity (no performance impact) 3. Will coherency switch be used frequently? There are scenarios that will require frequent toggling
Re: [Intel-gfx] [PATCH v4] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-06-11 18:37, Daniele Ceraolo Spurio wrote: On 25/05/18 11:26, Tomasz Lis wrote: The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed conyext state tonot assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. v4: Simplified needs_preempt_context() change. (Daniele) Removed clearing HWACK flag in idle-to-idle preempt. (Daniele) Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Daniele Ceraolo Spurio Cc: Michal Winiarski Cc: Mika Kuoppala Bspec: 18922 Signed-off-by: Tomasz Lis --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem_context.c | 3 +- drivers/gpu/drm/i915/i915_pci.c | 3 +- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 113 +-- drivers/gpu/drm/i915/intel_lrc.h | 1 + 6 files changed, 86 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 487922f..35eddf7 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2534,6 +2534,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 45393f6..341a5ff 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -455,7 +455,8 @@ destroy_kernel_context(struct i915_gem_context **ctxp) static bool needs_preempt_context(struct drm_i915_private *i915) { - return HAS_LOGICAL_RING_PREEMPTION(i915); + return HAS_LOGICAL_RING_PREEMPTION(i915) && + !HAS_HW_PREEMPT_TO_IDLE(i915); } int i915_gem_contexts_init(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 97a91e6a..ee09926 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -593,7 +593,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 8a6058b..f95cb37 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -154,6 +154,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -522,31 +523,46 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq) static void inject_preempt_context(struct intel_engine_cs *engine) continuing the discussion from the previous patch, I still think that we should rename this function now that it doesn't inject a context on some gens. A new function name should be relatively trivial to handle from other patch series hitting the area (compared to having a second function). Ok, will rename it then.
Re: [Intel-gfx] [PATCH v1] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-06-21 08:39, Joonas Lahtinen wrote: Changelog would be much appreciated. And this is not the first version of the series. It helps to remind the reviewer that original implementation was changed into IOCTl based on feedback. Please see the git log in i915 for some examples. Will add. I considered this a separate series, as it is a different implementation. Quoting Tomasz Lis (2018-06-20 18:03:07) The patch adds a parameter to control the data port coherency functionality on a per-context level. When the IOCTL is called, a command to switch data port coherency state is added to the ordered list. All prior requests are executed on old coherency settings, and all exec requests after the IOCTL will use new settings. Rationale: The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Below are answers to basic question explaining background of the functionality and reasoning for the proposed implementation: 1. Why do we need a coherency enable/disable switch for memory that is shared between CPU and GEN (GPU)? Memory coherency between CPU and GEN, while being a great feature that enables CL_MEM_SVM_FINE_GRAIN_BUFFER OCL capability on Intel GEN architecture, adds overhead related to tracking (snooping) memory inside different cache units (L1$, L2$, L3$, LLC$, etc.). At the same time, minority of modern OCL applications actually use CL_MEM_SVM_FINE_GRAIN_BUFFER (and hence require memory coherency between CPU and GPU). The goal of coherency enable/disable switch is to remove overhead of memory coherency when memory coherency is not needed. 2. Why do we need a global coherency switch? In order to support I/O commands from within EUs (Execution Units), Intel GEN ISA (GEN Instruction Set Assembly) contains dedicated "send" instructions. These send instructions provide several addressing models. One of these addressing models (named "stateless") provides most flexible I/O using plain virtual addresses (as opposed to buffer_handle+offset models). This "stateless" model is similar to regular memory load/store operations available on typical CPUs. Since this model provides I/O using arbitrary virtual addresses, it enables algorithmic designs that are based on pointer-to-pointer (e.g. buffer of pointers) concepts. For instance, it allows creating tree-like data structures such as: | NODE1 | | uint64_t data | +| | NODE* | NODE*| ++---+ / \ /\ | NODE2 || NODE3 | | uint64_t data || uint64_t data | +|+| | NODE* | NODE*|| NODE* | NODE*| ++---+++---+ Please note that pointers inside such structures can point to memory locations in different OCL allocations - e.g. NODE1 and NODE2 can reside in one OCL allocation while NODE3 resides in a completely separate OCL allocation. Additionally, such pointers can be shared with CPU (i.e. using SVM - Shared Virtual Memory feature). Using pointers from different allocations doesn't affect the stateless addressing model which even allows scattered reading from different allocations at the same time (i.e. by utilizing SIMD-nature of send instructions). When it comes to coherency programming, send instructions in stateless model can be encoded (at ISA level) to either use or disable coherency. However, for generic OCL applications (such as example with tree-like data structure), OCL compiler is not able to determine origin of memory pointed to by an arbitrary pointer - i.e. is not able to track given pointer back to a specific allocation. As such, it's not able to decide whether coherency is needed or not for specific pointer (or for specific I/O instruction). As a result, compiler encodes all stateless sends as coherent (doing otherwise would lead to functional issues resulting from data corruption). Please note that it would be possible to workaround this (e.g. based on allocations map and pointer bounds checking prior to each I/O instruction) but the performance cost of such workaround would be many times greater than the cost of keeping coherency always enabled. As such, enabling/disabling memory coherency at GEN ISA level is not feasible and alternative method is needed. Such alternative solution is to have a global coherency switch that allows disabling coherency for single (though entire) GPU submission. This is beneficial because this way we: * can enable (and pay for) coherency only in
Re: [Intel-gfx] [PATCH v1] drm/i915: Add IOCTL Param to control data port coherency.
On 2018-06-21 09:05, Chris Wilson wrote: Quoting Tomasz Lis (2018-06-20 16:03:07) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 33bc914..c69dc26 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -258,6 +258,57 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, ce->lrc_desc = desc; } +static int emit_set_data_port_coherency(struct i915_request *req, bool enable) +{ + u32 *cs; + i915_reg_t reg; + + GEM_BUG_ON(req->engine->class != RENDER_CLASS); + GEM_BUG_ON(INTEL_GEN(req->i915) < 9); + + cs = intel_ring_begin(req, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (INTEL_GEN(req->i915) >= 10) + reg = CNL_HDC_CHICKEN0; + else + reg = HDC_CHICKEN0; + + /* FIXME: this feature may be unuseable on CNL; If this checks to be +* true, we should enodev for CNL. */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(reg); + /* Enabling coherency means disabling the bit which forces it off */ + if (enable) + *cs++ = _MASKED_BIT_DISABLE(HDC_FORCE_NON_COHERENT); + else + *cs++ = _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT); + *cs++ = MI_NOOP; + + intel_ring_advance(req, cs); + + return 0; +} There's nothing specific to the logical ringbuffer context here afaics. It could have just been done inside the single i915_gem_context_set_data_port_coherency(). Also makes it clearer that i915_gem_context_set_data_port_coherency needs struct_mutex. cmd = HDC_FORCE_NON_COHERENT << 16; if (!coherent) cmd |= HDC_FORCE_NON_COHERENT; *cs++ = cmd; Does that read any clearer? Sorry, I don't think I follow. Should I move the code out of logical ringbuffer context (intel_lrc.c)? Should I merge the emit_set_data_port_coherency() with intel_lr_context_modify_data_port_coherency()? Should I lock a mutex while adding the request? -Tomasz diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 1593194..214e291 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -104,4 +104,8 @@ struct i915_gem_context; void intel_lr_context_resume(struct drm_i915_private *dev_priv); +int +intel_lr_context_modify_data_port_coherency(struct i915_gem_context *ctx, +bool enable); + #endif /* _INTEL_LRC_H_ */ diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7f5634c..fab072f 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1453,6 +1453,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE0x4 #define I915_CONTEXT_PARAM_BANNABLE0x5 #define I915_CONTEXT_PARAM_PRIORITY0x6 +#define I915_CONTEXT_PARAM_COHERENCY 0x7 DATAPORT_COHERENCY There are many different caches. There should be some commentary around here telling userspace what the contract is. Will do. #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ #define I915_CONTEXT_DEFAULT_PRIORITY0 #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ COHERENCY has MAX/MIN_USER_PRIORITY, interesting. I thought it was just a boolean. -Chris I did not noticed the structure of defines here; will move the new define. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v3] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-05-22 16:39, Ceraolo Spurio, Daniele wrote: On 5/21/2018 3:16 AM, Lis, Tomasz wrote: On 2018-05-18 23:08, Daniele Ceraolo Spurio wrote: On 11/05/18 08:45, Tomasz Lis wrote: The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed context state to not assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem_context.c | 5 +- drivers/gpu/drm/i915/i915_pci.c | 3 +- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 115 ++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 6 files changed, 92 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 57fb3aa..6e9647b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2535,6 +2535,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 33f8a4b..bdac129 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -454,7 +454,10 @@ destroy_kernel_context(struct i915_gem_context **ctxp) static bool needs_preempt_context(struct drm_i915_private *i915) { - return HAS_LOGICAL_RING_PREEMPTION(i915); + return HAS_LOGICAL_RING_PREEMPTION(i915) && + (!HAS_HW_PREEMPT_TO_IDLE(i915) || + (HAS_HW_PREEMPT_TO_IDLE(i915) && + !USES_GUC_SUBMISSION(i915))); Why do we keep the preempt context for !USES_GUC_SUBMISSION(i915) even if HAS_HW_PREEMPT_TO_IDLE(i915)? After this patch we shouldn't need it anymore, right? The patch only provides gen11 way for the non-GuC submission. This is why the condition is so convoluted - preempt_context is still needed if we use GuC. This will be simplified after GuC paches are added. mmm I think this check is the other way around because it returns true when HAS_HW_PREEMPT_TO_IDLE for !USES_GUC_SUBMISSION, so when GuC is not in use. Yes, agreed. USES_GUC_SUBMISSION should not be negated. BTW, GuC does not support using the preempt context on platforms that have HW supported preempt-to-idle, so there is no need to keep the preempt context around for GuC. Oh, I did not knew that. So the preemption is completely disabled on gen11 with GuC then? (because patches for gen11 preempt-to-idle are not upstreamed)? } int i915_gem_contexts_init(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
Re: [Intel-gfx] [PATCH v3] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-05-18 23:08, Daniele Ceraolo Spurio wrote: On 11/05/18 08:45, Tomasz Lis wrote: The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. v2: Added needs_preempt_context() change so that it is not created when preempt-to-idle is supported. (Chris) Updated setting HWACK flag so that it is cleared after preempt-to-dle. (Chris, Daniele) Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris) v3: Fixed needs_preempt_context() change. (Chris) Merged preemption trigger functions to one. (Chris) Fixed context state to not assume COMPLETED_MASK after preemption, since idle-to-idle case will not have it set. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem_context.c | 5 +- drivers/gpu/drm/i915/i915_pci.c | 3 +- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 115 ++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 6 files changed, 92 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 57fb3aa..6e9647b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2535,6 +2535,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 33f8a4b..bdac129 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -454,7 +454,10 @@ destroy_kernel_context(struct i915_gem_context **ctxp) static bool needs_preempt_context(struct drm_i915_private *i915) { - return HAS_LOGICAL_RING_PREEMPTION(i915); + return HAS_LOGICAL_RING_PREEMPTION(i915) && + (!HAS_HW_PREEMPT_TO_IDLE(i915) || + (HAS_HW_PREEMPT_TO_IDLE(i915) && + !USES_GUC_SUBMISSION(i915))); Why do we keep the preempt context for !USES_GUC_SUBMISSION(i915) even if HAS_HW_PREEMPT_TO_IDLE(i915)? After this patch we shouldn't need it anymore, right? The patch only provides gen11 way for the non-GuC submission. This is why the condition is so convoluted - preempt_context is still needed if we use GuC. This will be simplified after GuC paches are added. } int i915_gem_contexts_init(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 29dcf34..8fe6795 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -154,6 +154,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -526,31 +527,49 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq) static void inject_preempt_context(struct intel_engine_cs *engine) For gen11+ we don't
Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-03-30 21:45, Daniele Ceraolo Spurio wrote: On 30/03/18 08:42, Lis, Tomasz wrote: On 2018-03-29 00:28, Chris Wilson wrote: Quoting Lis, Tomasz (2018-03-28 17:06:58) On 2018-03-28 01:27, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-27 16:17:59) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_pci.c | 3 ++- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 45 +++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 800230b..c32580b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ba7f783..1a22de4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -153,6 +153,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -183,7 +184,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, const struct i915_request *last, int prio) { - return engine->i915->preempt_context && prio > max(rq_prio(last), 0); + return (engine->i915->preempt_context || + HAS_HW_PREEMPT_TO_IDLE(engine->i915)) && Well, you haven't actually disabled allocating the preempt_context so... Yes.. I had mixed feelings about changing needs_preempt_context() now, as that would mean adding a temporary condition on GuC until the GuC preemption is merged. I will add the conditions and disable the allocation in v2 of the patch. But at any rate, making this an engine->flag would eliminate one pointer dance. Could be an interesting idea for a separate patch. To land first ;) :) Sure, I can do that. + prio > max(rq_prio(last), 0); } /** @@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine) execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT); } +static void gen11_preempt_to_idle(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = >execlists; + + GEM_TRACE("%s\n", engine->name); + + /* + * hardware which HAS_HW_PREEMPT_TO_IDLE(), always also + * HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set + */ + GEM_BUG_ON(execlists->ctrl_reg != NULL); + + /* trigger preemption to idle */ + w
Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-03-30 20:23, Daniele Ceraolo Spurio wrote: On 27/03/18 16:27, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-27 16:17:59) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_pci.c | 3 ++- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 45 +++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 800230b..c32580b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ba7f783..1a22de4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -153,6 +153,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -183,7 +184,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, const struct i915_request *last, int prio) { - return engine->i915->preempt_context && prio > max(rq_prio(last), 0); + return (engine->i915->preempt_context || + HAS_HW_PREEMPT_TO_IDLE(engine->i915)) && Well, you haven't actually disabled allocating the preempt_context so... But at any rate, making this an engine->flag would eliminate one pointer dance. Can't we re-use I915_SCHEDULER_CAP_PREEMPTION in engine->i915->caps.scheduler? That btw like here to be set if i915->preempt_context || HAS_HW_PREEMPT_TO_IDLE(i915) The engine->flag which Chris introduced is now used to set I915_SCHEDULER_CAP_PREEMPTION. + prio > max(rq_prio(last), 0); } /** @@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine) execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT); } +static void gen11_preempt_to_idle(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = >execlists; + + GEM_TRACE("%s\n", engine->name); + + /* + * hardware which HAS_HW_PREEMPT_TO_IDLE(), always also + * HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set + */ + GEM_BUG_ON(execlists->ctrl_reg != NULL); Shouldn't this check be the other way around? Wow. I have no idea how I was able to test this patch and not trigger this. You are right. + + /* trigger preemption to idle */ + writel
Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-03-29 00:28, Chris Wilson wrote: Quoting Lis, Tomasz (2018-03-28 17:06:58) On 2018-03-28 01:27, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-27 16:17:59) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_pci.c | 3 ++- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 45 +++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 800230b..c32580b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ba7f783..1a22de4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -153,6 +153,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE(1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -183,7 +184,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, const struct i915_request *last, int prio) { - return engine->i915->preempt_context && prio > max(rq_prio(last), 0); + return (engine->i915->preempt_context || + HAS_HW_PREEMPT_TO_IDLE(engine->i915)) && Well, you haven't actually disabled allocating the preempt_context so... Yes.. I had mixed feelings about changing needs_preempt_context() now, as that would mean adding a temporary condition on GuC until the GuC preemption is merged. I will add the conditions and disable the allocation in v2 of the patch. But at any rate, making this an engine->flag would eliminate one pointer dance. Could be an interesting idea for a separate patch. To land first ;) :) Sure, I can do that. +prio > max(rq_prio(last), 0); } /** @@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine) execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT); } +static void gen11_preempt_to_idle(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = >execlists; + + GEM_TRACE("%s\n", engine->name); + + /* +* hardware which HAS_HW_PREEMPT_TO_IDLE(), always also +* HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set +*/ + GEM_BUG_ON(execlists->ctrl_reg != NULL); + + /* trigger preemption to idle */ + writel(EL_CTRL_PR
Re: [Intel-gfx] [PATCH v1] drm/i915/gen11: Preempt-to-idle support in execlists.
On 2018-03-28 01:27, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-27 16:17:59) The patch adds support of preempt-to-idle requesting by setting a proper bit within Execlist Control Register, and receiving preemption result from Context Status Buffer. Preemption in previous gens required a special batch buffer to be executed, so the Command Streamer never preempted to idle directly. In Icelake it is possible, as there is a hardware mechanism to inform the kernel about status of the preemption request. This patch does not cover using the new preemption mechanism when GuC is active. Bspec: 18922 Signed-off-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_pci.c | 3 ++- drivers/gpu/drm/i915/intel_device_info.h | 1 + drivers/gpu/drm/i915/intel_lrc.c | 45 +++- drivers/gpu/drm/i915/intel_lrc.h | 1 + 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 800230b..c32580b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2514,6 +2514,8 @@ intel_info(const struct drm_i915_private *dev_priv) ((dev_priv)->info.has_logical_ring_elsq) #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \ ((dev_priv)->info.has_logical_ring_preemption) +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \ + ((dev_priv)->info.has_hw_preempt_to_idle) #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 4364922..66b6700 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -595,7 +595,8 @@ static const struct intel_device_info intel_cannonlake_info = { GEN(11), \ .ddb_size = 2048, \ .has_csr = 0, \ - .has_logical_ring_elsq = 1 + .has_logical_ring_elsq = 1, \ + .has_hw_preempt_to_idle = 1 static const struct intel_device_info intel_icelake_11_info = { GEN11_FEATURES, diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index 933e316..4eb97b5 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -98,6 +98,7 @@ enum intel_platform { func(has_logical_ring_contexts); \ func(has_logical_ring_elsq); \ func(has_logical_ring_preemption); \ + func(has_hw_preempt_to_idle); \ func(has_overlay); \ func(has_pooled_eu); \ func(has_psr); \ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ba7f783..1a22de4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -153,6 +153,7 @@ #define GEN8_CTX_STATUS_ACTIVE_IDLE(1 << 3) #define GEN8_CTX_STATUS_COMPLETE (1 << 4) #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) +#define GEN11_CTX_STATUS_PREEMPT_IDLE (1 << 29) #define GEN8_CTX_STATUS_COMPLETED_MASK \ (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) @@ -183,7 +184,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, const struct i915_request *last, int prio) { - return engine->i915->preempt_context && prio > max(rq_prio(last), 0); + return (engine->i915->preempt_context || + HAS_HW_PREEMPT_TO_IDLE(engine->i915)) && Well, you haven't actually disabled allocating the preempt_context so... Yes.. I had mixed feelings about changing needs_preempt_context() now, as that would mean adding a temporary condition on GuC until the GuC preemption is merged. I will add the conditions and disable the allocation in v2 of the patch. But at any rate, making this an engine->flag would eliminate one pointer dance. Could be an interesting idea for a separate patch. +prio > max(rq_prio(last), 0); } /** @@ -535,6 +538,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine) execlists_set_active(>execlists, EXECLISTS_ACTIVE_PREEMPT); } +static void gen11_preempt_to_idle(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = >execlists; + + GEM_TRACE("%s\n", engine->name); + + /* +* hardware which HAS_HW_PREEMPT_TO_IDLE(), always also +* HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set +*/ + GEM_BUG_ON(execlists->ctrl_reg != NULL); + + /* trigger preemption to idle */ + writel(EL_CTRL_PREEMPT_TO_IDLE, execlists->ctrl_reg); Future plans? Because just inserting the branch into the setter of inject_preempt_context() resolves a lot of conflicts with other work. M
Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.
On 2018-03-21 20:42, Oscar Mateo wrote: On 3/21/2018 3:16 AM, Chris Wilson wrote: Quoting Oscar Mateo (2018-03-20 18:43:45) On 3/19/2018 7:14 AM, Lis, Tomasz wrote: On 2018-03-19 13:43, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-19 12:37:35) The patch adds a parameter to control the data port coherency functionality on a per-exec call basis. When data port coherency flag value is different than what it was in previous call for the context, a command to switch data port coherency state is added before the buffer to be executed. So this is part of the context? Why do it at exec level? It is part of the context, stored within HDC chicken bit register. The exec level was requested by the OCL team, due to concerns about performance cost of context setparam calls. If exec level is desired, why not whitelist it? -Chris If we have no issue in whitelisting the register, I'm sure OCL will agree to that. I assumed the whitelisting will be unacceptable because of security concerns with some options. The register also changes its position and content between gens, which makes whitelisting hard to manage. I think a security analysis of this register was already done, and the result was that it contains some other bits that could be dangerous. In CNL those bits were moved out of the way and the HDC_CHICKEN0 register can be whitelisted (WaAllowUMDToControlCoherency). In ICL the register should already be non-privileged. The previous alternative to whitelisting was running through a command parser for validation. That's a very general mechanism suitable for a wide range of sins. -Chris Are you suggesting that we enable the cmd parser for every Gen < CNL for this particular usage only? :P It is a solution that would allow to do what we want without any additions to module interface. It may be worth considering if we think the coherency setting will be temporary and removed in future gens, as we wouldn't want to have obsolete flags. I think the setting will stay with us, as it is needed to support CL_MEM_SVM_FINE_GRAIN_BUFFER flag from OpenCL spec. Keeping coherency will always cost performance, so we will likely always have a hardware setting to switch it. But the bspec says coherency override control will be removed in future projects... ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.
On 2018-03-19 15:26, Chris Wilson wrote: Quoting Lis, Tomasz (2018-03-19 14:14:19) On 2018-03-19 13:43, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-19 12:37:35) The patch adds a parameter to control the data port coherency functionality on a per-exec call basis. When data port coherency flag value is different than what it was in previous call for the context, a command to switch data port coherency state is added before the buffer to be executed. So this is part of the context? Why do it at exec level? It is part of the context, stored within HDC chicken bit register. The exec level was requested by the OCL team, due to concerns about performance cost of context setparam calls. What? Oh dear, oh dear, thrice oh dear. The context setparam would look like: if (arg != context->value) { rq = request_alloc(context, RCS); cs = ring_begin(rq, 4); cs++ = MI_LRI; cs++ = reg; cs++ = magic; cs++ = MI_NOOP; request_add(rq); context->value = arg } The argument is whether stuffing it into a crowded, v.frequently executed execbuf is better than an irregular setparam. If they want to flip it on every batch, use execbuf. If it's going to be very infrequent, setparam. Implementing the data port coherency switch as context setparam would not be a problem, I agree. But this is not a solution OCL is willing to accept. Any additional IOCTL call is a concern for the OCL developers. For more explanation on switch frequency - please look at the cover letter I provided; here's the related part of it: (note: the data port coherency is called fine grain coherency within UMD) 3. Will coherency switch be used frequently? There are scenarios that will require frequent toggling of the coherency switch. E.g. an application has two OCL compute kernels: kern_master and kern_worker. kern_master uses, concurrently with CPU, some fine grain SVM resources (CL_MEM_SVM_FINE_GRAIN_BUFFER). These resources contain descriptors of computational work that needs to be executed. kern_master analyzes incoming work descriptors and populates a plain OCL buffer (non-fine-grain) with payload for kern_worker. Once kern_master is done, kern_worker kicks-in and processes the payload that kern_master produced. These two kernels work in a loop, one after another. Since only kern_master requires coherency, kern_worker should not be forced to pay for it. This means that we need to have the ability to toggle coherency switch on or off per each GPU submission: (ENABLE COHERENCY) kern_master -> (DISABLE COHERENCY)kern_worker -> (ENABLE COHERENCY) kern_master -> (DISABLE COHERENCY)kern_worker -> ... That discussion must be part of the rationale in the commitlog. Will add. Should I place the whole text from cover letter within the commit comment? Otoh, execbuf3 would accept it as a command packet. Hmm. I know we have execbuf2, but execbuf3? Are you proposing to add something like that? If exec level is desired, why not whitelist it? If we have no issue in whitelisting the register, I'm sure OCL will agree to that. I assumed the whitelisting will be unacceptable because of security concerns with some options. The register also changes its position and content between gens, which makes whitelisting hard to manage. Main purpose of chicken bit registers, in general, is to allow work around for hardware features which could be buggy or could have unintended influence on the platform. The data port coherency functionality landed there for the same reasons; then it twisted itself in a way that we now need user space to switch it. Is it really ok to whitelist chicken bit registers? It all depends on whether it breaks segregation. If the only users affected are themselves, fine. Otherwise, no. -Chris Chicken Bit registers are definitely not planned as safe for use. While meaning of bits within HDC_CHICKEN0 change between gens, I doubt any of the registers *can't* be used to cause GPU hung. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC v1] Data port coherency control for UMDs.
On 2018-03-19 14:53, Joonas Lahtinen wrote: + Dave, as FYI Quoting Tomasz Lis (2018-03-19 14:37:34) The OpenCL driver develpers requested a functionality to control cache coherency at data port level. Keeping the coherency at that level is disabled by default due to its performance costs. OpenCL driver is planning to enable it for a small subset of submissions, when such functionality is required. Can you please link to the corresponding OpenCL driver changes? I'm assuming this relates to the new-driver-to-be-adopted, instead of Beignet? It is for the new driver; I will ask the OCL developers to provide a link. How is the story/schedule looking for adopting the new driver to distros? I guess that's another question for OCL guys, I don't know. Seeing the userspace counterpart and tests will help in assessing the suggested changes. Regards, Joonas I prepared an IGT test for that, I will send it to a proper mailing list soon. -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC v1] drm/i915: Add Exec param to control data port coherency.
On 2018-03-19 13:43, Chris Wilson wrote: Quoting Tomasz Lis (2018-03-19 12:37:35) The patch adds a parameter to control the data port coherency functionality on a per-exec call basis. When data port coherency flag value is different than what it was in previous call for the context, a command to switch data port coherency state is added before the buffer to be executed. So this is part of the context? Why do it at exec level? It is part of the context, stored within HDC chicken bit register. The exec level was requested by the OCL team, due to concerns about performance cost of context setparam calls. If exec level is desired, why not whitelist it? -Chris If we have no issue in whitelisting the register, I'm sure OCL will agree to that. I assumed the whitelisting will be unacceptable because of security concerns with some options. The register also changes its position and content between gens, which makes whitelisting hard to manage. Main purpose of chicken bit registers, in general, is to allow work around for hardware features which could be buggy or could have unintended influence on the platform. The data port coherency functionality landed there for the same reasons; then it twisted itself in a way that we now need user space to switch it. Is it really ok to whitelist chicken bit registers? -Tomasz ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v2] drm/i915: Move the scheduler feature bits into the purview of the engines
So the functional purpose of this patch is to provide capabilities (including preemption status) within error information. I agree this is required. On 2018-02-01 20:02, Chris Wilson wrote: Rather than having the high level ioctl interface guess the underlying implementation details, having the implementation declare what capabilities it exports. We define an intel_driver_caps, similar to the intel_device_info, which instead of trying to describe the HW gives details on what the driver itself supports. This is then populated by the engine backend for the new scheduler capability field for use elsewhere. v2: Use caps.scheduler for validating CONTEXT_PARAM_SET_PRIORITY (Mika) One less assumption of engine[RCS] \o/ Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> Cc: Tomasz Lis <tomasz@intel.com> Cc: Joonas Lahtinen <joonas.lahti...@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com> Cc: Michal Wajdeczko <michal.wajdec...@intel.com> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com> Reviewed-by: Tomasz Lis <tomasz@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 1 + drivers/gpu/drm/i915/i915_drv.c | 8 +--- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gem.c | 3 +++ drivers/gpu/drm/i915/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c| 7 +-- drivers/gpu/drm/i915/intel_device_info.c | 6 ++ drivers/gpu/drm/i915/intel_device_info.h | 7 +++ drivers/gpu/drm/i915/intel_lrc.c | 6 ++ 9 files changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 9e44adef30f0..2bdce9fea671 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -49,6 +49,7 @@ static int i915_capabilities(struct seq_file *m, void *data) intel_device_info_dump_flags(info, ); intel_device_info_dump_runtime(info, ); + intel_driver_caps_print(_priv->caps, ); kernel_param_lock(THIS_MODULE); i915_params_dump(_modparams, ); diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 1ec12add34b2..733f71637914 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -372,13 +372,7 @@ static int i915_getparam(struct drm_device *dev, void *data, value = i915_gem_mmap_gtt_version(); break; case I915_PARAM_HAS_SCHEDULER: - value = 0; - if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule) { - value |= I915_SCHEDULER_CAP_ENABLED; - value |= I915_SCHEDULER_CAP_PRIORITY; - if (HAS_LOGICAL_RING_PREEMPTION(dev_priv)) - value |= I915_SCHEDULER_CAP_PREEMPTION; - } + value = dev_priv->caps.scheduler; break; case I915_PARAM_MMAP_VERSION: diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c676269ed843..24333042e1e9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -468,6 +468,7 @@ struct i915_gpu_state { u32 reset_count; u32 suspend_count; struct intel_device_info device_info; + struct intel_driver_caps driver_caps; struct i915_params params; struct i915_error_uc { @@ -1809,6 +1810,7 @@ struct drm_i915_private { struct kmem_cache *priorities; const struct intel_device_info info; + struct intel_driver_caps caps; /** * Data Stolen Memory - aka "i915 stolen memory" gives us the start and diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c049496e8757..8d732f97e23e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3222,8 +3222,11 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) * start to complete all requests. */ engine->submit_request = nop_complete_submit_request; + engine->schedule = NULL; } + i915->caps.scheduler = 0; + /* * Make sure no request can slip through without getting completed by * either this call here to intel_engine_init_global_seqno, or the one diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 648e7536ff51..8337d15bb0e5 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -807,7 +807,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, if (args->size) ret = -EINVAL; - else if (!to_i915(dev)->engine[RCS]->schedule) + else if (!(to_i915(dev)-&
Re: [Intel-gfx] [PATCH] drm/i915: Only allocate preempt context when required
On 2018-01-27 21:28, Chris Wilson wrote: If we remove some hardcoded assumptions about the preempt context having a fixed id, reserved from use by normal user contexts, we may only allocate the i915_gem_context when required. Then the subsequent decisions on using preemption reduce to having the preempt context available. Signed-off-by: Chris WilsonCc: Michal Winiarski Cc: Tvrtko Ursulin Cc: Arkadiusz Hiler Cc: Mika Kuoppala Cc: Daniele Ceraolo Spurio --- drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/i915/i915_gem_context.c | 24 +--- drivers/gpu/drm/i915/intel_engine_cs.c | 6 +++--- drivers/gpu/drm/i915/intel_guc_submission.c | 24 +--- drivers/gpu/drm/i915/intel_lrc.c | 15 ++- drivers/gpu/drm/i915/intel_ringbuffer.h | 5 + drivers/gpu/drm/i915/selftests/mock_gem_device.c | 6 -- 7 files changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 1ec12add34b2..a7fc87e87fcf 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -376,7 +376,7 @@ static int i915_getparam(struct drm_device *dev, void *data, if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule) { value |= I915_SCHEDULER_CAP_ENABLED; value |= I915_SCHEDULER_CAP_PRIORITY; - if (HAS_LOGICAL_RING_PREEMPTION(dev_priv)) + if (dev_priv->preempt_context) value |= I915_SCHEDULER_CAP_PREEMPTION; } break; We cannot put equality to not having preempt_context and having preemption disabled. The preempt_context will not be required on platforms which support preempting directly to idle. There is no need to send anything to the ring for these platforms, and the information about preemption finish is send back to us via interrupt (or GuC message in case it's enabled). Checking whether preempt_context is null before it is to be accessed is a good idea, but for checking for feature availability, HAS_LOGICAL_RING_PREEMPTION() will be needed. The inject_preempt_context() functions will definitely not be called when preempting to idle is available, so any changes there do not collide with the new functionality. -Tomasz diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 648e7536ff51..ebb1d01b4f4e 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -452,7 +452,6 @@ destroy_kernel_context(struct i915_gem_context **ctxp) int i915_gem_contexts_init(struct drm_i915_private *dev_priv) { struct i915_gem_context *ctx; - int err; GEM_BUG_ON(dev_priv->kernel_context); @@ -468,8 +467,7 @@ int i915_gem_contexts_init(struct drm_i915_private *dev_priv) ctx = i915_gem_context_create_kernel(dev_priv, I915_PRIORITY_MIN); if (IS_ERR(ctx)) { DRM_ERROR("Failed to create default global context\n"); - err = PTR_ERR(ctx); - goto err; + return PTR_ERR(ctx); } /* * For easy recognisablity, we want the kernel context to be 0 and then @@ -479,23 +477,18 @@ int i915_gem_contexts_init(struct drm_i915_private *dev_priv) dev_priv->kernel_context = ctx; /* highest priority; preempting task */ - ctx = i915_gem_context_create_kernel(dev_priv, INT_MAX); - if (IS_ERR(ctx)) { - DRM_ERROR("Failed to create default preempt context\n"); - err = PTR_ERR(ctx); - goto err_kernel_context; + if (HAS_LOGICAL_RING_PREEMPTION(dev_priv)) { + ctx = i915_gem_context_create_kernel(dev_priv, INT_MAX); + if (!IS_ERR(ctx)) + dev_priv->preempt_context = ctx; + else + DRM_ERROR("Failed to create preempt context; disabling preemption\n"); } - dev_priv->preempt_context = ctx; DRM_DEBUG_DRIVER("%s context support initialized\n", dev_priv->engine[RCS]->context_size ? "logical" : "fake"); return 0; - -err_kernel_context: - destroy_kernel_context(_priv->kernel_context); -err: - return err; } void i915_gem_contexts_lost(struct drm_i915_private *dev_priv) @@ -521,7 +514,8 @@ void i915_gem_contexts_fini(struct drm_i915_private *i915) { lockdep_assert_held(>drm.struct_mutex); - destroy_kernel_context(>preempt_context); + if (i915->preempt_context) + destroy_kernel_context(>preempt_context);