Re: [PATCH v2 3/7] drm/i915/uc: use different ggtt pin offsets for uc loads

2022-10-18 Thread Ceraolo Spurio, Daniele




On 10/17/2022 4:44 PM, John Harrison wrote:

On 10/12/2022 17:03, Daniele Ceraolo Spurio wrote:

Our current FW loading process is the same for all FWs:

- Pin FW to GGTT at the start of the ggtt->uc_fw node
- Load the FW
- Unpin

This worked because we didn't have a case where 2 FWs would be loaded on
the same GGTT at the same time. On MTL, however, this can happend if 
both

GTs are reset at the same time, so we can't pin everything in the same
spot and we need to use separate offset. For simplicity, instead of
calculating the exact required size, we reserve a 2MB slot for each fw.

v2: fail fetch if FW is > 2MBs, improve comments (John)

Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
Cc: Alan Previn 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 30 +---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h | 13 ++
  2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c

index de2843dc1307..021290a26195 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -575,6 +575,17 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
  err = firmware_request_nowarn(, uc_fw->file_selected.path, 
dev);

  memcpy(_ideal, _fw->file_wanted, sizeof(file_ideal));
  +    if (!err && fw->size > INTEL_UC_RSVD_GGTT_PER_FW) {
+    drm_err(>drm,
+    "%s firmware %s: size (%zuKB) exceeds max supported size 
(%uKB)\n",
+    intel_uc_fw_type_repr(uc_fw->type), 
uc_fw->file_selected.path,

+    fw->size / SZ_1K, INTEL_UC_RSVD_GGTT_PER_FW / SZ_1K);
+
+    /* try to find another blob to load */
+    release_firmware(fw);
+    err = -ENOENT;
+    }
+
  /* Any error is terminal if overriding. Don't bother searching 
for older versions */

  if (err && intel_uc_fw_is_overridden(uc_fw))
  goto fail;
@@ -677,14 +688,28 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
    static u32 uc_fw_ggtt_offset(struct intel_uc_fw *uc_fw)
  {
-    struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt;
+    struct intel_gt *gt = __uc_fw_to_gt(uc_fw);
+    struct i915_ggtt *ggtt = gt->ggtt;
  struct drm_mm_node *node = >uc_fw;
+    u32 offset = uc_fw->type * INTEL_UC_RSVD_GGTT_PER_FW;
+
+    /*
+ * To keep the math simple, we use 8MB for the root tile and 8MB 
for

+ * the media one. This will need to be updated if we ever have more
+ * than 1 media GT
+ */
+    BUILD_BUG_ON(INTEL_UC_FW_NUM_TYPES * INTEL_UC_RSVD_GGTT_PER_FW > 
SZ_8M);

+    GEM_BUG_ON(gt->type == GT_MEDIA && gt->info.id > 1);
+    if (gt->type == GT_MEDIA)
+    offset += SZ_8M;
This is all because render/media GTs share the same page tables? 
Regular multi-tile is completely separate address spaces and can use a 
single common address? Otherwise, it seems like 'offset = gt->info.id 
* 2M' would be the generic solution and no reference to GT_MEDIA 
required. So maybe add a quick comment to that effect?


Yup this is only because of the GGTT sharing. There is already a comment 
somewhere else, but I'll add one here as well.






GEM_BUG_ON(!drm_mm_node_allocated(node));
  GEM_BUG_ON(upper_32_bits(node->start));
  GEM_BUG_ON(upper_32_bits(node->start + node->size - 1));
+    GEM_BUG_ON(offset + uc_fw->obj->base.size > node->size);
+    GEM_BUG_ON(uc_fw->obj->base.size > INTEL_UC_RSVD_GGTT_PER_FW);
  -    return lower_32_bits(node->start);
+    return lower_32_bits(node->start + offset);
  }
    static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw)
@@ -699,7 +724,6 @@ static void uc_fw_bind_ggtt(struct intel_uc_fw 
*uc_fw)

  dummy->bi.pages = obj->mm.pages;
    GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
-    GEM_BUG_ON(dummy->node_size > ggtt->uc_fw.size);
    /* uc_fw->obj cache domains were not controlled across 
suspend */

  if (i915_gem_object_has_struct_page(obj))
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h

index cb586f7df270..7b3db41efa6e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
@@ -6,6 +6,7 @@
  #ifndef _INTEL_UC_FW_H_
  #define _INTEL_UC_FW_H_
  +#include 
  #include 
  #include "intel_uc_fw_abi.h"
  #include "intel_device_info.h"
@@ -114,6 +115,18 @@ struct intel_uc_fw {
(uc)->fw.file_selected.minor_ver, \
(uc)->fw.file_selected.patch_ver))
  +/*
+ * When we load the uC binaries, we pin them in a reserved section 
at the top of
+ * the GGTT, which is ~18 MBs. On multi-GT systems where the GTs 
share the GGTT,
^^^ meaning only systems with a render GT + media GT as opposed to 
regular multi-GT systems? Would be good to make that explicit either 
here or above (or both).


I'll add a comment above where we reference the media gt.

Daniele



John.

+ * we also need to make sure that each binary is pinned to a unique 
location
+ * during load, because the different GT 

Re: [PATCH v2 3/7] drm/i915/uc: use different ggtt pin offsets for uc loads

2022-10-17 Thread John Harrison

On 10/12/2022 17:03, Daniele Ceraolo Spurio wrote:

Our current FW loading process is the same for all FWs:

- Pin FW to GGTT at the start of the ggtt->uc_fw node
- Load the FW
- Unpin

This worked because we didn't have a case where 2 FWs would be loaded on
the same GGTT at the same time. On MTL, however, this can happend if both
GTs are reset at the same time, so we can't pin everything in the same
spot and we need to use separate offset. For simplicity, instead of
calculating the exact required size, we reserve a 2MB slot for each fw.

v2: fail fetch if FW is > 2MBs, improve comments (John)

Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
Cc: Alan Previn 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 30 +---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h | 13 ++
  2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index de2843dc1307..021290a26195 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -575,6 +575,17 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
err = firmware_request_nowarn(, uc_fw->file_selected.path, dev);
memcpy(_ideal, _fw->file_wanted, sizeof(file_ideal));
  
+	if (!err && fw->size > INTEL_UC_RSVD_GGTT_PER_FW) {

+   drm_err(>drm,
+   "%s firmware %s: size (%zuKB) exceeds max supported size 
(%uKB)\n",
+   intel_uc_fw_type_repr(uc_fw->type), 
uc_fw->file_selected.path,
+   fw->size / SZ_1K, INTEL_UC_RSVD_GGTT_PER_FW / SZ_1K);
+
+   /* try to find another blob to load */
+   release_firmware(fw);
+   err = -ENOENT;
+   }
+
/* Any error is terminal if overriding. Don't bother searching for 
older versions */
if (err && intel_uc_fw_is_overridden(uc_fw))
goto fail;
@@ -677,14 +688,28 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
  
  static u32 uc_fw_ggtt_offset(struct intel_uc_fw *uc_fw)

  {
-   struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt;
+   struct intel_gt *gt = __uc_fw_to_gt(uc_fw);
+   struct i915_ggtt *ggtt = gt->ggtt;
struct drm_mm_node *node = >uc_fw;
+   u32 offset = uc_fw->type * INTEL_UC_RSVD_GGTT_PER_FW;
+
+   /*
+* To keep the math simple, we use 8MB for the root tile and 8MB for
+* the media one. This will need to be updated if we ever have more
+* than 1 media GT
+*/
+   BUILD_BUG_ON(INTEL_UC_FW_NUM_TYPES * INTEL_UC_RSVD_GGTT_PER_FW > SZ_8M);
+   GEM_BUG_ON(gt->type == GT_MEDIA && gt->info.id > 1);
+   if (gt->type == GT_MEDIA)
+   offset += SZ_8M;
This is all because render/media GTs share the same page tables? Regular 
multi-tile is completely separate address spaces and can use a single 
common address? Otherwise, it seems like 'offset = gt->info.id * 2M' 
would be the generic solution and no reference to GT_MEDIA required. So 
maybe add a quick comment to that effect?



  
  	GEM_BUG_ON(!drm_mm_node_allocated(node));

GEM_BUG_ON(upper_32_bits(node->start));
GEM_BUG_ON(upper_32_bits(node->start + node->size - 1));
+   GEM_BUG_ON(offset + uc_fw->obj->base.size > node->size);
+   GEM_BUG_ON(uc_fw->obj->base.size > INTEL_UC_RSVD_GGTT_PER_FW);
  
-	return lower_32_bits(node->start);

+   return lower_32_bits(node->start + offset);
  }
  
  static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw)

@@ -699,7 +724,6 @@ static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw)
dummy->bi.pages = obj->mm.pages;
  
  	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));

-   GEM_BUG_ON(dummy->node_size > ggtt->uc_fw.size);
  
  	/* uc_fw->obj cache domains were not controlled across suspend */

if (i915_gem_object_has_struct_page(obj))
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
index cb586f7df270..7b3db41efa6e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
@@ -6,6 +6,7 @@
  #ifndef _INTEL_UC_FW_H_
  #define _INTEL_UC_FW_H_
  
+#include 

  #include 
  #include "intel_uc_fw_abi.h"
  #include "intel_device_info.h"
@@ -114,6 +115,18 @@ struct intel_uc_fw {
 
(uc)->fw.file_selected.minor_ver, \
 
(uc)->fw.file_selected.patch_ver))
  
+/*

+ * When we load the uC binaries, we pin them in a reserved section at the top 
of
+ * the GGTT, which is ~18 MBs. On multi-GT systems where the GTs share the 
GGTT,
^^^ meaning only systems with a render GT + media GT as opposed to 
regular multi-GT systems? Would be good to make that explicit either 
here or above (or both).


John.


+ * we also need to make sure that each binary is pinned to a unique location
+ * during load, because the different GT 

[PATCH v2 3/7] drm/i915/uc: use different ggtt pin offsets for uc loads

2022-10-12 Thread Daniele Ceraolo Spurio
Our current FW loading process is the same for all FWs:

- Pin FW to GGTT at the start of the ggtt->uc_fw node
- Load the FW
- Unpin

This worked because we didn't have a case where 2 FWs would be loaded on
the same GGTT at the same time. On MTL, however, this can happend if both
GTs are reset at the same time, so we can't pin everything in the same
spot and we need to use separate offset. For simplicity, instead of
calculating the exact required size, we reserve a 2MB slot for each fw.

v2: fail fetch if FW is > 2MBs, improve comments (John)

Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
Cc: Alan Previn 
---
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 30 +---
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h | 13 ++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index de2843dc1307..021290a26195 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -575,6 +575,17 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
err = firmware_request_nowarn(, uc_fw->file_selected.path, dev);
memcpy(_ideal, _fw->file_wanted, sizeof(file_ideal));
 
+   if (!err && fw->size > INTEL_UC_RSVD_GGTT_PER_FW) {
+   drm_err(>drm,
+   "%s firmware %s: size (%zuKB) exceeds max supported 
size (%uKB)\n",
+   intel_uc_fw_type_repr(uc_fw->type), 
uc_fw->file_selected.path,
+   fw->size / SZ_1K, INTEL_UC_RSVD_GGTT_PER_FW / SZ_1K);
+
+   /* try to find another blob to load */
+   release_firmware(fw);
+   err = -ENOENT;
+   }
+
/* Any error is terminal if overriding. Don't bother searching for 
older versions */
if (err && intel_uc_fw_is_overridden(uc_fw))
goto fail;
@@ -677,14 +688,28 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw)
 
 static u32 uc_fw_ggtt_offset(struct intel_uc_fw *uc_fw)
 {
-   struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt;
+   struct intel_gt *gt = __uc_fw_to_gt(uc_fw);
+   struct i915_ggtt *ggtt = gt->ggtt;
struct drm_mm_node *node = >uc_fw;
+   u32 offset = uc_fw->type * INTEL_UC_RSVD_GGTT_PER_FW;
+
+   /*
+* To keep the math simple, we use 8MB for the root tile and 8MB for
+* the media one. This will need to be updated if we ever have more
+* than 1 media GT
+*/
+   BUILD_BUG_ON(INTEL_UC_FW_NUM_TYPES * INTEL_UC_RSVD_GGTT_PER_FW > SZ_8M);
+   GEM_BUG_ON(gt->type == GT_MEDIA && gt->info.id > 1);
+   if (gt->type == GT_MEDIA)
+   offset += SZ_8M;
 
GEM_BUG_ON(!drm_mm_node_allocated(node));
GEM_BUG_ON(upper_32_bits(node->start));
GEM_BUG_ON(upper_32_bits(node->start + node->size - 1));
+   GEM_BUG_ON(offset + uc_fw->obj->base.size > node->size);
+   GEM_BUG_ON(uc_fw->obj->base.size > INTEL_UC_RSVD_GGTT_PER_FW);
 
-   return lower_32_bits(node->start);
+   return lower_32_bits(node->start + offset);
 }
 
 static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw)
@@ -699,7 +724,6 @@ static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw)
dummy->bi.pages = obj->mm.pages;
 
GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
-   GEM_BUG_ON(dummy->node_size > ggtt->uc_fw.size);
 
/* uc_fw->obj cache domains were not controlled across suspend */
if (i915_gem_object_has_struct_page(obj))
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
index cb586f7df270..7b3db41efa6e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h
@@ -6,6 +6,7 @@
 #ifndef _INTEL_UC_FW_H_
 #define _INTEL_UC_FW_H_
 
+#include 
 #include 
 #include "intel_uc_fw_abi.h"
 #include "intel_device_info.h"
@@ -114,6 +115,18 @@ struct intel_uc_fw {
 
(uc)->fw.file_selected.minor_ver, \
 
(uc)->fw.file_selected.patch_ver))
 
+/*
+ * When we load the uC binaries, we pin them in a reserved section at the top 
of
+ * the GGTT, which is ~18 MBs. On multi-GT systems where the GTs share the 
GGTT,
+ * we also need to make sure that each binary is pinned to a unique location
+ * during load, because the different GT can go through the FW load at the same
+ * time. Given that the available space is much greater than what is required 
by
+ * the binaries, to keep things simple instead of dynamically partitioning the
+ * reserved section to make space for all the blobs we can just reserve a 
static
+ * chunk for each binary.
+ */
+#define INTEL_UC_RSVD_GGTT_PER_FW SZ_2M
+
 #ifdef CONFIG_DRM_I915_DEBUG_GUC
 void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw,
   enum intel_uc_fw_status status);
-- 
2.37.3