Re: [PATCH v2 3/6] drm/i915: Implement LMEM backup and restore for suspend / resume

2021-09-10 Thread Thomas Hellström



On 9/6/21 6:55 PM, Thomas Hellström wrote:

Just evict unpinned objects to system. For pinned LMEM objects,
make a backup system object and blit the contents to that.

Backup is performed in three steps,
1: Opportunistically evict evictable objects using the gpu blitter.
2: After gt idle, evict evictable objects using the gpu blitter. This will
be modified in an upcoming patch to backup pinned objects that are not used
by the blitter itself.
3: Backup remaining pinned objects using memcpy.

Also move uC suspend to after 2) to make sure we have a functional GuC
during 2) if using GuC submission.

v2:
- Major refactor to make sure gem_exec_suspend@hang-SX subtests work, and
   suspend / resume works with a slightly modified GuC submission enabling
   patch series.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/Makefile |   1 +
  .../gpu/drm/i915/gem/i915_gem_object_types.h  |   1 +
  drivers/gpu/drm/i915/gem/i915_gem_pm.c|  92 +++-
  drivers/gpu/drm/i915/gem/i915_gem_pm.h|   3 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm.c   |  29 ++-
  drivers/gpu/drm/i915/gem/i915_gem_ttm.h   |  10 +
  drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c| 205 ++
  drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h|  24 ++
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
  drivers/gpu/drm/i915/i915_drv.c   |  10 +-
  drivers/gpu/drm/i915/i915_drv.h   |   2 +-
  11 files changed, 364 insertions(+), 17 deletions(-)
  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c
  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index c36c8a4f0716..3379a0a6c91e 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -155,6 +155,7 @@ gem-y += \
gem/i915_gem_throttle.o \
gem/i915_gem_tiling.o \
gem/i915_gem_ttm.o \
+   gem/i915_gem_ttm_pm.o \
gem/i915_gem_userptr.o \
gem/i915_gem_wait.o \
gem/i915_gemfs.o
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 2471f36aaff3..734cc8e16481 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -534,6 +534,7 @@ struct drm_i915_gem_object {
struct {
struct sg_table *cached_io_st;
struct i915_gem_object_page_iter get_io_page;
+   struct drm_i915_gem_object *backup;
bool created:1;
} ttm;
  
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_pm.c

index 8b9d7d14c4bd..9746c255ddcc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
@@ -5,6 +5,7 @@
   */
  
  #include "gem/i915_gem_pm.h"

+#include "gem/i915_gem_ttm_pm.h"
  #include "gt/intel_gt.h"
  #include "gt/intel_gt_pm.h"
  #include "gt/intel_gt_requests.h"
@@ -39,7 +40,79 @@ void i915_gem_suspend(struct drm_i915_private *i915)
i915_gem_drain_freed_objects(i915);
  }
  
-void i915_gem_suspend_late(struct drm_i915_private *i915)

+/*
+ * Restore the backed-up contents of every local-memory (LMEM) region,
+ * optionally using the gpu blitter. Stops at the first failing region.
+ */
+static int lmem_restore(struct drm_i915_private *i915, bool allow_gpu)
+{
+   struct intel_memory_region *mr;
+   int ret = 0, id;
+
+   /* Only LMEM regions carry backups; system memory is left untouched. */
+   for_each_memory_region(mr, i915, id) {
+   if (mr->type == INTEL_MEMORY_LOCAL) {
+   ret = i915_ttm_restore_region(mr, allow_gpu);
+   if (ret)
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Back up the contents of every local-memory (LMEM) region to system
+ * memory ahead of suspend.
+ * @allow_gpu: permit the gpu blitter to evict still-evictable objects.
+ * @backup_pinned: also create backup copies of pinned objects.
+ * Stops at the first failing region and returns its error.
+ */
+static int lmem_suspend(struct drm_i915_private *i915, bool allow_gpu,
+   bool backup_pinned)
+{
+   struct intel_memory_region *mr;
+   int ret = 0, id;
+
+   for_each_memory_region(mr, i915, id) {
+   if (mr->type == INTEL_MEMORY_LOCAL) {
+   ret = i915_ttm_backup_region(mr, allow_gpu, 
backup_pinned);
+   if (ret)
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Undo a partially performed backup: free any backup objects attached
+ * to LMEM objects (see i915_ttm_recover_region()).
+ */
+static void lmem_recover(struct drm_i915_private *i915)
+{
+   struct intel_memory_region *mr;
+   int id;
+
+   for_each_memory_region(mr, i915, id)
+   if (mr->type == INTEL_MEMORY_LOCAL)
+   i915_ttm_recover_region(mr);
+}
+
+int i915_gem_backup_suspend(struct drm_i915_private *i915)
+{
+   int ret;
+
+   /* Opportunistically try to evict unpinned objects */
+   ret = lmem_suspend(i915, true, false);
+   if (ret)
+   goto out_recover;
+
+   i915_gem_suspend(i915);
+
+   /*
+* More objects may have become unpinned as requests were
+* retired. Now try to evict again. The gt may be wedged here
+* in which case we automatically fall back to memcpy.
+*/
+
+   ret = lmem_suspend(i915, true, false);
+   if (ret)
+  

Re: [PATCH v2 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume

2021-09-08 Thread Thomas Hellström

Hi, Matt,

Thanks for reviewing.

On 9/7/21 7:37 PM, Matthew Auld wrote:



+    i915_gem_ww_unlock_single(backup);
+    i915_gem_object_put(backup);


I assume we need to set ttm.backup = NULL somewhere here on the 
failure path, or don't drop the ref? Or at least it looks like 
potential uaf later?


Yes, I think on failure, we just don't drop the ref here in case 
something at some point decides to retry.


I'll fix up this and other comments.

/Thomas





+
+    return err;
+}
+


Re: [PATCH v2 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume

2021-09-07 Thread Matthew Auld

On 06/09/2021 17:55, Thomas Hellström wrote:

Just evict unpinned objects to system. For pinned LMEM objects,
make a backup system object and blit the contents to that.

Backup is performed in three steps,
1: Opportunistically evict evictable objects using the gpu blitter.
2: After gt idle, evict evictable objects using the gpu blitter. This will
be modified in an upcoming patch to backup pinned objects that are not used
by the blitter itself.
3: Backup remaining pinned objects using memcpy.

Also move uC suspend to after 2) to make sure we have a functional GuC
during 2) if using GuC submission.

v2:
- Major refactor to make sure gem_exec_suspend@hang-SX subtests work, and
   suspend / resume works with a slightly modified GuC submission enabling
   patch series.

Signed-off-by: Thomas Hellström 





+
+/*
+ * Per-object backup callback, invoked for each object of a region by
+ * i915_gem_process_region(). Evictable objects are simply moved to
+ * system memory; pinned objects (when backup_pinned is set) get their
+ * contents copied into a newly created system-memory object that is
+ * stashed in obj->ttm.backup for the later restore.
+ *
+ * NOTE(review): the '&' characters in several expressions below appear
+ * to have been stripped by the mail archive (e.g. "");
+ * compare against the original patch before applying.
+ */
+static int i915_ttm_backup(struct i915_gem_apply_to_region *apply,
+  struct drm_i915_gem_object *obj)
+{
+   struct i915_gem_ttm_pm_apply *pm_apply =
+   container_of(apply, typeof(*pm_apply), base);
+   struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
+   struct ttm_buffer_object *backup_bo;
+   struct drm_i915_private *i915 =
+   container_of(bo->bdev, typeof(*i915), bdev);
+   struct intel_memory_region *sys_region;
+   struct drm_i915_gem_object *backup;
+   struct ttm_operation_ctx ctx = {};
+   int err = 0;
+
+   /* Already in system memory, or already backed up: nothing to do. */
+   if (bo->resource->mem_type == I915_PL_SYSTEM || obj->ttm.backup)
+   return 0;
+
+   /* Evictable object: just validate it into a system placement. */
+   if (pm_apply->allow_gpu && i915_gem_object_evictable(obj))
+   return ttm_bo_validate(bo, i915_ttm_sys_placement(), );
+
+   /* Pinned objects are only handled when explicitly requested. */
+   if (!pm_apply->backup_pinned)
+   return 0;
+
+   /* Pinned object: create a shadow copy in system memory. */
+   sys_region = i915->mm.regions[INTEL_REGION_SMEM];
+   backup = i915_gem_object_create_region(sys_region,
+  obj->base.size,
+  0, 0);


create_shmem()?


+   if (IS_ERR(backup))
+   return PTR_ERR(backup);
+
+   err = i915_gem_object_lock(backup, apply->ww);
+   if (err)
+   goto out_no_lock;
+
+   backup_bo = i915_gem_to_ttm(backup);
+   err = ttm_tt_populate(backup_bo->bdev, backup_bo->ttm, );
+   if (err)
+   goto out_no_populate;
+
+   /*
+    * NOTE(review): a copy failure is only warned about and then
+    * swallowed -- obj->ttm.backup is installed and 0 returned
+    * regardless. Confirm this is the intended best-effort behavior.
+    */
+   err = i915_gem_obj_copy_ttm(backup, obj, pm_apply->allow_gpu, false);
+   GEM_WARN_ON(err);
+
+   obj->ttm.backup = backup;
+   return 0;
+
+out_no_populate:
+   i915_gem_ww_unlock_single(backup);
+out_no_lock:
+   i915_gem_object_put(backup);
+
+   return err;
+}
+
+/* Per-object recover callback: free the backup object, if any. Never fails. */
+static int i915_ttm_recover(struct i915_gem_apply_to_region *apply,
+   struct drm_i915_gem_object *obj)
+{
+   i915_ttm_backup_free(obj);
+   return 0;
+}
+
+/**
+ * i915_ttm_recover_region - Free the backup of all objects of a region
+ * @mr: The memory region
+ *
+ * Checks all objects of a region if there is backup attached and if so
+ * frees that backup. Typically this is called to recover after a partially
+ * performed backup.
+ */
+void i915_ttm_recover_region(struct intel_memory_region *mr)
+{
+   static const struct i915_gem_apply_to_region_ops recover_ops = {
+   .process_obj = i915_ttm_recover,
+   };
+   struct i915_gem_apply_to_region apply = {.ops = _ops};
+   int ret;
+
+   /*
+    * The per-object callback (i915_ttm_recover()) always returns 0,
+    * so any error from the region walk itself is unexpected.
+    */
+   ret = i915_gem_process_region(mr, );
+   GEM_WARN_ON(ret);
+}
+
+/**
+ * i915_ttm_backup_region - Back up all objects of a region to smem.
+ * @mr: The memory region
+ * @allow_gpu: Whether to allow the gpu blitter for this backup.
+ * @backup_pinned: Backup also pinned objects.
+ *
+ * Loops over all objects of a region and either evicts them if they are
+ * evictable or backs them up using a backup object if they are pinned.
+ *
+ * Return: Zero on success. Negative error code on error.
+ */
+int i915_ttm_backup_region(struct intel_memory_region *mr, bool allow_gpu,
+  bool backup_pinned)
+{
+   static const struct i915_gem_apply_to_region_ops backup_ops = {
+   .process_obj = i915_ttm_backup,
+   };
+   /* Wrap the generic apply in the pm-specific flags i915_ttm_backup() reads. */
+   struct i915_gem_ttm_pm_apply pm_apply = {
+   .base = {.ops = _ops},
+   .allow_gpu = allow_gpu,
+   .backup_pinned = backup_pinned,
+   };
+
+   return i915_gem_process_region(mr, _apply.base);
+}
+
+static int i915_ttm_restore(struct i915_gem_apply_to_region *apply,
+   struct drm_i915_gem_object *obj)
+{
+   struct i915_gem_ttm_pm_apply *pm_apply =
+   container_of(apply, typeof(*pm_apply), base);
+   struct drm_i915_gem_object *backup = obj->ttm.backup;
+   struct ttm_buffer_object *backup_bo = i915_gem_to_ttm(backup);
+   struct ttm_operation_ctx ctx = {};
+   int err;
+
+   if (!obj->ttm.backup)


if (!backup)


+   return 0;
+
+   if (!pm_apply->allow_gpu && (obj->flags & I915_BO_ALLOC_USER))
+