Re: [PATCH v2 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume
On 9/6/21 6:55 PM, Thomas Hellström wrote: Just evict unpinned objects to system. For pinned LMEM objects, make a backup system object and blit the contents to that. Backup is performed in three steps, 1: Opportunistically evict evictable objects using the gpu blitter. 2: After gt idle, evict evictable objects using the gpu blitter. This will be modified in an upcoming patch to backup pinned objects that are not used by the blitter itself. 3: Backup remaining pinned objects using memcpy. Also move uC suspend to after 2) to make sure we have a functional GuC during 2) if using GuC submission. v2: - Major refactor to make sure gem_exec_suspend@hang-SX subtests work, and suspend / resume works with a slightly modified GuC submission enabling patch series. Signed-off-by: Thomas Hellström --- drivers/gpu/drm/i915/Makefile | 1 + .../gpu/drm/i915/gem/i915_gem_object_types.h | 1 + drivers/gpu/drm/i915/gem/i915_gem_pm.c| 92 +++- drivers/gpu/drm/i915/gem/i915_gem_pm.h| 3 +- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 29 ++- drivers/gpu/drm/i915/gem/i915_gem_ttm.h | 10 + drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c| 205 ++ drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h| 24 ++ drivers/gpu/drm/i915/gt/intel_gt_pm.c | 4 +- drivers/gpu/drm/i915/i915_drv.c | 10 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- 11 files changed, 364 insertions(+), 17 deletions(-) create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index c36c8a4f0716..3379a0a6c91e 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -155,6 +155,7 @@ gem-y += \ gem/i915_gem_throttle.o \ gem/i915_gem_tiling.o \ gem/i915_gem_ttm.o \ + gem/i915_gem_ttm_pm.o \ gem/i915_gem_userptr.o \ gem/i915_gem_wait.o \ gem/i915_gemfs.o diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 2471f36aaff3..734cc8e16481 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -534,6 +534,7 @@ struct drm_i915_gem_object { struct { struct sg_table *cached_io_st; struct i915_gem_object_page_iter get_io_page; + struct drm_i915_gem_object *backup; bool created:1; } ttm; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_pm.c index 8b9d7d14c4bd..9746c255ddcc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c @@ -5,6 +5,7 @@ */ #include "gem/i915_gem_pm.h" +#include "gem/i915_gem_ttm_pm.h" #include "gt/intel_gt.h" #include "gt/intel_gt_pm.h" #include "gt/intel_gt_requests.h" @@ -39,7 +40,79 @@ void i915_gem_suspend(struct drm_i915_private *i915) i915_gem_drain_freed_objects(i915); } -void i915_gem_suspend_late(struct drm_i915_private *i915) +static int lmem_restore(struct drm_i915_private *i915, bool allow_gpu) +{ + struct intel_memory_region *mr; + int ret = 0, id; + + for_each_memory_region(mr, i915, id) { + if (mr->type == INTEL_MEMORY_LOCAL) { + ret = i915_ttm_restore_region(mr, allow_gpu); + if (ret) + break; + } + } + + return ret; +} + +static int lmem_suspend(struct drm_i915_private *i915, bool allow_gpu, + bool backup_pinned) +{ + struct intel_memory_region *mr; + int ret = 0, id; + + for_each_memory_region(mr, i915, id) { + if (mr->type == INTEL_MEMORY_LOCAL) { + ret = i915_ttm_backup_region(mr, allow_gpu, backup_pinned); + if (ret) + break; + } + } + + return ret; +} + +static void lmem_recover(struct drm_i915_private *i915) +{ + struct intel_memory_region *mr; + int id; + + for_each_memory_region(mr, i915, id) + if (mr->type == INTEL_MEMORY_LOCAL) + i915_ttm_recover_region(mr); +} + +int i915_gem_backup_suspend(struct drm_i915_private *i915) +{ + int ret; + + /* Opportunistically try to evict unpinned objects */ + ret = lmem_suspend(i915, true, false); + if (ret) + goto out_recover; + + i915_gem_suspend(i915); + + /* +* More objects may have become unpinned as requests were +* retired. Now try to evict again. The gt may be wedged here +* in which case we automatically fall back to memcpy. +*/ + + ret = lmem_suspend(i915, true, false); + if (ret) +
Re: [PATCH v2 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume
Hi, Matt, Thanks for reviewing. On 9/7/21 7:37 PM, Matthew Auld wrote: + i915_gem_ww_unlock_single(backup); + i915_gem_object_put(backup); I assume we need to set ttm.backup = NULL somewhere here on the failure path, or don't drop the ref? Or at least it looks like potential uaf later? Yes, I think on failure, we just don't drop the ref here in case something at some point decides to retry. I'll fix up this and other comments. /Thomas + + return err; +} +
Re: [PATCH v2 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume
On 06/09/2021 17:55, Thomas Hellström wrote: Just evict unpinned objects to system. For pinned LMEM objects, make a backup system object and blit the contents to that. Backup is performed in three steps, 1: Opportunistically evict evictable objects using the gpu blitter. 2: After gt idle, evict evictable objects using the gpu blitter. This will be modified in an upcoming patch to backup pinned objects that are not used by the blitter itself. 3: Backup remaining pinned objects using memcpy. Also move uC suspend to after 2) to make sure we have a functional GuC during 2) if using GuC submission. v2: - Major refactor to make sure gem_exec_suspend@hang-SX subtests work, and suspend / resume works with a slightly modified GuC submission enabling patch series. Signed-off-by: Thomas Hellström + +static int i915_ttm_backup(struct i915_gem_apply_to_region *apply, + struct drm_i915_gem_object *obj) +{ + struct i915_gem_ttm_pm_apply *pm_apply = + container_of(apply, typeof(*pm_apply), base); + struct ttm_buffer_object *bo = i915_gem_to_ttm(obj); + struct ttm_buffer_object *backup_bo; + struct drm_i915_private *i915 = + container_of(bo->bdev, typeof(*i915), bdev); + struct intel_memory_region *sys_region; + struct drm_i915_gem_object *backup; + struct ttm_operation_ctx ctx = {}; + int err = 0; + + if (bo->resource->mem_type == I915_PL_SYSTEM || obj->ttm.backup) + return 0; + + if (pm_apply->allow_gpu && i915_gem_object_evictable(obj)) + return ttm_bo_validate(bo, i915_ttm_sys_placement(), ); + + if (!pm_apply->backup_pinned) + return 0; + + sys_region = i915->mm.regions[INTEL_REGION_SMEM]; + backup = i915_gem_object_create_region(sys_region, + obj->base.size, + 0, 0); create_shmem()? + if (IS_ERR(backup)) + return PTR_ERR(backup); + + err = i915_gem_object_lock(backup, apply->ww); + if (err) + goto out_no_lock; + + backup_bo = i915_gem_to_ttm(backup); + err = ttm_tt_populate(backup_bo->bdev, backup_bo->ttm, ); + if (err) + goto out_no_populate; + + err = i915_gem_obj_copy_ttm(backup, obj, pm_apply->allow_gpu, false); + GEM_WARN_ON(err); + + obj->ttm.backup = backup; + return 0; + +out_no_populate: + i915_gem_ww_unlock_single(backup); +out_no_lock: + i915_gem_object_put(backup); + + return err; +} + +static int i915_ttm_recover(struct i915_gem_apply_to_region *apply, + struct drm_i915_gem_object *obj) +{ + i915_ttm_backup_free(obj); + return 0; +} + +/** + * i915_ttm_recover_region - Free the backup of all objects of a region + * @mr: The memory region + * + * Checks all objects of a region if there is backup attached and if so + * frees that backup. Typically this is called to recover after a partially + * performed backup. + */ +void i915_ttm_recover_region(struct intel_memory_region *mr) +{ + static const struct i915_gem_apply_to_region_ops recover_ops = { + .process_obj = i915_ttm_recover, + }; + struct i915_gem_apply_to_region apply = {.ops = _ops}; + int ret; + + ret = i915_gem_process_region(mr, ); + GEM_WARN_ON(ret); +} + +/** + * i915_ttm_backup_region - Back up all objects of a region to smem. + * @mr: The memory region + * @allow_gpu: Whether to allow the gpu blitter for this backup. + * @backup_pinned: Backup also pinned objects. + * + * Loops over all objects of a region and either evicts them if they are + * evictable or backs them up using a backup object if they are pinned. + * + * Return: Zero on success. Negative error code on error. + */ +int i915_ttm_backup_region(struct intel_memory_region *mr, bool allow_gpu, + bool backup_pinned) +{ + static const struct i915_gem_apply_to_region_ops backup_ops = { + .process_obj = i915_ttm_backup, + }; + struct i915_gem_ttm_pm_apply pm_apply = { + .base = {.ops = _ops}, + .allow_gpu = allow_gpu, + .backup_pinned = backup_pinned, + }; + + return i915_gem_process_region(mr, _apply.base); +} + +static int i915_ttm_restore(struct i915_gem_apply_to_region *apply, + struct drm_i915_gem_object *obj) +{ + struct i915_gem_ttm_pm_apply *pm_apply = + container_of(apply, typeof(*pm_apply), base); + struct drm_i915_gem_object *backup = obj->ttm.backup; + struct ttm_buffer_object *backup_bo = i915_gem_to_ttm(backup); + struct ttm_operation_ctx ctx = {}; + int err; + + if (!obj->ttm.backup) if (!backup) + return 0; + + if (!pm_apply->allow_gpu && (obj->flags & I915_BO_ALLOC_USER)) +