On Thu, Jul 06, 2017 at 07:51:27PM +0900, Michel Dänzer wrote:
> From: John Brooks <j...@fastquake.com>
> 
> The BO move throttling code is designed to allow VRAM to fill quickly if it
> is relatively empty. However, this does not take into account situations
> where the visible VRAM is smaller than total VRAM, and total VRAM may not
> be close to full but the visible VRAM segment is under pressure. In such
> situations, visible VRAM would experience unrestricted swapping and
> performance would drop.
> 
> Add a separate counter specifically for moves involving visible VRAM, and
> check it before moving BOs there.
> 
> v2: Only perform calculations for separate counter if visible VRAM is
>     smaller than total VRAM. (Michel Dänzer)
> v3: [Michel Dänzer]
> * Use BO's location rather than the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED
>   flag to determine whether to account a move for visible VRAM in most
>   cases.
> * Use a single
> 
>       if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> 
>   block in amdgpu_cs_get_threshold_for_moves.
> 
> Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2))
> Signed-off-by: John Brooks <j...@fastquake.com>
> Signed-off-by: Michel Dänzer <michel.daen...@amd.com>

Changes look good. You can have my

Reviewed-by: John Brooks <j...@fastquake.com>

if you want it.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 92 ++++++++++++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 12 +++-
>  3 files changed, 87 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b95c1074d42c..463d6c241157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1162,7 +1162,9 @@ struct amdgpu_cs_parser {
>       struct list_head                validated;
>       struct dma_fence                *fence;
>       uint64_t                        bytes_moved_threshold;
> +     uint64_t                        bytes_moved_vis_threshold;
>       uint64_t                        bytes_moved;
> +     uint64_t                        bytes_moved_vis;
>       struct amdgpu_bo_list_entry     *evictable;
>  
>       /* user fence */
> @@ -1596,6 +1598,7 @@ struct amdgpu_device {
>               spinlock_t              lock;
>               s64                     last_update_us;
>               s64                     accum_us; /* accumulated microseconds */
> +             s64                     accum_us_vis; /* for visible VRAM */
>               u32                     log2_max_MBps;
>       } mm_stats;
>  
> @@ -1892,7 +1895,8 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>  bool amdgpu_need_post(struct amdgpu_device *adev);
>  void amdgpu_update_display_priority(struct amdgpu_device *adev);
>  
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +                               u64 num_vis_bytes);
>  void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
>  bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
>  int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 82131d70a06b..44ec11d4d733 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -220,10 +220,11 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
>   * ticks. The accumulated microseconds (us) are converted to bytes and
>   * returned.
>   */
> -static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> +static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
> +                                           u64 *max_bytes,
> +                                           u64 *max_vis_bytes)
>  {
>       s64 time_us, increment_us;
> -     u64 max_bytes;
>       u64 free_vram, total_vram, used_vram;
>  
>       /* Allow a maximum of 200 accumulated ms. This is basically per-IB
> @@ -235,8 +236,11 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>        */
>       const s64 us_upper_bound = 200000;
>  
> -     if (!adev->mm_stats.log2_max_MBps)
> -             return 0;
> +     if (!adev->mm_stats.log2_max_MBps) {
> +             *max_bytes = 0;
> +             *max_vis_bytes = 0;
> +             return;
> +     }
>  
>       total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
>       used_vram = atomic64_read(&adev->vram_usage);
> @@ -277,23 +281,45 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>               adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
>       }
>  
> -     /* This returns 0 if the driver is in debt to disallow (optional)
> +     /* This is set to 0 if the driver is in debt to disallow (optional)
>        * buffer moves.
>        */
> -     max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +     *max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +
> +     /* Do the same for visible VRAM if half of it is free */
> +     if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> +             u64 total_vis_vram = adev->mc.visible_vram_size;
> +             u64 used_vis_vram = atomic64_read(&adev->vram_vis_usage);
> +
> +             if (used_vis_vram < total_vis_vram) {
> +                     u64 free_vis_vram = total_vis_vram - used_vis_vram;
> +                     adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
> +                                                       increment_us, us_upper_bound);
> +
> +                     if (free_vis_vram >= total_vis_vram / 2)
> +                             adev->mm_stats.accum_us_vis =
> +                                     max(bytes_to_us(adev, free_vis_vram / 2),
> +                                         adev->mm_stats.accum_us_vis);
> +             }
> +
> +             *max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
> +     } else {
> +             *max_vis_bytes = 0;
> +     }
>  
>       spin_unlock(&adev->mm_stats.lock);
> -     return max_bytes;
>  }
>  
>  /* Report how many bytes have really been moved for the last command
>   * submission. This can result in a debt that can stop buffer migrations
>   * temporarily.
>   */
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +                               u64 num_vis_bytes)
>  {
>       spin_lock(&adev->mm_stats.lock);
>       adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
> +     adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
>       spin_unlock(&adev->mm_stats.lock);
>  }
>  
> @@ -301,7 +327,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>                                struct amdgpu_bo *bo)
>  {
>       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -     u64 initial_bytes_moved;
> +     u64 initial_bytes_moved, bytes_moved;
>       uint32_t domain;
>       int r;
>  
> @@ -311,17 +337,35 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>       /* Don't move this buffer if we have depleted our allowance
>        * to move it. Don't move anything if the threshold is zero.
>        */
> -     if (p->bytes_moved < p->bytes_moved_threshold)
> -             domain = bo->prefered_domains;
> -     else
> +     if (p->bytes_moved < p->bytes_moved_threshold) {
> +             if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +                 (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
> +                     /* And don't move a CPU_ACCESS_REQUIRED BO to limited
> +                      * visible VRAM if we've depleted our allowance to do
> +                      * that.
> +                      */
> +                     if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
> +                             domain = bo->prefered_domains;
> +                     else
> +                             domain = bo->allowed_domains;
> +             } else {
> +                     domain = bo->prefered_domains;
> +             }
> +     } else {
>               domain = bo->allowed_domains;
> +     }
>  
>  retry:
>       amdgpu_ttm_placement_from_domain(bo, domain);
>       initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>       r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -     p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> -             initial_bytes_moved;
> +     bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +                   initial_bytes_moved;
> +     p->bytes_moved += bytes_moved;
> +     if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +         bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +         bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +             p->bytes_moved_vis += bytes_moved;
>  
>       if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
>               domain = bo->allowed_domains;
> @@ -347,7 +391,8 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>               struct amdgpu_bo_list_entry *candidate = p->evictable;
>               struct amdgpu_bo *bo = candidate->robj;
>               struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -             u64 initial_bytes_moved;
> +             u64 initial_bytes_moved, bytes_moved;
> +             bool update_bytes_moved_vis;
>               uint32_t other;
>  
>               /* If we reached our current BO we can forget it */
> @@ -367,10 +412,17 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>  
>               /* Good we can try to move this BO somewhere else */
>               amdgpu_ttm_placement_from_domain(bo, other);
> +             update_bytes_moved_vis =
> +                     adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +                     bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +                     bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT;
>               initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>               r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -             p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> +             bytes_moved = atomic64_read(&adev->num_bytes_moved) -
>                       initial_bytes_moved;
> +             p->bytes_moved += bytes_moved;
> +             if (update_bytes_moved_vis)
> +                     p->bytes_moved_vis += bytes_moved;
>  
>               if (unlikely(r))
>                       break;
> @@ -550,8 +602,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>               list_splice(&need_pages, &p->validated);
>       }
>  
> -     p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
> +     amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
> +                                       &p->bytes_moved_vis_threshold);
>       p->bytes_moved = 0;
> +     p->bytes_moved_vis = 0;
>       p->evictable = list_last_entry(&p->validated,
>                                      struct amdgpu_bo_list_entry,
>                                      tv.head);
> @@ -575,8 +629,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>               goto error_validate;
>       }
>  
> -     amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
> -
> +     amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
> +                                  p->bytes_moved_vis);
>       fpriv->vm.last_eviction_counter =
>               atomic64_read(&p->adev->num_evictions);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index a85e75327456..e429829ae93d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -322,7 +322,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>       struct amdgpu_bo *bo;
>       enum ttm_bo_type type;
>       unsigned long page_align;
> -     u64 initial_bytes_moved;
> +     u64 initial_bytes_moved, bytes_moved;
>       size_t acc_size;
>       int r;
>  
> @@ -398,8 +398,14 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>       r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
>                                &bo->placement, page_align, !kernel, NULL,
>                                acc_size, sg, resv, &amdgpu_ttm_bo_destroy);
> -     amdgpu_cs_report_moved_bytes(adev,
> -             atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved);
> +     bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +                   initial_bytes_moved;
> +     if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +         bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +         bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +             amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
> +     else
> +             amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0);
>  
>       if (unlikely(r != 0))
>               return r;
> -- 
> 2.13.2
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to