[ping]

Hi Christian or Alex,

Do you mind reviewing this change?

Thanks,
  Felix


On 2017-09-21 08:09 PM, Felix Kuehling wrote:
> When many wavefronts cause VM faults at the same time, it can
> overwhelm the interrupt handler and cause IH ring overflows before
> the driver can notify or kill the faulting application.
>
> As a workaround I'm introducing limited per-VM fault credit. After
> that number of VM faults have occurred, further VM faults are
> filtered out at the prescreen stage of processing.
>
> This depends on the PASID in the interrupt packet, so it currently
> only works for KFD contexts.
>
> Signed-off-by: Felix Kuehling <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 31 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++++++-
>  drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 19 +++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 19 +++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 19 +++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 11 +++++++----
>  7 files changed, 112 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 8fcc743..c91d5c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2682,6 +2682,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>       }
>  
>       INIT_KFIFO(vm->faults);
> +     vm->fault_credit = 16;
>  
>       return 0;
>  
> @@ -2776,6 +2777,36 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm)
>  }
>  
>  /**
> + * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
> + *
> + * @adev: amdgpu_device pointer
> + * @pasid: PASID do identify the VM
> + *
> + * This function is expected to be called in interrupt context. Returns
> + * true if there was fault credit, false otherwise
> + */
> +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
> +                               unsigned int pasid)
> +{
> +     struct amdgpu_vm *vm;
> +
> +     spin_lock(&adev->vm_manager.pasid_lock);
> +     vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
> +     spin_unlock(&adev->vm_manager.pasid_lock);
> +     if (!vm)
> +             /* VM not found, can't track fault credit */
> +             return true;
> +
> +     /* No lock needed. only accessed by IRQ handler */
> +     if (!vm->fault_credit)
> +             /* Too many faults in this VM */
> +             return false;
> +
> +     vm->fault_credit--;
> +     return true;
> +}
> +
> +/**
>   * amdgpu_vm_manager_init - init the VM manager
>   *
>   * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 447ed6e..66efbc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -165,8 +165,11 @@ struct amdgpu_vm {
>       /* Flag to indicate ATS support from PTE for GFX9 */
>       bool                    pte_support_ats;
>  
> -     /* Up to 128 pending page faults */
> +     /* Up to 128 pending retry page faults */
>       DECLARE_KFIFO(faults, u64, 128);
> +
> +     /* Limit non-retry fault storms */
> +     unsigned int            fault_credit;
>  };
>  
>  struct amdgpu_vm_id {
> @@ -244,6 +247,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
>  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>                  int vm_context, unsigned int pasid);
>  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
> +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
> +                               unsigned int pasid);
>  void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
>                        struct list_head *validated,
>                        struct amdgpu_bo_list_entry *entry);
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> index 07d3d89..a870b35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> @@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev)
>   */
>  static bool cik_ih_prescreen_iv(struct amdgpu_device *adev)
>  {
> -     /* Process all interrupts */
> -     return true;
> +     u32 ring_index = adev->irq.ih.rptr >> 2;
> +     u16 pasid;
> +
> +     switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> +     case 146:
> +     case 147:
> +             pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> +             if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> +                     return true;
> +             break;
> +     default:
> +             /* Not a VM fault */
> +             return true;
> +     }
> +
> +     adev->irq.ih.rptr += 16;
> +     return false;
>  }
>  
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index b6cdf4a..fa61d64 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev)
>   */
>  static bool cz_ih_prescreen_iv(struct amdgpu_device *adev)
>  {
> -     /* Process all interrupts */
> -     return true;
> +     u32 ring_index = adev->irq.ih.rptr >> 2;
> +     u16 pasid;
> +
> +     switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> +     case 146:
> +     case 147:
> +             pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> +             if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> +                     return true;
> +             break;
> +     default:
> +             /* Not a VM fault */
> +             return true;
> +     }
> +
> +     adev->irq.ih.rptr += 16;
> +     return false;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> index 65ed6d3..bd592cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> @@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device 
> *adev)
>   */
>  static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev)
>  {
> -     /* Process all interrupts */
> -     return true;
> +     u32 ring_index = adev->irq.ih.rptr >> 2;
> +     u16 pasid;
> +
> +     switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> +     case 146:
> +     case 147:
> +             pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> +             if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> +                     return true;
> +             break;
> +     default:
> +             /* Not a VM fault */
> +             return true;
> +     }
> +
> +     adev->irq.ih.rptr += 16;
> +     return false;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> index 5ed0069..aa4e320 100644
> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> @@ -227,8 +227,23 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev)
>   */
>  static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev)
>  {
> -     /* Process all interrupts */
> -     return true;
> +     u32 ring_index = adev->irq.ih.rptr >> 2;
> +     u16 pasid;
> +
> +     switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> +     case 146:
> +     case 147:
> +             pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> +             if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> +                     return true;
> +             break;
> +     default:
> +             /* Not a VM fault */
> +             return true;
> +     }
> +
> +     adev->irq.ih.rptr += 16;
> +     return false;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index a3b30d8..6973257 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device 
> *adev)
>               return true;
>       }
>  
> -     /* Not a retry fault */
> -     if (!(dw5 & 0x80))
> -             return true;
> -
>       pasid = dw3 & 0xffff;
>       /* No PASID, can't identify faulting process */
>       if (!pasid)
>               return true;
>  
> +     /* Not a retry fault, check fault credit */
> +     if (!(dw5 & 0x80)) {
> +             if (!amdgpu_vm_pasid_fault_credit(adev, pasid))
> +                     goto ignore_iv;
> +             return true;
> +     }
> +
>       addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>       key = AMDGPU_VM_FAULT(pasid, addr);
>       r = amdgpu_ih_add_fault(adev, key);

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to