hash for fault handling on GMC9 v2

Kuehling, Felix Thu, 07 Mar 2019 17:38:15 -0800

Hmm, that's a clever (and elegant) little data structure. The series is 
Reviewed-by: Felix Kuehling <[email protected]>


Regards,
   Felix

On 3/7/2019 8:28 AM, Christian König wrote:
> Further testing showed that the idea with the chash doesn't work as expected.
> Especially we can't predict when we can remove the entries from the hash 
> again.
>
> So replace the chash with a ring buffer/hash mix where entries in the 
> container
> age automatically based on their timestamp.
>
> v2: use ring buffer / hash mix
>
> Signed-off-by: Christian König <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 49 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 60 ++-----------------------
>   3 files changed, 86 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 5a32a0d2ad31..579cadd16886 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -240,3 +240,52 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, 
> struct amdgpu_gmc *mc)
>       dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n",
>                       mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>   }
> +
> +/**
> + * amdgpu_gmc_filter_faults - filter VM faults
> + *
> + * @adev: amdgpu device structure
> + * @addr: address of the VM fault
> + * @pasid: PASID of the process causing the fault
> + * @timestamp: timestamp of the fault
> + *
> + * Returns:
> + * True if the fault was filtered and should not be processed further.
> + * False if the fault is a new one and needs to be handled.
> + */
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +                           uint16_t pasid, uint64_t timestamp)
> +{
> +     struct amdgpu_gmc *gmc = &adev->gmc;
> +
> +     uint64_t stamp, key = addr << 4 | pasid;
> +     struct amdgpu_gmc_fault *fault;
> +     uint32_t hash;
> +
> +     /* If we don't have space left in the ring buffer return immediately */
> +     stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
> +             AMDGPU_GMC_FAULT_TIMEOUT;
> +     if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
> +             return true;
> +
> +     /* Try to find the fault in the hash */
> +     hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> +     fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
> +     do {
> +             if (fault->key == key)
> +                     return true;
> +
> +             stamp = fault->timestamp;
> +             fault = &gmc->fault_ring[fault->next];
> +     } while (fault->timestamp < stamp);
> +
> +     /* Add the fault to the ring */
> +     fault = &gmc->fault_ring[gmc->last_fault];
> +     fault->key = key;
> +     fault->timestamp = timestamp;
> +
> +     /* And update the hash */
> +     fault->next = gmc->fault_hash[hash].idx;
> +     gmc->fault_hash[hash].idx = gmc->last_fault++;
> +     return false;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 6ce45664ff87..071145ac67b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -43,8 +43,34 @@
>    */
>   #define AMDGPU_GMC_HOLE_MASK        0x0000ffffffffffffULL
>   
> +/*
> + * Ring size as power of two for the log of recent faults.
> + */
> +#define AMDGPU_GMC_FAULT_RING_ORDER  8
> +#define AMDGPU_GMC_FAULT_RING_SIZE   (1 << AMDGPU_GMC_FAULT_RING_ORDER)
> +
> +/*
> + * Hash size as power of two for the log of recent faults
> + */
> +#define AMDGPU_GMC_FAULT_HASH_ORDER  8
> +#define AMDGPU_GMC_FAULT_HASH_SIZE   (1 << AMDGPU_GMC_FAULT_HASH_ORDER)
> +
> +/*
> + * Number of IH timestamp ticks until a fault is considered handled
> + */
> +#define AMDGPU_GMC_FAULT_TIMEOUT     5000ULL
> +
>   struct firmware;
>   
> +/*
> + * GMC page fault information
> + */
> +struct amdgpu_gmc_fault {
> +     uint64_t        timestamp;
> +     uint64_t        next:AMDGPU_GMC_FAULT_RING_ORDER;
> +     uint64_t        key:52;
> +};
> +
>   /*
>    * VMHUB structures, functions & helpers
>    */
> @@ -141,6 +167,12 @@ struct amdgpu_gmc {
>       struct kfd_vm_fault_info *vm_fault_info;
>       atomic_t                vm_fault_info_updated;
>   
> +     struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE];
> +     struct {
> +             uint64_t        idx:AMDGPU_GMC_FAULT_RING_ORDER;
> +     } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
> +     uint64_t                last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
> +
>       const struct amdgpu_gmc_funcs   *gmc_funcs;
>   
>       struct amdgpu_xgmi xgmi;
> @@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
>                             struct amdgpu_gmc *mc);
>   void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>                            struct amdgpu_gmc *mc);
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +                           uint16_t pasid, uint64_t timestamp);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 84904bd680df..0c37c0afb1bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -416,62 +416,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct 
> amdgpu_device *adev,
>       return 0;
>   }
>   
> -/**
> - * vega10_ih_prescreen_iv - prescreen an interrupt vector
> - *
> - * @adev: amdgpu_device pointer
> - *
> - * Returns true if the interrupt vector should be further processed.
> - */
> -static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev,
> -                               struct amdgpu_iv_entry *entry,
> -                               uint64_t addr)
> -{
> -     struct amdgpu_vm *vm;
> -     u64 key;
> -     int r;
> -
> -     /* No PASID, can't identify faulting process */
> -     if (!entry->pasid)
> -             return true;
> -
> -     /* Not a retry fault */
> -     if (!(entry->src_data[1] & 0x80))
> -             return true;
> -
> -     /* Track retry faults in per-VM fault FIFO. */
> -     spin_lock(&adev->vm_manager.pasid_lock);
> -     vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid);
> -     if (!vm) {
> -             /* VM not found, process it normally */
> -             spin_unlock(&adev->vm_manager.pasid_lock);
> -             return true;
> -     }
> -
> -     key = AMDGPU_VM_FAULT(entry->pasid, addr);
> -     r = amdgpu_vm_add_fault(vm->fault_hash, key);
> -
> -     /* Hash table is full or the fault is already being processed,
> -      * ignore further page faults
> -      */
> -     if (r != 0) {
> -             spin_unlock(&adev->vm_manager.pasid_lock);
> -             return false;
> -     }
> -     /* No locking required with single writer and single reader */
> -     r = kfifo_put(&vm->faults, key);
> -     if (!r) {
> -             /* FIFO is full. Ignore it until there is space */
> -             amdgpu_vm_clear_fault(vm->fault_hash, key);
> -             spin_unlock(&adev->vm_manager.pasid_lock);
> -             return false;
> -     }
> -
> -     spin_unlock(&adev->vm_manager.pasid_lock);
> -     /* It's the first fault for this address, process it normally */
> -     return true;
> -}
> -
>   static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>                               struct amdgpu_irq_src *source,
>                               struct amdgpu_iv_entry *entry)
> @@ -484,9 +428,11 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>       addr = (u64)entry->src_data[0] << 12;
>       addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>   
> -     if (!gmc_v9_0_prescreen_iv(adev, entry, addr))
> +     if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid,
> +                                                 entry->timestamp))
>               return 1; /* This also prevents sending it to KFD */
>   
> +     /* If it's the first fault for this address, process it normally */
>       if (!amdgpu_sriov_vf(adev)) {
>               status = RREG32(hub->vm_l2_pro_fault_status);
>               WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amdgpu: use ring/hash for fault handling on GMC9 v2

Reply via email to