[Public]

> -----Original Message-----
> From: Francis, David <[email protected]>
> Sent: Thursday, March 5, 2026 9:56 AM
> To: [email protected]
> Cc: Koenig, Christian <[email protected]>; Deucher, Alexander
> <[email protected]>; Freehill, Chris <[email protected]>;
> Francis, David <[email protected]>
> Subject: [PATCH] drm/amdgpu: Add profiling counters in fdinfo
>
> Add five counters to the fdinfo for amdgpu device files.
>
> They are:
> amd-vmfault-counter: %llu
> amd-queue-eviction-counter: %llu
> amd-svm-migrate-counter: %llu
> amd-svm-page-fault-counter: %llu
> amd-svm-unmap-counter: %llu
>
> These counters begin at 0 when a device file is opened.
> They are for use by profiling applications.
>
> Signed-off-by: David Francis <[email protected]>

These look reasonable to me.
Acked-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c | 15 ++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  5 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  |  3 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c     | 31
> ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h     | 26 ++++++++++++++++--
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c   |  6 +++++
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c       | 11 ++++++--
>  7 files changed, 90 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> index b349bb3676d5..96d6063ecaa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> @@ -61,6 +61,7 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct
> drm_file *file)
>       struct amdgpu_vm *vm = &fpriv->vm;
>
>       struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM];
> +     struct amdgpu_process_stats process_stats;
>       ktime_t usage[AMDGPU_HW_IP_NUM];
>       const char *pl_name[] = {
>               [TTM_PL_VRAM] = "vram",
> @@ -74,7 +75,7 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct
> drm_file *file)
>       };
>       unsigned int hw_ip, i;
>
> -     amdgpu_vm_get_memory(vm, stats);
> +     amdgpu_vm_get_memory(vm, stats, &process_stats);
>       amdgpu_ctx_mgr_usage(&fpriv->ctx_mgr, usage);
>
>       /*
> @@ -114,6 +115,18 @@ void amdgpu_show_fdinfo(struct drm_printer *p,
> struct drm_file *file)
>                  (stats[TTM_PL_TT].drm.shared +
>                   stats[TTM_PL_TT].drm.private) / 1024UL);
>
> +     /* Amdgpu specific counters: */
> +     drm_printf(p, "amd-vmfault-counter:\t%llu\n",
> +                process_stats.vmfault_counter);
> +     drm_printf(p, "amd-queue-eviction-counter:\t%llu\n",
> +                process_stats.queue_eviction_counter);
> +     drm_printf(p, "amd-svm-migrate-counter:\t%llu\n",
> +                process_stats.svm_migrate_counter);
> +     drm_printf(p, "amd-svm-page-fault-counter:\t%llu\n",
> +                process_stats.svm_page_fault_counter);
> +     drm_printf(p, "amd-svm-unmap-counter:\t%llu\n",
> +                process_stats.svm_unmap_counter);
> +
>       for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
>               if (!usage[hw_ip])
>                       continue;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 82bc6d657e5a..ad1042639dbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -476,6 +476,7 @@ void amdgpu_irq_dispatch(struct amdgpu_device
> *adev,
>       struct amdgpu_iv_entry entry;
>       unsigned int client_id, src_id;
>       struct amdgpu_irq_src *src;
> +     struct amdgpu_vm *vm;
>       bool handled = false;
>       int r;
>
> @@ -513,6 +514,10 @@ void amdgpu_irq_dispatch(struct amdgpu_device
> *adev,
>                       client_id, src_id);
>
>       } else if ((src = adev->irq.client[client_id].sources[src_id])) {
> +             vm = amdgpu_vm_get_vm_from_pasid(adev, entry.pasid);
> +             if (vm)
> +                     amdgpu_vm_increment_process_counter(vm,
> AMDGPU_VM_VMFAULT_COUNTER);
> +
>               r = src->funcs->process(adev, src, &entry);
>               if (r < 0)
>                       dev_err(adev->dev, "error processing interrupt
> (%d)\n", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 001fcfcbde0f..9ba6f166cb5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -1300,6 +1300,9 @@ amdgpu_userq_evict_all(struct
> amdgpu_userq_mgr *uq_mgr)
>               queue = amdgpu_userq_get(uq_mgr, queue_id);
>               if (!queue)
>                       continue;
> +
> +             amdgpu_vm_increment_process_counter(queue-
> >fw_obj.obj->vm_bo->vm,
> +AMDGPU_VM_QUEUE_EVICTION_COUNTER);
> +
>               r = amdgpu_userq_preempt_helper(queue);
>               if (r)
>                       ret = r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 01fef0e4f408..d7d82f23377f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1241,10 +1241,12 @@ int amdgpu_vm_update_range(struct
> amdgpu_device *adev, struct amdgpu_vm *vm,  }
>
>  void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
> -                       struct amdgpu_mem_stats
> stats[__AMDGPU_PL_NUM])
> +                       struct amdgpu_mem_stats
> stats[__AMDGPU_PL_NUM],
> +                       struct amdgpu_process_stats *process_stats)
>  {
>       spin_lock(&vm->status_lock);
>       memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM);
> +     memcpy(process_stats, &vm->process_stats, sizeof(*process_stats));
>       spin_unlock(&vm->status_lock);
>  }
>
> @@ -2472,7 +2474,7 @@ static void amdgpu_vm_destroy_task_info(struct
> kref *kref)
>       kfree(ti);
>  }
>
> -static inline struct amdgpu_vm *
> +inline struct amdgpu_vm *
>  amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)  {
>       struct amdgpu_vm *vm;
> @@ -3234,3 +3236,28 @@ void amdgpu_sdma_set_vm_pte_scheds(struct
> amdgpu_device *adev,
>       adev->vm_manager.vm_pte_num_scheds = adev-
> >sdma.num_instances;
>       adev->vm_manager.vm_pte_funcs = vm_pte_funcs;  }
> +
> +void amdgpu_vm_increment_process_counter(struct amdgpu_vm *vm,
> enum
> +amdgpu_process_stat_type stat_type) {
> +     spin_lock(&vm->status_lock);
> +     switch (stat_type) {
> +     case AMDGPU_VM_VMFAULT_COUNTER:
> +             vm->process_stats.vmfault_counter++;
> +             break;
> +     case AMDGPU_VM_QUEUE_EVICTION_COUNTER:
> +             vm->process_stats.queue_eviction_counter++;
> +             break;
> +     case AMDGPU_VM_SVM_MIGRATE_COUNTER:
> +             vm->process_stats.svm_migrate_counter++;
> +             break;
> +     case AMDGPU_VM_SVM_PAGE_FAULT_COUNTER:
> +             vm->process_stats.svm_page_fault_counter++;
> +             break;
> +     case AMDGPU_VM_SVM_UNMAP_COUNTER:
> +             vm->process_stats.svm_unmap_counter++;
> +             break;
> +     default:
> +             pr_debug("unknown process stat type 0x%x\n", stat_type);
> +     }
> +     spin_unlock(&vm->status_lock);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index dc4b0ec672ec..4a63f0384c7d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -334,6 +334,14 @@ struct amdgpu_mem_stats {
>       uint64_t evicted;
>  };
>
> +struct amdgpu_process_stats {
> +     uint64_t vmfault_counter;
> +     uint64_t queue_eviction_counter;
> +     uint64_t svm_migrate_counter;
> +     uint64_t svm_page_fault_counter;
> +     uint64_t svm_unmap_counter;
> +};
> +
>  struct amdgpu_vm {
>       /* tree of virtual addresses mapped */
>       struct rb_root_cached   va;
> @@ -348,8 +356,9 @@ struct amdgpu_vm {
>       /* Lock to protect vm_bo add/del/move on all lists of vm */
>       spinlock_t              status_lock;
>
> -     /* Memory statistics for this vm, protected by status_lock */
> +     /* Statistics for this vm, protected by status_lock */
>       struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM];
> +     struct amdgpu_process_stats process_stats;
>
>       /*
>        * The following lists contain amdgpu_vm_bo_base objects for either
> @@ -586,6 +595,8 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm
> *vm);
>
>  void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
>
> +struct amdgpu_vm *amdgpu_vm_get_vm_from_pasid(struct
> amdgpu_device
> +*adev, u32 pasid);
> +
>  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>                           u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
>                           bool write_fault);
> @@ -595,7 +606,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm
> *vm);  void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>                               struct amdgpu_vm *vm);
>  void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
> -                       struct amdgpu_mem_stats
> stats[__AMDGPU_PL_NUM]);
> +                       struct amdgpu_mem_stats
> stats[__AMDGPU_PL_NUM],
> +                       struct amdgpu_process_stats *process_stats);
>
>  int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm
> *vm,
>                      struct amdgpu_bo_vm *vmbo, bool immediate); @@ -
> 621,6 +633,16 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device
> *adev, struct amdgpu_vm *vm);
>
>  bool amdgpu_vm_is_bo_always_valid(struct amdgpu_vm *vm, struct
> amdgpu_bo *bo);
>
> +enum amdgpu_process_stat_type {
> +     AMDGPU_VM_VMFAULT_COUNTER,
> +     AMDGPU_VM_QUEUE_EVICTION_COUNTER,
> +     AMDGPU_VM_SVM_MIGRATE_COUNTER,
> +     AMDGPU_VM_SVM_PAGE_FAULT_COUNTER,
> +     AMDGPU_VM_SVM_UNMAP_COUNTER,
> +};
> +
> +void amdgpu_vm_increment_process_counter(struct amdgpu_vm *vm,
> enum
> +amdgpu_process_stat_type stat_type);
> +
>  /**
>   * amdgpu_vm_tlb_seq - return tlb flush sequence number
>   * @vm: the amdgpu_vm structure to query diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index b3d304aab686..c341b6842460 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -427,6 +427,9 @@ svm_migrate_vma_to_vram(struct kfd_node *node,
> struct svm_range *prange,
>                                     start >> PAGE_SHIFT, end >> PAGE_SHIFT,
>                                     0, node->id, prange->prefetch_loc,
>                                     prange->preferred_loc, trigger);
> +     pdd = svm_range_get_pdd_by_node(prange, node);
> +     if (pdd)
> +
>       amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd-
> >drm_priv),
> +AMDGPU_VM_SVM_MIGRATE_COUNTER);
>
>       r = migrate_vma_setup(&migrate);
>       if (r) {
> @@ -729,6 +732,9 @@ svm_migrate_vma_to_ram(struct kfd_node *node,
> struct svm_range *prange,
>                                     start >> PAGE_SHIFT, end >> PAGE_SHIFT,
>                                     node->id, 0, prange->prefetch_loc,
>                                     prange->preferred_loc, trigger);
> +     pdd = svm_range_get_pdd_by_node(prange, node);
> +     if (pdd)
> +
>       amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd-
> >drm_priv),
> +AMDGPU_VM_SVM_MIGRATE_COUNTER);
>
>       r = migrate_vma_setup(&migrate);
>       if (r) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index fcddb54a439f..499882a76581 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1375,9 +1375,11 @@ svm_range_unmap_from_gpus(struct svm_range
> *prange, unsigned long start,
>       struct kfd_process_device *pdd;
>       struct dma_fence *fence = NULL;
>       struct kfd_process *p;
> +     struct amdgpu_vm *vm;
>       uint32_t gpuidx;
>       int r = 0;
>
> +
>       if (!prange->mapped_to_gpu) {
>               pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to
> GPU\n",
>                        prange, prange->start, prange->last); @@ -1398,13
> +1400,14 @@ svm_range_unmap_from_gpus(struct svm_range *prange,
> unsigned long start,
>                       pr_debug("failed to find device idx %d\n", gpuidx);
>                       return -EINVAL;
>               }
> +             vm = drm_priv_to_vm(pdd->drm_priv);
>
>               kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread-
> >pid,
>                                            start, last, trigger);
> +             amdgpu_vm_increment_process_counter(vm,
> AMDGPU_VM_SVM_UNMAP_COUNTER);
>
>               r = svm_range_unmap_from_gpu(pdd->dev->adev,
> -                                          drm_priv_to_vm(pdd->drm_priv),
> -                                          start, last, &fence);
> +                                          vm, start, last, &fence);
>               if (r)
>                       break;
>
> @@ -3039,6 +3042,7 @@ svm_range_restore_pages(struct amdgpu_device
> *adev, unsigned int pasid,
>       struct svm_range_list *svms;
>       struct svm_range *prange;
>       struct kfd_process *p;
> +     struct kfd_process_device *pdd;
>       ktime_t timestamp = ktime_get_boottime();
>       struct kfd_node *node;
>       int32_t best_loc;
> @@ -3193,6 +3197,9 @@ svm_range_restore_pages(struct amdgpu_device
> *adev, unsigned int pasid,
>
>       kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
>                                      write_fault, timestamp);
> +     pdd = svm_range_get_pdd_by_node(prange, node);
> +     if (pdd)
> +
>       amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd-
> >drm_priv),
> +AMDGPU_VM_SVM_MIGRATE_COUNTER);
>
>       /* Align migration range start and size to granularity size */
>       size = 1UL << prange->granularity;
> --
> 2.34.1

Reply via email to