The gpu_metrics sysfs file carries a binary blob but was implemented as
a text device_attribute, which imposes a hard PAGE_SIZE-1 cap and makes
it impossible for userspace to distinguish a successful short read from
a truncated one (stat reports 4 KiB regardless of actual payload size).

Convert gpu_metrics to a bin_attribute.  The declared file size is an
upper bound (128 KiB); the real payload length is the byte count
returned before EOF.

To guarantee that multi-chunk reads (which kernfs splits at PAGE_SIZE
boundaries) return a coherent snapshot, each reader gets its own cached
metrics buffer, stored in an xarray keyed by the reader's struct file
pointer.  When the read offset is 0, PMFW is sampled and the result is
stored in that reader's entry; reads at subsequent offsets are served
from the same snapshot.  The entry is freed once the reader hits EOF.
A mutex serialises xarray mutations and PMFW access.

Entries from readers that close without ever reaching EOF (e.g., a
killed process) would otherwise leak; they are evicted lazily: whenever
a new reader begins a read at offset 0, any other entry whose snapshot
is older than 30 seconds is freed.

This per-reader approach avoids the cross-contamination problem of
a single shared cache: concurrent readers each see their own coherent
PMFW snapshot rather than risking one reader's off=0 overwriting the
buffer while another reader is mid-way through a multi-chunk read.

V3: Per-reader snapshot state, so each open fd gets its own cached
    gpu_metrics buffer and all chunks for that reader come from the
    same sample (Vitaly)

V4: Remove the residual PAGE_SIZE cap that v3 still carried over from
    v1. The whole point of the bin_attribute conversion is to
    lift the old PAGE_SIZE-1 limit, but v3 still had:
      - WARN_ON_ONCE(len > PAGE_SIZE) with silent truncation to 4K
      - kzalloc(PAGE_SIZE) for the snapshot buffer (fixed allocation)
      - a separate memcpy from metrics into that fixed buffer
    In v4 the snapshot buffer is allocated with kmemdup() sized to the
    actual payload returned by amdgpu_dpm_get_gpu_metrics(), so payloads
    larger than PAGE_SIZE work correctly.  The only remaining upper bound
    is AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ (128 KiB), which is the
    declared bin_attribute file size and serves as a sanity check —
    exceeding it now returns -EOVERFLOW instead of silently truncating.
    (Tom)

Signed-off-by: Tom St Denis <[email protected]>
Acked-by: Vitaly Prosyak <[email protected]>
Change-Id: Ib4fa233d9a25a396f1cc7d5fcf74f5f6578329f5
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c      | 208 +++++++++++++++++++++---
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h |   4 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h  |   1 -
 3 files changed, 188 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index a4d8e667eafb..303a2d643b10 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -34,8 +34,19 @@
 #include <linux/nospec.h>
 #include <linux/pm_runtime.h>
 #include <linux/string_choices.h>
+#include <linux/sysfs.h>
+#include <linux/sizes.h>
+#include <linux/xarray.h>
 #include <asm/processor.h>
 
+/*
+ * Sysfs reports this as the file size (stat/ls); kernfs also uses it to cap
+ * read offsets.  Actual payload length is the return value of
+ * amdgpu_dpm_get_gpu_metrics() and must not exceed this.
+ */
+#define AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ     SZ_128K
+
+
 #define MAX_NUM_OF_FEATURES_PER_SUBSET         8
 #define MAX_NUM_OF_SUBSETS                     8
 
@@ -1734,43 +1745,170 @@ static ssize_t amdgpu_get_pm_metrics(struct device 
*dev,
  * DOC: gpu_metrics
  *
  * The amdgpu driver provides a sysfs API for retrieving current gpu
- * metrics data. The file gpu_metrics is used for this. Reading the
- * file will dump all the current gpu metrics data.
+ * metrics data.  The binary sysfs file gpu_metrics is used for this.
+ * Reading the file returns the raw metrics blob.  The sysfs file size
+ * is an upper bound for inode metadata; the real length is the amount
+ * returned before EOF.
+ *
+ * Metrics are sampled atomically per reader: the first read at offset 0
+ * captures a snapshot into a per-fd cache; subsequent reads at higher
+ * offsets (for payloads that span multiple pages) are served from that
+ * same snapshot.  Concurrent readers each get their own cache.
  *
  * These data include temperature, frequency, engines utilization,
  * power consume, throttler status, fan speed and cpu core statistics(
  * available for APU only). That's it will give a snapshot of all sensors
  * at the same time.
  */
-static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
-                                     struct device_attribute *attr,
-                                     char *buf)
+
+/* Per-reader snapshot entry, keyed by struct file pointer in the xarray */
+struct gpu_metrics_snap_entry {
+       void    *data;
+       size_t  size;
+       ktime_t timestamp;
+};
+
+/* Evict stale entries from readers that closed without reaching EOF */
+#define GPU_METRICS_SNAP_STALE_NS      (30ULL * NSEC_PER_SEC)
+
+static void gpu_metrics_evict_stale_locked(struct xarray *xa,
+                                          unsigned long skip_key)
+{
+       struct gpu_metrics_snap_entry *entry;
+       unsigned long idx;
+       ktime_t cutoff;
+
+       cutoff = ktime_sub(ktime_get(), ns_to_ktime(GPU_METRICS_SNAP_STALE_NS));
+
+       xa_for_each(xa, idx, entry) {
+               if (idx != skip_key && ktime_before(entry->timestamp, cutoff)) {
+                       xa_erase(xa, idx);
+                       kfree(entry->data);
+                       kfree(entry);
+               }
+       }
+}
+
+static void gpu_metrics_free_all(struct xarray *xa)
 {
+       struct gpu_metrics_snap_entry *entry;
+       unsigned long idx;
+
+       xa_for_each(xa, idx, entry) {
+               xa_erase(xa, idx);
+               kfree(entry->data);
+               kfree(entry);
+       }
+       xa_destroy(xa);
+}
+
+static bool amdgpu_pm_gpu_metrics_bin_visible(struct amdgpu_device *adev,
+                                             uint32_t mask)
+{
+       uint32_t gc_ver = amdgpu_ip_version(adev, GC_HWIP, 0);
+
+       if (!((ATTR_FLAG_BASIC | ATTR_FLAG_ONEVF) & mask))
+               return false;
+
+       return gc_ver >= IP_VERSION(9, 1, 0);
+}
+
+static ssize_t amdgpu_sysfs_gpu_metrics_read(struct file *f,
+                                             struct kobject *kobj,
+                                             const struct bin_attribute *attr,
+                                             char *buf, loff_t off,
+                                             size_t count)
+{
+       struct device *dev = kobj_to_dev(kobj);
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
-       void *gpu_metrics;
-       ssize_t size = 0;
-       int ret;
+       unsigned long key = (unsigned long)f;
+       struct gpu_metrics_snap_entry *entry;
+       ssize_t ret;
 
-       ret = amdgpu_pm_get_access_if_active(adev);
-       if (ret)
-               return ret;
+       mutex_lock(&adev->pm.gpu_metrics_lock);
 
-       size = amdgpu_dpm_get_gpu_metrics(adev, &gpu_metrics);
-       if (size <= 0)
-               goto out;
+       if (off == 0) {
+               void *metrics;
+               void *old;
+               int len;
 
-       if (size >= PAGE_SIZE)
-               size = PAGE_SIZE - 1;
+               /* Evict stale entries from readers that never hit EOF */
+               gpu_metrics_evict_stale_locked(&adev->pm.gpu_metrics_readers,
+                                              key);
 
-       memcpy(buf, gpu_metrics, size);
+               ret = amdgpu_pm_get_access(adev);
+               if (ret)
+                       goto out_unlock;
 
-out:
-       amdgpu_pm_put_access(adev);
+               len = amdgpu_dpm_get_gpu_metrics(adev, &metrics);
+               amdgpu_pm_put_access(adev);
 
-       return size;
+               if (len <= 0) {
+                       ret = len;
+                       goto out_unlock;
+               }
+
+               if (WARN_ON_ONCE(len > AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ)) {
+                       ret = -EOVERFLOW;
+                       goto out_unlock;
+               }
+
+               entry = xa_load(&adev->pm.gpu_metrics_readers, key);
+               if (!entry) {
+                       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+                       if (!entry) {
+                               ret = -ENOMEM;
+                               goto out_unlock;
+                       }
+                       old = xa_store(&adev->pm.gpu_metrics_readers, key,
+                                      entry, GFP_KERNEL);
+                       if (xa_is_err(old)) {
+                               kfree(entry);
+                               ret = xa_err(old);
+                               goto out_unlock;
+                       }
+               }
+
+               /* (Re-)allocate snapshot buffer sized to actual payload */
+               kfree(entry->data);
+               entry->data = kmemdup(metrics, len, GFP_KERNEL);
+               if (!entry->data) {
+                       xa_erase(&adev->pm.gpu_metrics_readers, key);
+                       kfree(entry);
+                       ret = -ENOMEM;
+                       goto out_unlock;
+               }
+               entry->size = len;
+               entry->timestamp = ktime_get();
+       } else {
+               entry = xa_load(&adev->pm.gpu_metrics_readers, key);
+               if (!entry) {
+                       ret = -EIO;
+                       goto out_unlock;
+               }
+       }
+
+       if (off >= entry->size) {
+               xa_erase(&adev->pm.gpu_metrics_readers, key);
+               kfree(entry->data);
+               kfree(entry);
+               ret = 0;
+               goto out_unlock;
+       }
+
+       count = min_t(size_t, count, entry->size - off);
+       memcpy(buf, (u8 *)entry->data + off, count);
+       ret = count;
+
+out_unlock:
+       mutex_unlock(&adev->pm.gpu_metrics_lock);
+       return ret;
 }
 
+static const BIN_ATTR(gpu_metrics, 0444, amdgpu_sysfs_gpu_metrics_read, NULL,
+                     AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ);
+
 static int amdgpu_show_powershift_percent(struct device *dev,
                                        char *buf, enum amd_pp_sensors sensor)
 {
@@ -2579,7 +2717,6 @@ static struct amdgpu_device_attr amdgpu_device_attrs[] = {
        AMDGPU_DEVICE_ATTR_RO(unique_id,                                
ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(thermal_throttling_logging,               
ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(apu_thermal_cap,                          
ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
-       AMDGPU_DEVICE_ATTR_RO(gpu_metrics,                              
ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RO(smartshift_apu_power,                     
ATTR_FLAG_BASIC,
                              .attr_update = ss_power_attr_update),
        AMDGPU_DEVICE_ATTR_RO(smartshift_dgpu_power,                    
ATTR_FLAG_BASIC,
@@ -2657,9 +2794,6 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
                     gc_ver != IP_VERSION(9, 4, 3)) ||
                    gc_ver < IP_VERSION(9, 0, 0))
                        *states = ATTR_STATE_UNSUPPORTED;
-       } else if (DEVICE_ATTR_IS(gpu_metrics)) {
-               if (gc_ver < IP_VERSION(9, 1, 0))
-                       *states = ATTR_STATE_UNSUPPORTED;
        } else if (DEVICE_ATTR_IS(pp_power_profile_mode)) {
                if (amdgpu_dpm_get_power_profile_mode(adev, NULL) == 
-EOPNOTSUPP)
                        *states = ATTR_STATE_UNSUPPORTED;
@@ -4755,6 +4889,19 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
        if (ret)
                goto err_out0;
 
+       if (amdgpu_pm_gpu_metrics_bin_visible(adev, mask)) {
+               mutex_init(&adev->pm.gpu_metrics_lock);
+               xa_init(&adev->pm.gpu_metrics_readers);
+               ret = sysfs_create_bin_file(&adev->dev->kobj,
+                                           &bin_attr_gpu_metrics);
+               if (ret) {
+                       xa_destroy(&adev->pm.gpu_metrics_readers);
+                       mutex_destroy(&adev->pm.gpu_metrics_lock);
+                       goto err_out1;
+               }
+               adev->pm.gpu_metrics_bin_registered = true;
+       }
+
        if (amdgpu_dpm_is_overdrive_supported(adev)) {
                ret = amdgpu_od_set_init(adev);
                if (ret)
@@ -4806,6 +4953,12 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
        return 0;
 
 err_out1:
+       if (adev->pm.gpu_metrics_bin_registered) {
+               sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_gpu_metrics);
+               gpu_metrics_free_all(&adev->pm.gpu_metrics_readers);
+               mutex_destroy(&adev->pm.gpu_metrics_lock);
+               adev->pm.gpu_metrics_bin_registered = false;
+       }
        amdgpu_device_attr_remove_groups(adev, &adev->pm.pm_attr_list);
 err_out0:
        if (adev->pm.int_hwmon_dev)
@@ -4821,6 +4974,13 @@ void amdgpu_pm_sysfs_fini(struct amdgpu_device *adev)
        if (adev->pm.int_hwmon_dev)
                hwmon_device_unregister(adev->pm.int_hwmon_dev);
 
+       if (adev->pm.gpu_metrics_bin_registered) {
+               sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_gpu_metrics);
+               gpu_metrics_free_all(&adev->pm.gpu_metrics_readers);
+               mutex_destroy(&adev->pm.gpu_metrics_lock);
+               adev->pm.gpu_metrics_bin_registered = false;
+       }
+
        amdgpu_device_attr_remove_groups(adev, &adev->pm.pm_attr_list);
 }
 
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index aa3f427819a0..3677a4f543fb 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -349,6 +349,10 @@ struct amdgpu_pm {
        /* dpm */
        bool                    dpm_enabled;
        bool                    sysfs_initialized;
+       bool                    gpu_metrics_bin_registered;
+       struct mutex            gpu_metrics_lock;
+       /* per-reader snapshot entries, keyed by (unsigned long)struct file * */
+       struct xarray           gpu_metrics_readers;
        struct amdgpu_dpm       dpm;
        const struct firmware   *fw;    /* SMC firmware */
        uint32_t                fw_version;
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
index c12ced32f780..dc6875871f1d 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
@@ -73,7 +73,6 @@ enum amdgpu_device_attr_id {
        device_attr_id__unique_id,
        device_attr_id__thermal_throttling_logging,
        device_attr_id__apu_thermal_cap,
-       device_attr_id__gpu_metrics,
        device_attr_id__smartshift_apu_power,
        device_attr_id__smartshift_dgpu_power,
        device_attr_id__smartshift_bias,
-- 
2.51.0

Reply via email to