On 2/20/2026 1:51 PM, Philip Yang wrote:


On 2026-02-17 11:24, Xiaogang.Chen wrote:
From: Xiaogang Chen<[email protected]>

The current driver reports and limits memory allocation for each partition equally among the partitions that share the same memory partition. An application may not be able to use all available memory when running on a partitioned GPU even though the system still has
enough free memory.

Add an option that an application can use to let a GPU partition allocate all
available memory.

Signed-off-by: Xiaogang Chen<[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  5 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 43 ++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    | 17 +++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c    |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h    |  2 +
  5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3bfd79c89df3..006883c31342 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -805,7 +805,10 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)
          } else {
              tmp = adev->gmc.mem_partitions[mem_id].size;
          }
-        do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
+        if (adev->xcp_mgr->mem_alloc_mode == AMDGPU_PARTITION_MEM_ALLOC_EVEN)
+            do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
I think amdgpu_amdkfd_reserve_mem_limit should be changed to account for the total VRAM used, to prevent VRAM
over-commitment in modes other than ALLOC_EVEN.

I thought about it. The customer who wants this knows there is an OOM risk if they keep allocating more than the system has. If we add a limit, I think we need to decide what percentage of RAM is reserved for the system. That applies to APUs. For a dGPU, the allocation would fail or cause eviction if VRAM ran out.

          return ALIGN_DOWN(tmp, PAGE_SIZE);
      } else if (adev->apu_prefer_gtt) {
          return (ttm_tt_pages_limit() << PAGE_SHIFT);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index cab3196a87fb..1da46eeb3f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1580,6 +1580,40 @@ static ssize_t amdgpu_gfx_set_compute_partition(struct device *dev,
      return count;
  }
  +static ssize_t amdgpu_gfx_get_compute_partition_mem_alloc_mode(
+                        struct device *dev, struct device_attribute *addr,
+                        char *buf)
+{
+    struct drm_device *ddev = dev_get_drvdata(dev);
+    struct amdgpu_device *adev = drm_to_adev(ddev);
+    int mode = adev->xcp_mgr->mem_alloc_mode;
+
+    /* Only minimal precaution taken to reject requests while in reset.*/
+    if (amdgpu_in_reset(adev))
+        return -EPERM;
+
+    return sysfs_emit(buf, "%s\n",
+              amdgpu_gfx_compute_mem_alloc_mode_desc(mode));
+}
+
+
+static ssize_t amdgpu_gfx_set_compute_partition_mem_alloc_mode(
+                        struct device *dev, struct device_attribute *addr,
+                        const char *buf, size_t count)
+{
+    struct drm_device *ddev = dev_get_drvdata(dev);
+    struct amdgpu_device *adev = drm_to_adev(ddev);
+
+    if (!strncasecmp("EVEN", buf, strlen("EVEN")))
+        adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
+    else if (!strncasecmp("ALL", buf, strlen("ALL")))
+        adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_ALL;
+    else
+        return -EINVAL;
+
+    return count;
+}
+
  static const char *xcp_desc[] = {
      [AMDGPU_SPX_PARTITION_MODE] = "SPX",
      [AMDGPU_DPX_PARTITION_MODE] = "DPX",
@@ -1935,6 +1969,10 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
  static DEVICE_ATTR(compute_reset_mask, 0444,
             amdgpu_gfx_get_compute_reset_mask, NULL);
  +static DEVICE_ATTR(compute_partition_mem_alloc_mode, 0644,
+           amdgpu_gfx_get_compute_partition_mem_alloc_mode,
+           amdgpu_gfx_set_compute_partition_mem_alloc_mode);
+
  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
  {
      struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1955,6 +1993,11 @@ static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
      if (r)
          return r;
  +    r = device_create_file(adev->dev,
+ &dev_attr_compute_partition_mem_alloc_mode);
+    if (r)
+        return r;
+
      if (xcp_switch_supported)
          r = device_create_file(adev->dev,
&dev_attr_available_compute_partition);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 720ed3a2c78c..f5713891f205 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -71,6 +71,11 @@ enum amdgpu_gfx_partition {
      AMDGPU_AUTO_COMPUTE_PARTITION_MODE = -2,
  };
  +enum amdgpu_gfx_partition_mem_alloc_mode {
+    AMDGPU_PARTITION_MEM_ALLOC_EVEN = 0,
+    AMDGPU_PARTITION_MEM_ALLOC_ALL  = 1,
+};
+
  #define NUM_XCC(x) hweight16(x)
    enum amdgpu_gfx_ras_mem_id_type {
@@ -676,4 +681,16 @@ static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
      }
  }
  +static inline const char *amdgpu_gfx_compute_mem_alloc_mode_desc(int mode)
+{
+    switch (mode) {
+    case AMDGPU_PARTITION_MEM_ALLOC_EVEN:
+        return "EVEN";
+    case AMDGPU_PARTITION_MEM_ALLOC_ALL:
+        return "ALL";
+    default:
+        return "UNKNOWN";
+    }
+}
+
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 73250ab45f20..a2d50f90a066 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -181,6 +181,7 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode)
      }
        xcp_mgr->num_xcps = num_xcps;
+    xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
      amdgpu_xcp_update_partition_sched_list(adev);
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 8058e8f35d41..878c1c422893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -132,6 +132,8 @@ struct amdgpu_xcp_mgr {
      struct amdgpu_xcp_cfg *xcp_cfg;
      uint32_t supp_xcp_modes;
      uint32_t avail_xcp_modes;
+    /* used to determine KFD memory alloc mode for each partition */
+    uint32_t mem_alloc_mode;
rename to mem_capping_mode?

I think this name is more general, since the modes are declared in enum amdgpu_gfx_partition_mem_alloc_mode, which indicates the meaning of each mode. If a new mode is needed, we can add it to this enum.

Regards

Xiaogang


Regards,
Philip
  };
    struct amdgpu_xcp_mgr_funcs {

Reply via email to