On 2026-02-17 11:24, Xiaogang.Chen wrote:
From: Xiaogang Chen<[email protected]>

The current driver reports, and limits, the memory available to each partition
by dividing it equally among the partitions that share the same memory
partition. As a result, an application running on a partitioned GPU may not be
able to use all available memory even though the system still has enough free
memory.

Add an option that an application can use to let a GPU partition allocate all
available memory.

Signed-off-by: Xiaogang Chen<[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  5 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 43 ++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    | 17 +++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c    |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h    |  2 +
  5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3bfd79c89df3..006883c31342 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -805,7 +805,10 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
                } else {
                        tmp = adev->gmc.mem_partitions[mem_id].size;
                }
-               do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
+               if (adev->xcp_mgr->mem_alloc_mode == 
AMDGPU_PARTITION_MEM_ALLOC_EVEN)
+                       do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
I think amdgpu_amdkfd_reserve_mem_limit should be changed to account for the total VRAM used, to
prevent VRAM over-commitment when the mode is not ALLOC_EVEN.
                return ALIGN_DOWN(tmp, PAGE_SIZE);
        } else if (adev->apu_prefer_gtt) {
                return (ttm_tt_pages_limit() << PAGE_SHIFT);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index cab3196a87fb..1da46eeb3f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1580,6 +1580,40 @@ static ssize_t amdgpu_gfx_set_compute_partition(struct 
device *dev,
        return count;
  }
+static ssize_t amdgpu_gfx_get_compute_partition_mem_alloc_mode(
+                                               struct device *dev, struct 
device_attribute *addr,
+                                               char *buf)
+{
+       struct drm_device *ddev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = drm_to_adev(ddev);
+       int mode = adev->xcp_mgr->mem_alloc_mode;
+
+       /* Only minimal precaution taken to reject requests while in reset.*/
+       if (amdgpu_in_reset(adev))
+               return -EPERM;
+
+       return sysfs_emit(buf, "%s\n",
+                         amdgpu_gfx_compute_mem_alloc_mode_desc(mode));
+}
+
+
+static ssize_t amdgpu_gfx_set_compute_partition_mem_alloc_mode(
+                                               struct device *dev, struct 
device_attribute *addr,
+                                               const char *buf, size_t count)
+{
+       struct drm_device *ddev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = drm_to_adev(ddev);
+
+       if (!strncasecmp("EVEN", buf, strlen("EVEN")))
+               adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
+       else if (!strncasecmp("ALL", buf, strlen("ALL")))
+               adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_ALL;
+       else
+               return -EINVAL;
+
+       return count;
+}
+
  static const char *xcp_desc[] = {
        [AMDGPU_SPX_PARTITION_MODE] = "SPX",
        [AMDGPU_DPX_PARTITION_MODE] = "DPX",
@@ -1935,6 +1969,10 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
  static DEVICE_ATTR(compute_reset_mask, 0444,
                   amdgpu_gfx_get_compute_reset_mask, NULL);
+static DEVICE_ATTR(compute_partition_mem_alloc_mode, 0644,
+                  amdgpu_gfx_get_compute_partition_mem_alloc_mode,
+                  amdgpu_gfx_set_compute_partition_mem_alloc_mode);
+
  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
  {
        struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1955,6 +1993,11 @@ static int amdgpu_gfx_sysfs_xcp_init(struct 
amdgpu_device *adev)
        if (r)
                return r;
+ r = device_create_file(adev->dev,
+                              &dev_attr_compute_partition_mem_alloc_mode);
+       if (r)
+               return r;
+
        if (xcp_switch_supported)
                r = device_create_file(adev->dev,
                                       &dev_attr_available_compute_partition);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 720ed3a2c78c..f5713891f205 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -71,6 +71,11 @@ enum amdgpu_gfx_partition {
        AMDGPU_AUTO_COMPUTE_PARTITION_MODE = -2,
  };
+enum amdgpu_gfx_partition_mem_alloc_mode {
+       AMDGPU_PARTITION_MEM_ALLOC_EVEN = 0,
+       AMDGPU_PARTITION_MEM_ALLOC_ALL  = 1,
+};
+
  #define NUM_XCC(x) hweight16(x)
enum amdgpu_gfx_ras_mem_id_type {
@@ -676,4 +681,16 @@ static inline const char *amdgpu_gfx_compute_mode_desc(int 
mode)
        }
  }
+static inline const char *amdgpu_gfx_compute_mem_alloc_mode_desc(int mode)
+{
+       switch (mode) {
+       case AMDGPU_PARTITION_MEM_ALLOC_EVEN:
+               return "EVEN";
+       case AMDGPU_PARTITION_MEM_ALLOC_ALL:
+               return "ALL";
+       default:
+               return "UNKNOWN";
+       }
+}
+
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 73250ab45f20..a2d50f90a066 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -181,6 +181,7 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
        }
xcp_mgr->num_xcps = num_xcps;
+       xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
        amdgpu_xcp_update_partition_sched_list(adev);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 8058e8f35d41..878c1c422893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -132,6 +132,8 @@ struct amdgpu_xcp_mgr {
        struct amdgpu_xcp_cfg *xcp_cfg;
        uint32_t supp_xcp_modes;
        uint32_t avail_xcp_modes;
+       /* used to determine KFD memory alloc mode for each partition */
+       uint32_t mem_alloc_mode;
rename to mem_capping_mode?

Regards,
Philip
  };
struct amdgpu_xcp_mgr_funcs {

Reply via email to