On 2026-02-17 11:24, Xiaogang.Chen wrote:
From: Xiaogang Chen<[email protected]>
The current driver reports and limits the memory available to each partition
by dividing it equally among the partitions that share the same memory
partition. An application may not be able to use all available memory when it
runs on a partitioned GPU, even though the system still has enough free memory.
Add an option that an application can use to let a GPU partition allocate all
available memory.
Signed-off-by: Xiaogang Chen<[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 43 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 17 +++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 2 +
5 files changed, 67 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3bfd79c89df3..006883c31342 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -805,7 +805,10 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device
*adev, int xcp_id)
} else {
tmp = adev->gmc.mem_partitions[mem_id].size;
}
- do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
+ if (adev->xcp_mgr->mem_alloc_mode ==
AMDGPU_PARTITION_MEM_ALLOC_EVEN)
+ do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
+
I think amdgpu_amdkfd_reserve_mem_limit should be changed to account for the
total VRAM used, to prevent VRAM over-commitment when the mode is not
ALLOC_EVEN.
return ALIGN_DOWN(tmp, PAGE_SIZE);
} else if (adev->apu_prefer_gtt) {
return (ttm_tt_pages_limit() << PAGE_SHIFT);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index cab3196a87fb..1da46eeb3f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1580,6 +1580,40 @@ static ssize_t amdgpu_gfx_set_compute_partition(struct
device *dev,
return count;
}
+static ssize_t amdgpu_gfx_get_compute_partition_mem_alloc_mode(
+ struct device *dev, struct
device_attribute *addr,
+ char *buf)
+{
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = drm_to_adev(ddev);
+ int mode = adev->xcp_mgr->mem_alloc_mode;
+
+ /* Only minimal precaution taken to reject requests while in reset.*/
+ if (amdgpu_in_reset(adev))
+ return -EPERM;
+
+ return sysfs_emit(buf, "%s\n",
+ amdgpu_gfx_compute_mem_alloc_mode_desc(mode));
+}
+
+
+static ssize_t amdgpu_gfx_set_compute_partition_mem_alloc_mode(
+ struct device *dev, struct
device_attribute *addr,
+ const char *buf, size_t count)
+{
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = drm_to_adev(ddev);
+
+ if (!strncasecmp("EVEN", buf, strlen("EVEN")))
+ adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
+ else if (!strncasecmp("ALL", buf, strlen("ALL")))
+ adev->xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_ALL;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
static const char *xcp_desc[] = {
[AMDGPU_SPX_PARTITION_MODE] = "SPX",
[AMDGPU_DPX_PARTITION_MODE] = "DPX",
@@ -1935,6 +1969,10 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
static DEVICE_ATTR(compute_reset_mask, 0444,
amdgpu_gfx_get_compute_reset_mask, NULL);
+static DEVICE_ATTR(compute_partition_mem_alloc_mode, 0644,
+ amdgpu_gfx_get_compute_partition_mem_alloc_mode,
+ amdgpu_gfx_set_compute_partition_mem_alloc_mode);
+
static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
{
struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1955,6 +1993,11 @@ static int amdgpu_gfx_sysfs_xcp_init(struct
amdgpu_device *adev)
if (r)
return r;
+ r = device_create_file(adev->dev,
+ &dev_attr_compute_partition_mem_alloc_mode);
+ if (r)
+ return r;
+
if (xcp_switch_supported)
r = device_create_file(adev->dev,
&dev_attr_available_compute_partition);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 720ed3a2c78c..f5713891f205 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -71,6 +71,11 @@ enum amdgpu_gfx_partition {
AMDGPU_AUTO_COMPUTE_PARTITION_MODE = -2,
};
+enum amdgpu_gfx_partition_mem_alloc_mode {
+ AMDGPU_PARTITION_MEM_ALLOC_EVEN = 0,
+ AMDGPU_PARTITION_MEM_ALLOC_ALL = 1,
+};
+
#define NUM_XCC(x) hweight16(x)
enum amdgpu_gfx_ras_mem_id_type {
@@ -676,4 +681,16 @@ static inline const char *amdgpu_gfx_compute_mode_desc(int
mode)
}
}
+static inline const char *amdgpu_gfx_compute_mem_alloc_mode_desc(int mode)
+{
+ switch (mode) {
+ case AMDGPU_PARTITION_MEM_ALLOC_EVEN:
+ return "EVEN";
+ case AMDGPU_PARTITION_MEM_ALLOC_ALL:
+ return "ALL";
+ default:
+ return "UNKNOWN";
+ }
+}
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 73250ab45f20..a2d50f90a066 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -181,6 +181,7 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int
num_xcps, int mode)
}
xcp_mgr->num_xcps = num_xcps;
+ xcp_mgr->mem_alloc_mode = AMDGPU_PARTITION_MEM_ALLOC_EVEN;
amdgpu_xcp_update_partition_sched_list(adev);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 8058e8f35d41..878c1c422893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -132,6 +132,8 @@ struct amdgpu_xcp_mgr {
struct amdgpu_xcp_cfg *xcp_cfg;
uint32_t supp_xcp_modes;
uint32_t avail_xcp_modes;
+ /* used to determine KFD memory alloc mode for each partition */
+ uint32_t mem_alloc_mode;
Should this be renamed to mem_capping_mode?
Regards,
Philip
};
struct amdgpu_xcp_mgr_funcs {