Use dedicated memory as VF RAS command buffer

Instead of allocating a temporary VRAM buffer object for every remote
RAS command, carve dedicated block-ECC and command buffers out of the
SR-IOV RAS telemetry region and hand out their CPU address / guest
physical address pairs through the new struct amdgpu_virt_shared_mem.
Signed-off-by: YiPeng Chai <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 28 ++++-
.../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 114 ++++++++++++------
.../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h | 9 +-
3 files changed, 107 insertions(+), 44 deletions(-)
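
Usage sketch (reviewer note, not part of the patch): a minimal example of how
a caller such as amdgpu_virt_ras_remote_ioctl_cmd() is expected to obtain and
use the dedicated buffer after this change. All identifiers are taken from the
diff below; the surrounding variables (ras_core, cmd, mem_len) are assumed
from that function, and error handling is abbreviated.

    /* Borrow a slot from the RAS telemetry region instead of allocating
     * a temporary VRAM buffer object for this command.
     */
    struct amdgpu_virt_shared_mem shared_mem = {0};
    int ret;

    ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id,
                                             mem_len, &shared_mem);
    if (ret)
        return ret;

    /* cpu_addr is the CPU mapping of the buffer; gpa is the same buffer
     * expressed as the offset the host side consumes (vram_start is
     * already subtracted inside the helper).
     */
    memcpy(shared_mem.cpu_addr, cmd, sizeof(*cmd));
    ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev, shared_mem.gpa,
                                          mem_len);
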
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index c20d10263492..067832b5936c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -498,14 +498,30 @@ struct amd_sriov_ras_chk_criti {
uint32_t hit;
};
+union amd_sriov_ras_host_push {
+ struct amd_sriov_ras_telemetry_error_count error_count;
+ struct amd_sriov_ras_cper_dump cper_dump;
+ struct amd_sriov_ras_chk_criti chk_criti;
+};
+
+#define AMD_SRIOV_UNIRAS_CMD_MAX_SIZE (PAGE_SIZE * 13)
+struct amd_sriov_uniras_shared_mem {
+ uint8_t blocks_ecc_buf[PAGE_SIZE];
+ uint8_t cmd_buf[AMD_SRIOV_UNIRAS_CMD_MAX_SIZE];
+};
+
struct amdsriov_ras_telemetry {
struct amd_sriov_ras_telemetry_header header;
-
- union {
- struct amd_sriov_ras_telemetry_error_count error_count;
- struct amd_sriov_ras_cper_dump cper_dump;
- struct amd_sriov_ras_chk_criti chk_criti;
- } body;
+ union amd_sriov_ras_host_push body;
+ struct amd_sriov_uniras_shared_mem uniras_shared_mem;
+
+ /* Reserved padding keeps the total size of this structure within
+ * AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 KB.
+ */
+ uint8_t reserved[AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 * 1024 -
+ sizeof(struct amd_sriov_ras_telemetry_header) -
+ sizeof(union amd_sriov_ras_host_push) -
+ sizeof(struct amd_sriov_uniras_shared_mem)];
};
/* version data stored in MAILBOX_MSGBUF_RCV_DW1 for future expansion */
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
index a75479593864..0c72e1e5834c 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
@@ -30,33 +30,82 @@
#include "amdgpu_virt_ras_cmd.h"
#include "amdgpu_ras_mgr.h"
+static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
+ uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
+{
+ struct amdgpu_device *adev = ras_core->dev;
+ struct amdsriov_ras_telemetry *ras_telemetry_cpu;
+ struct amdsriov_ras_telemetry *ras_telemetry_gpu;
+ uint64_t fw_vram_usage_start_offset = 0;
+ uint64_t ras_telemetry_offset = 0;
+
+ if (!adev->virt.fw_reserve.ras_telemetry)
+ return -EINVAL;
+
+ if (adev->mman.fw_vram_usage_va &&
+ adev->mman.fw_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
+ fw_vram_usage_start_offset = adev->mman.fw_vram_usage_start_offset;
+ ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
+ (uintptr_t)adev->mman.fw_vram_usage_va;
+ } else if (adev->mman.drv_vram_usage_va &&
+ adev->mman.drv_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
+ fw_vram_usage_start_offset = adev->mman.drv_vram_usage_start_offset;
+ ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
+ (uintptr_t)adev->mman.drv_vram_usage_va;
+ } else {
+ return -EINVAL;
+ }
+
+ ras_telemetry_cpu =
+ (struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
+ ras_telemetry_gpu =
+ (struct amdsriov_ras_telemetry *)(fw_vram_usage_start_offset +
+ ras_telemetry_offset);
+
+ if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
+ if (mem_size > PAGE_SIZE)
+ return -ENOMEM;
+
+ shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
+ shared_mem->gpa =
+ (uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
+ adev->gmc.vram_start;
+ shared_mem->size = mem_size;
+ } else {
+ if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
+ return -ENOMEM;
+
+ shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
+ shared_mem->gpa =
+ (uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
+ adev->gmc.vram_start;
+ shared_mem->size = mem_size;
+ }
+
+ return 0;
+}
+
static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
{
- struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size,
AMDGPU_GPU_PAGE_SIZE);
struct ras_cmd_ctx *rcmd;
- struct amdgpu_bo *rcmd_bo = NULL;
- uint64_t mc_addr = 0;
- void *cpu_addr = NULL;
+ struct amdgpu_virt_shared_mem shared_mem = {0};
int ret = 0;
- ret = amdgpu_bo_create_kernel(adev, mem_len, PAGE_SIZE,
- AMDGPU_GEM_DOMAIN_VRAM, &rcmd_bo, &mc_addr, (void **)&cpu_addr);
+ ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
if (ret)
return ret;
- rcmd = (struct ras_cmd_ctx *)cpu_addr;
+ rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
memset(rcmd, 0, mem_len);
memcpy(rcmd, cmd, sizeof(*cmd));
ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
- mc_addr - adev->gmc.vram_start, mem_len);
+ shared_mem.gpa, mem_len);
if (!ret) {
- if (rcmd->cmd_res) {
- ret = rcmd->cmd_res;
- goto out;
- }
+ if (rcmd->cmd_res)
+ return rcmd->cmd_res;
cmd->cmd_res = rcmd->cmd_res;
cmd->output_size = rcmd->output_size;
@@ -64,9 +113,6 @@ static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
memcpy(output_data, rcmd->output_buff_raw,
rcmd->output_size);
}
-out:
- amdgpu_bo_free_kernel(&rcmd_bo, &mc_addr, &cpu_addr);
-
return ret;
}
@@ -77,6 +123,9 @@ static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
struct ras_cmd_ctx rcmd = {0};
int ret;
+ if (input_size > RAS_CMD_MAX_IN_SIZE)
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
rcmd.cmd_id = cmd_id;
rcmd.input_size = input_size;
memcpy(rcmd.input_buff_raw, input_data, input_size);
@@ -146,7 +195,7 @@ static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core,
struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
struct batch_ras_trace_info *batch;
int ret = 0;
- uint8_t i;
+ uint32_t i;
if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
(batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {
@@ -249,14 +298,14 @@ static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
{
struct ras_cmd_ctx *rcmd;
- if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
+ if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
return -EINVAL;
- rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;
+ rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;
rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
- rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);
+ rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);
return 0;
}
@@ -305,15 +354,15 @@ static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
if (!virt_ras->blocks_ecc.auto_update_actived) {
ret = __set_cmd_auto_update(adev,
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
- blks_ecc->mc_addr - adev->gmc.vram_start,
- blks_ecc->size, true);
+ blks_ecc->shared_mem.gpa,
+ blks_ecc->shared_mem.size, true);
if (ret)
return ret;
blks_ecc->auto_update_actived = true;
}
- blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
+ blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
output_data->ce_count =
blks_ecc_rsp->blocks[input_data->block_id].ce_count;
@@ -392,11 +441,9 @@ int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
amdgpu_virt_get_ras_capability(adev);
memset(blks_ecc, 0, sizeof(*blks_ecc));
- blks_ecc->size = PAGE_SIZE;
- if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
- PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
- &blks_ecc->bo, &blks_ecc->mc_addr,
- (void **)&blks_ecc->cpu_addr))
+ if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
+ RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
+ PAGE_SIZE, &blks_ecc->shared_mem))
return -ENOMEM;
return 0;
@@ -409,18 +456,15 @@ int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
- if (blks_ecc->bo) {
+ if (blks_ecc->shared_mem.cpu_addr) {
__set_cmd_auto_update(adev,
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
- blks_ecc->mc_addr - adev->gmc.vram_start,
- blks_ecc->size, false);
+ blks_ecc->shared_mem.gpa,
+ blks_ecc->shared_mem.size, false);
- memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
- amdgpu_bo_free_kernel(&blks_ecc->bo,
- &blks_ecc->mc_addr, &blks_ecc->cpu_addr);
-
- memset(blks_ecc, 0, sizeof(*blks_ecc));
+ memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);
}
+ memset(blks_ecc, 0, sizeof(*blks_ecc));
return 0;
}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
index 53b0f3f60103..001e4cfb823e 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
@@ -30,11 +30,14 @@ struct remote_batch_trace_mgr {
struct ras_cmd_batch_trace_record_rsp batch_trace;
};
-struct vram_blocks_ecc {
- struct amdgpu_bo *bo;
- uint64_t mc_addr;
+struct amdgpu_virt_shared_mem {
+ uint64_t gpa;
void *cpu_addr;
uint32_t size;
+};
+
+struct vram_blocks_ecc {
+ struct amdgpu_virt_shared_mem shared_mem;
bool auto_update_actived;
};
--
2.43.0