Add UMC common RAS functions.
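
Introduce the rascore UMC module. It provides the common helpers for
memory-controller RAS handling: logging bad banks reported through ECC
interrupts (deferring the logging while a GPU reset is in progress),
converting MCA addresses to SoC physical addresses through the RAS TA,
tracking retired pages across NPS memory partition modes, and loading
and saving bad page records from and to the EEPROM table.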

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
Reviewed-by: Tao Zhou <tao.zh...@amd.com>
Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 706 ++++++++++++++++++++++
 drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 166 +++++
 2 files changed, 872 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_umc.c
 create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_umc.h

diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
new file mode 100644
index 000000000000..4067359bb299
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_umc.h"
+#include "ras_umc_v12_0.h"
+
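+/* Maximum number of new ECC records consumed in one retirement pass */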
+#define MAX_ECC_NUM_PER_RETIREMENT  16
+
+/* bad page timestamp format
+ * yy[31:27] mm[26:23] day[22:17] hh[16:12] mm[11:6] ss[5:0]
+ */
+#define EEPROM_TIMESTAMP_MINUTE  6
+#define EEPROM_TIMESTAMP_HOUR    12
+#define EEPROM_TIMESTAMP_DAY     17
+#define EEPROM_TIMESTAMP_MONTH   23
+#define EEPROM_TIMESTAMP_YEAR    27
+
+static uint64_t ras_umc_get_eeprom_timestamp(struct ras_core_context *ras_core)
+{
+       struct ras_time tm = {0};
+       uint64_t utc_timestamp = 0;
+       uint64_t eeprom_timestamp = 0;
+
+       utc_timestamp = ras_core_get_utc_second_timestamp(ras_core);
+       if (!utc_timestamp)
+               return utc_timestamp;
+
+       ras_core_convert_timestamp_to_time(ras_core, utc_timestamp, &tm);
+
+       /* the year field covers 2000 ~ 2031; clamp the year to that range */
+       if (tm.tm_year < 2000)
+               tm.tm_year = 2000;
+       if (tm.tm_year > 2031)
+               tm.tm_year = 2031;
+
+       tm.tm_year -= 2000;
+
+       eeprom_timestamp = tm.tm_sec + (tm.tm_min << EEPROM_TIMESTAMP_MINUTE)
+                               + (tm.tm_hour << EEPROM_TIMESTAMP_HOUR)
+                               + (tm.tm_mday << EEPROM_TIMESTAMP_DAY)
+                               + (tm.tm_mon << EEPROM_TIMESTAMP_MONTH)
+                               + ((uint64_t)tm.tm_year << EEPROM_TIMESTAMP_YEAR);
+       eeprom_timestamp &= 0xffffffff;
+
+       return eeprom_timestamp;
+}
+
+static const struct ras_umc_ip_func *ras_umc_get_ip_func(
+                               struct ras_core_context *ras_core, uint32_t ip_version)
+{
+       switch (ip_version) {
+       case IP_VERSION(12, 0, 0):
+               return &ras_umc_func_v12_0;
+       default:
+               RAS_DEV_ERR(ras_core->dev,
+                       "UMC ip version(0x%x) is not supported!\n", ip_version);
+               break;
+       }
+
+       return NULL;
+}
+
+int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
+               struct umc_mca_addr *in, struct umc_phy_addr *out,
+               uint32_t nps)
+{
+       struct ras_ta_query_address_input addr_in;
+       struct ras_ta_query_address_output addr_out;
+       int ret;
+
+       if (!in)
+               return -EINVAL;
+
+       memset(&addr_in, 0, sizeof(addr_in));
+       memset(&addr_out, 0, sizeof(addr_out));
+
+       addr_in.ma.err_addr = in->err_addr;
+       addr_in.ma.ch_inst = in->ch_inst;
+       addr_in.ma.umc_inst = in->umc_inst;
+       addr_in.ma.node_inst = in->node_inst;
+       addr_in.ma.socket_id = in->socket_id;
+
+       addr_in.addr_type = RAS_TA_MCA_TO_PA;
+
+       ret = ras_psp_query_address(ras_core, &addr_in, &addr_out);
+       if (ret) {
+               RAS_DEV_WARN(ras_core->dev,
+                       "Failed to query RAS physical address for 0x%llx, ret:%d",
+                       in->err_addr, ret);
+               return -EREMOTEIO;
+       }
+
+       if (out) {
+               out->pa = addr_out.pa.pa;
+               out->bank = addr_out.pa.bank;
+               out->channel_idx = addr_out.pa.channel_idx;
+       }
+
+       return 0;
+}
+
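+/* Insert an ECC record into the radix tree, keyed by its retired row PFN,
+ * and tag it as newly detected so the next retirement pass picks it up.
+ */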
+static int ras_umc_log_ecc(struct ras_core_context *ras_core,
+               unsigned long idx, void *data)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       int ret;
+
+       mutex_lock(&ras_umc->tree_lock);
+       ret = radix_tree_insert(&ras_umc->root, idx, data);
+       if (!ret)
+               radix_tree_tag_set(&ras_umc->root, idx, UMC_ECC_NEW_DETECTED_TAG);
+       mutex_unlock(&ras_umc->tree_lock);
+
+       return ret;
+}
+
+int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       uint64_t buf[8] = {0};
+       void **slot;
+       void *data;
+       void *iter = buf;
+
+       mutex_lock(&ras_umc->tree_lock);
+       radix_tree_for_each_slot(slot, &ras_umc->root, iter, 0) {
+               data = ras_radix_tree_delete_iter(&ras_umc->root, iter);
+               kfree(data);
+       }
+       mutex_unlock(&ras_umc->tree_lock);
+
+       return 0;
+}
+
+static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
+                               struct eeprom_umc_record *record)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       uint64_t page_pfn[16];
+       int count = 0, i;
+
+       memset(page_pfn, 0, sizeof(page_pfn));
+       if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) {
+               count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
+                                       record, record->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
+               if (count <= 0) {
+                       RAS_DEV_ERR(ras_core->dev,
+                               "Failed to convert error address! count:%d\n", count);
+                       return;
+               }
+       }
+
+       /* Reserve memory */
+       for (i = 0; i < count; i++)
+               ras_core_event_notify(ras_core,
+                       RAS_EVENT_ID__RESERVE_BAD_PAGE, &page_pfn[i]);
+}
+
+/* While a GPU reset is in progress, ECC logging operations are deferred. */
+int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct ras_bank_ecc_node *ecc_node;
+
+       ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL);
+       if (!ecc_node)
+               return -ENOMEM;
+
+       memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
+
+       mutex_lock(&ras_umc->pending_ecc_lock);
+       list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+       mutex_unlock(&ras_umc->pending_ecc_lock);
+
+       return 0;
+}
+
+/* After the GPU reset completes, re-log the deferred error banks. */
+int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct ras_bank_ecc_node *ecc_node, *tmp;
+
+       mutex_lock(&ras_umc->pending_ecc_lock);
+       list_for_each_entry_safe(ecc_node,
+               tmp, &ras_umc->pending_ecc_list, node) {
+               if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
+                       list_del(&ecc_node->node);
+                       kfree(ecc_node);
+               }
+       }
+       mutex_unlock(&ras_umc->pending_ecc_lock);
+
+       return 0;
+}
+
+int ras_umc_log_bad_bank(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_umc_record umc_rec;
+       struct eeprom_umc_record *err_rec;
+       int ret;
+
+       memset(&umc_rec, 0, sizeof(umc_rec));
+
+       mutex_lock(&ras_umc->bank_log_lock);
+       ret = ras_umc->ip_func->bank_to_eeprom_record(ras_core, bank, &umc_rec);
+       if (ret)
+               goto out;
+
+       err_rec = kzalloc(sizeof(*err_rec), GFP_KERNEL);
+       if (!err_rec) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(err_rec, &umc_rec, sizeof(umc_rec));
+       ret = ras_umc_log_ecc(ras_core, err_rec->cur_nps_retired_row_pfn, err_rec);
+       if (ret) {
+               if (ret == -EEXIST) {
+                       RAS_DEV_INFO(ras_core->dev, "The bad pages have been logged before.\n");
+                       ret = 0;
+               }
+
+               kfree(err_rec);
+               goto out;
+       }
+
+       ras_umc_reserve_eeprom_record(ras_core, err_rec);
+
+       ret = ras_core_event_notify(ras_core,
+                       RAS_EVENT_ID__BAD_PAGE_DETECTED, NULL);
+
+out:
+       mutex_unlock(&ras_umc->bank_log_lock);
+       return ret;
+}
+
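+/* Collect records tagged as newly detected, clearing the tag on each;
+ * returns the number of records copied into the output array.
+ */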
+static int ras_umc_get_new_records(struct ras_core_context *ras_core,
+                       struct eeprom_umc_record *records, u32 num)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_umc_record *entries[MAX_ECC_NUM_PER_RETIREMENT];
+       u32 entry_num = min_t(u32, num, MAX_ECC_NUM_PER_RETIREMENT);
+       int count = 0;
+       int new_detected, i;
+
+       mutex_lock(&ras_umc->tree_lock);
+       new_detected = radix_tree_gang_lookup_tag(&ras_umc->root, (void **)entries,
+                       0, entry_num, UMC_ECC_NEW_DETECTED_TAG);
+       for (i = 0; i < new_detected; i++) {
+               if (!entries[i])
+                       continue;
+
+               memcpy(&records[i], entries[i], sizeof(struct eeprom_umc_record));
+               count++;
+               radix_tree_tag_clear(&ras_umc->root,
+                               entries[i]->cur_nps_retired_row_pfn, UMC_ECC_NEW_DETECTED_TAG);
+       }
+       mutex_unlock(&ras_umc->tree_lock);
+
+       return count;
+}
+
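+/* For records loaded from EEPROM, convert them to the current NPS mode and
+ * never treat them as duplicates; for runtime records, report whether the
+ * record is already present in the saved rom data.
+ */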
+static bool ras_umc_check_retired_record(struct ras_core_context *ras_core,
+                               struct eeprom_umc_record *record, bool from_eeprom)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
+       uint32_t nps = 0;
+       int i, ret;
+
+       if (from_eeprom) {
+               nps = ras_umc->umc_err_data.umc_nps_mode;
+               if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_record) {
+                       ret = ras_umc->ip_func->eeprom_record_to_nps_record(ras_core, record, nps);
+                       if (ret)
+                               RAS_DEV_WARN(ras_core->dev,
+                                       "Failed to adjust eeprom record, ret:%d", ret);
+               }
+               return false;
+       }
+
+       for (i = 0; i < data->count; i++) {
+               if ((data->bps[i].retired_row_pfn == record->retired_row_pfn) &&
+                   (data->bps[i].cur_nps_retired_row_pfn == record->cur_nps_retired_row_pfn))
+                       return true;
+       }
+
+       return false;
+}
+
+/* Allocate or grow the bps array, rounding capacity up to a multiple of 512 entries */
+static int ras_umc_realloc_err_data_space(struct ras_core_context *ras_core,
+               struct eeprom_store_record *data, int pages)
+{
+       unsigned int old_space = data->count + data->space_left;
+       unsigned int new_space = old_space + pages;
+       unsigned int align_space = ALIGN(new_space, 512);
+       void *bps = kzalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+
+       if (!bps)
+               return -ENOMEM;
+
+       if (data->bps) {
+               memcpy(bps, data->bps,
+                               data->count * sizeof(*data->bps));
+               kfree(data->bps);
+       }
+
+       data->bps = bps;
+       data->space_left += align_space - old_space;
+       return 0;
+}
+
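+/* Append one record to the in-memory mirror of the EEPROM table */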
+static int ras_umc_update_eeprom_rom_data(struct ras_core_context *ras_core,
+               struct eeprom_umc_record *bps)
+{
+       struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.rom_data;
+
+       if (!data->space_left &&
+           ras_umc_realloc_err_data_space(ras_core, data, 256)) {
+               return -ENOMEM;
+       }
+
+       memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+       data->count++;
+       data->space_left--;
+       return 0;
+}
+
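+/* Expand a record into per-NPS-mode page PFNs and append each page to the
+ * ram table used for runtime bad page queries.
+ */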
+static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core,
+                               struct eeprom_umc_record *bps)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
+       uint64_t page_pfn[16];
+       int count = 0, j;
+
+       if (!data->space_left &&
+           ras_umc_realloc_err_data_space(ras_core, data, 256)) {
+               return -ENOMEM;
+       }
+
+       memset(page_pfn, 0, sizeof(page_pfn));
+       if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages)
+               count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
+                                       bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
+
+       if (count > 0) {
+               for (j = 0; j < count; j++) {
+                       bps->cur_nps_retired_row_pfn = page_pfn[j];
+                       memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+                       data->count++;
+                       data->space_left--;
+               }
+       } else {
+               memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+               data->count++;
+               data->space_left--;
+       }
+
+       return 0;
+}
+
+/* This function deals with VRAM only. */
+static int ras_umc_add_bad_pages(struct ras_core_context *ras_core,
+                                struct eeprom_umc_record *bps,
+                                int pages, bool from_eeprom)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct ras_umc_err_data *data = &ras_umc->umc_err_data;
+       int i, ret = 0;
+
+       if (!bps || pages <= 0)
+               return 0;
+
+       mutex_lock(&ras_umc->umc_lock);
+       for (i = 0; i < pages; i++) {
+               if (ras_umc_check_retired_record(ras_core, &bps[i], from_eeprom))
+                       continue;
+
+               ret = ras_umc_update_eeprom_rom_data(ras_core, &bps[i]);
+               if (ret)
+                       goto out;
+
+               if (data->last_retired_pfn == bps[i].cur_nps_retired_row_pfn)
+                       continue;
+
+               data->last_retired_pfn = bps[i].cur_nps_retired_row_pfn;
+
+               if (from_eeprom)
+                       ras_umc_reserve_eeprom_record(ras_core, &bps[i]);
+
+               ret = ras_umc_update_eeprom_ram_data(ras_core, &bps[i]);
+               if (ret)
+                       goto out;
+       }
+out:
+       mutex_unlock(&ras_umc->umc_lock);
+
+       return ret;
+}
+
+/*
+ * Read the error record array from EEPROM and reserve enough
+ * space to store newly detected bad pages.
+ */
+int ras_umc_load_bad_pages(struct ras_core_context *ras_core)
+{
+       struct eeprom_umc_record *bps;
+       uint32_t ras_num_recs;
+       int ret;
+
+       ras_num_recs = ras_eeprom_get_record_count(ras_core);
+       /* no bad page record, skip eeprom access */
+       if (!ras_num_recs ||
+           ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE)
+               return 0;
+
+       bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL);
+       if (!bps)
+               return -ENOMEM;
+
+       ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
+       if (ret) {
+               RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table 
records!");
+       } else {
+               ras_core->ras_umc.umc_err_data.last_retired_pfn = UMC_INV_MEM_PFN;
+               ret = ras_umc_add_bad_pages(ras_core, bps, ras_num_recs, true);
+       }
+
+       kfree(bps);
+       return ret;
+}
+
+/*
+ * Write the error record array to EEPROM; only entries that are
+ * not yet stored in EEPROM are appended.
+ */
+static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
+       uint32_t eeprom_record_num;
+       int save_count;
+       int ret = 0;
+
+       if (!data->bps)
+               return 0;
+
+       eeprom_record_num = ras_eeprom_get_record_count(ras_core);
+       mutex_lock(&ras_umc->umc_lock);
+       save_count = data->count - eeprom_record_num;
+       /* only new entries are saved */
+       if (save_count > 0) {
+               if (ras_eeprom_append(ras_core,
+                                          &data->bps[eeprom_record_num],
+                                          save_count)) {
+                       RAS_DEV_ERR(ras_core->dev, "Failed to save EEPROM table 
data!");
+                       ret = -EIO;
+                       goto exit;
+               }
+
+               RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM 
table.\n", save_count);
+       }
+
+exit:
+       mutex_unlock(&ras_umc->umc_lock);
+       return ret;
+}
+
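+/* Retire newly detected bad pages: collect the tagged records, merge them
+ * into the bad page tables and persist new entries to EEPROM.
+ */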
+int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data)
+{
+       struct eeprom_umc_record records[MAX_ECC_NUM_PER_RETIREMENT];
+       int count, ret;
+
+       memset(records, 0, sizeof(records));
+       count = ras_umc_get_new_records(ras_core, records, ARRAY_SIZE(records));
+       if (count <= 0)
+               return -ENODATA;
+
+       ret = ras_umc_add_bad_pages(ras_core, records, count, false);
+       if (ret) {
+               RAS_DEV_ERR(ras_core->dev, "Failed to add ras bad page!\n");
+               return -EINVAL;
+       }
+
+       ret = ras_umc_save_bad_pages(ras_core);
+       if (ret) {
+               RAS_DEV_ERR(ras_core->dev, "Failed to save ras bad page\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int ras_umc_sw_init(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+
+       memset(ras_umc, 0, sizeof(*ras_umc));
+
+       INIT_LIST_HEAD(&ras_umc->pending_ecc_list);
+
+       INIT_RADIX_TREE(&ras_umc->root, GFP_KERNEL);
+
+       mutex_init(&ras_umc->tree_lock);
+       mutex_init(&ras_umc->pending_ecc_lock);
+       mutex_init(&ras_umc->umc_lock);
+       mutex_init(&ras_umc->bank_log_lock);
+
+       return 0;
+}
+
+int ras_umc_sw_fini(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct ras_umc_err_data *umc_err_data = &ras_umc->umc_err_data;
+       struct ras_bank_ecc_node *ecc_node, *tmp;
+
+       mutex_destroy(&ras_umc->umc_lock);
+       mutex_destroy(&ras_umc->bank_log_lock);
+
+       if (umc_err_data->rom_data.bps) {
+               umc_err_data->rom_data.count = 0;
+               kfree(umc_err_data->rom_data.bps);
+               umc_err_data->rom_data.bps = NULL;
+               umc_err_data->rom_data.space_left = 0;
+       }
+
+       if (umc_err_data->ram_data.bps) {
+               umc_err_data->ram_data.count = 0;
+               kfree(umc_err_data->ram_data.bps);
+               umc_err_data->ram_data.bps = NULL;
+               umc_err_data->ram_data.space_left = 0;
+       }
+
+       ras_umc_clear_logged_ecc(ras_core);
+
+       mutex_lock(&ras_umc->pending_ecc_lock);
+       list_for_each_entry_safe(ecc_node,
+               tmp, &ras_umc->pending_ecc_list, node) {
+               list_del(&ecc_node->node);
+               kfree(ecc_node);
+       }
+       mutex_unlock(&ras_umc->pending_ecc_lock);
+
+       mutex_destroy(&ras_umc->tree_lock);
+       mutex_destroy(&ras_umc->pending_ecc_lock);
+
+       return 0;
+}
+
+int ras_umc_hw_init(struct ras_core_context *ras_core)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       uint32_t nps;
+
+       nps = ras_core_get_curr_nps_mode(ras_core);
+
+       if (!nps || (nps >= UMC_MEMORY_PARTITION_MODE_UNKNOWN)) {
+               RAS_DEV_ERR(ras_core->dev, "Invalid memory NPS mode: %u!\n", 
nps);
+               return -ENODATA;
+       }
+
+       ras_umc->umc_err_data.umc_nps_mode = nps;
+
+       ras_umc->umc_vram_type = ras_core->config->umc_cfg.umc_vram_type;
+       if (!ras_umc->umc_vram_type) {
+               RAS_DEV_ERR(ras_core->dev, "Invalid UMC VRAM Type: %u!\n",
+                       ras_umc->umc_vram_type);
+               return -ENODATA;
+       }
+
+       ras_umc->umc_ip_version = ras_core->config->umc_ip_version;
+       ras_umc->ip_func = ras_umc_get_ip_func(ras_core, ras_umc->umc_ip_version);
+       if (!ras_umc->ip_func)
+               return -EINVAL;
+
+       return 0;
+}
+
+int ras_umc_hw_fini(struct ras_core_context *ras_core)
+{
+       return 0;
+}
+
+int ras_umc_clean_badpage_data(struct ras_core_context *ras_core)
+{
+       struct ras_umc_err_data *data = &ras_core->ras_umc.umc_err_data;
+
+       mutex_lock(&ras_core->ras_umc.umc_lock);
+
+       kfree(data->rom_data.bps);
+       kfree(data->ram_data.bps);
+
+       memset(data, 0, sizeof(*data));
+       mutex_unlock(&ras_core->ras_umc.umc_lock);
+
+       return 0;
+}
+
+int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
+               uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
+               enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record)
+{
+       struct eeprom_umc_record *err_rec = record;
+
+       /* Set bad page pfn and nps mode */
+       EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(err_rec,
+                       RAS_ADDR_TO_PFN(cur_nps_addr->pa), cur_nps);
+
+       err_rec->address = err_addr;
+       err_rec->ts = ras_umc_get_eeprom_timestamp(ras_core);
+       err_rec->err_type = RAS_EEPROM_ERR_NON_RECOVERABLE;
+       err_rec->cu = 0;
+       err_rec->mem_channel = cur_nps_addr->channel_idx;
+       err_rec->mcumc_id = umc_inst;
+       err_rec->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(cur_nps_addr->pa);
+       err_rec->cur_nps_bank = cur_nps_addr->bank;
+       err_rec->cur_nps = cur_nps;
+       return 0;
+}
+
+int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core)
+{
+       struct ras_umc_err_data *err_data = &ras_core->ras_umc.umc_err_data;
+
+       return err_data->rom_data.count;
+}
+
+int ras_umc_get_badpage_count(struct ras_core_context *ras_core)
+{
+       struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
+
+       return data->count;
+}
+
+int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record)
+{
+       struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
+
+       if (index >= data->count)
+               return -EINVAL;
+
+       memcpy(record, &data->bps[index], sizeof(struct eeprom_umc_record));
+       return 0;
+}
+
+bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
+       uint64_t page_pfn = RAS_ADDR_TO_PFN(addr);
+       bool ret = false;
+       int i;
+
+       mutex_lock(&ras_umc->umc_lock);
+       for (i = 0; i < data->count; i++) {
+               if (data->bps[i].cur_nps_retired_row_pfn == page_pfn) {
+                       ret = true;
+                       break;
+               }
+       }
+       mutex_unlock(&ras_umc->umc_lock);
+
+       return ret;
+}
+
+int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+       uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
+{
+       struct ras_umc *ras_umc = &ras_core->ras_umc;
+       int ret = 0;
+
+       if (bank_to_pa)
+               ret = ras_umc->ip_func->bank_to_soc_pa(ras_core, *bank_addr, soc_pa);
+       else
+               ret = ras_umc->ip_func->soc_pa_to_bank(ras_core, *soc_pa, bank_addr);
+
+       return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
new file mode 100644
index 000000000000..7d9e779d8c4c
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_UMC_H__
+#define __RAS_UMC_H__
+#include "ras.h"
+#include "ras_eeprom.h"
+#include "ras_cmd.h"
+
+#define UMC_VRAM_TYPE_UNKNOWN 0
+#define UMC_VRAM_TYPE_GDDR1   1
+#define UMC_VRAM_TYPE_DDR2    2
+#define UMC_VRAM_TYPE_GDDR3   3
+#define UMC_VRAM_TYPE_GDDR4   4
+#define UMC_VRAM_TYPE_GDDR5   5
+#define UMC_VRAM_TYPE_HBM     6
+#define UMC_VRAM_TYPE_DDR3    7
+#define UMC_VRAM_TYPE_DDR4    8
+#define UMC_VRAM_TYPE_GDDR6   9
+#define UMC_VRAM_TYPE_DDR5    10
+#define UMC_VRAM_TYPE_LPDDR4  11
+#define UMC_VRAM_TYPE_LPDDR5  12
+#define UMC_VRAM_TYPE_HBM3E   13
+
+#define UMC_ECC_NEW_DETECTED_TAG       0x1
+#define UMC_INV_MEM_PFN  (0xFFFFFFFFFFFFFFFFULL)
+
+/* number of MCA address bits flipped during bad page retirement:
+ * three column bits and one row bit
+ */
+#define UMC_PA_FLIP_BITS_NUM 4
+
+enum umc_memory_partition_mode {
+       UMC_MEMORY_PARTITION_MODE_NONE = 0,
+       UMC_MEMORY_PARTITION_MODE_NPS1 = 1,
+       UMC_MEMORY_PARTITION_MODE_NPS2 = 2,
+       UMC_MEMORY_PARTITION_MODE_NPS3 = 3,
+       UMC_MEMORY_PARTITION_MODE_NPS4 = 4,
+       UMC_MEMORY_PARTITION_MODE_NPS6 = 6,
+       UMC_MEMORY_PARTITION_MODE_NPS8 = 8,
+       UMC_MEMORY_PARTITION_MODE_UNKNOWN
+};
+
+struct ras_core_context;
+struct ras_bank_ecc;
+
+struct umc_flip_bits {
+       uint32_t flip_bits_in_pa[UMC_PA_FLIP_BITS_NUM];
+       uint32_t flip_row_bit;
+       uint32_t r13_in_pa;
+       uint32_t bit_num;
+};
+
+struct umc_mca_addr {
+       uint64_t err_addr;
+       uint32_t ch_inst;
+       uint32_t umc_inst;
+       uint32_t node_inst;
+       uint32_t socket_id;
+};
+
+struct umc_phy_addr {
+       uint64_t pa;
+       uint32_t bank;
+       uint32_t channel_idx;
+};
+
+struct umc_bank_addr {
+       uint32_t stack_id; /* SID */
+       uint32_t bank_group;
+       uint32_t bank;
+       uint32_t row;
+       uint32_t column;
+       uint32_t channel;
+       uint32_t subchannel; /* Also called Pseudochannel (PC) */
+};
+
+struct ras_umc_ip_func {
+       int (*bank_to_eeprom_record)(struct ras_core_context *ras_core,
+                       struct ras_bank_ecc *bank, struct eeprom_umc_record *record);
+       int (*eeprom_record_to_nps_record)(struct ras_core_context *ras_core,
+                       struct eeprom_umc_record *record, uint32_t nps);
+       int (*eeprom_record_to_nps_pages)(struct ras_core_context *ras_core,
+                       struct eeprom_umc_record *record, uint32_t nps,
+                       uint64_t *pfns, uint32_t num);
+       int (*bank_to_soc_pa)(struct ras_core_context *ras_core,
+                       struct umc_bank_addr bank_addr, uint64_t *soc_pa);
+       int (*soc_pa_to_bank)(struct ras_core_context *ras_core,
+                       uint64_t soc_pa, struct umc_bank_addr *bank_addr);
+};
+
+struct eeprom_store_record {
+       /* points to the array of data records */
+       struct eeprom_umc_record *bps;
+       /* number of stored entries */
+       int count;
+       /* number of additional entries the array can hold */
+       int space_left;
+};
+
+struct ras_umc_err_data {
+       struct eeprom_store_record rom_data;
+       struct eeprom_store_record ram_data;
+       enum umc_memory_partition_mode umc_nps_mode;
+       uint64_t last_retired_pfn;
+};
+
+struct ras_umc {
+       u32 umc_ip_version;
+       u32 umc_vram_type;
+       const struct ras_umc_ip_func *ip_func;
+       struct radix_tree_root root;
+       struct mutex  tree_lock;
+       struct mutex  umc_lock;
+       struct mutex  bank_log_lock;
+       struct mutex  pending_ecc_lock;
+       struct ras_umc_err_data umc_err_data;
+       struct list_head pending_ecc_list;
+};
+
+int ras_umc_sw_init(struct ras_core_context *ras);
+int ras_umc_sw_fini(struct ras_core_context *ras);
+int ras_umc_hw_init(struct ras_core_context *ras);
+int ras_umc_hw_fini(struct ras_core_context *ras);
+int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
+               struct umc_mca_addr *in, struct umc_phy_addr *out,
+               uint32_t nps);
+int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data);
+int ras_umc_log_bad_bank(struct ras_core_context *ras, struct ras_bank_ecc *bank);
+int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank);
+int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core);
+int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core);
+int ras_umc_load_bad_pages(struct ras_core_context *ras_core);
+int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core);
+int ras_umc_clean_badpage_data(struct ras_core_context *ras_core);
+int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
+               uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
+               enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record);
+
+int ras_umc_get_badpage_count(struct ras_core_context *ras_core);
+int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record);
+bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr);
+int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+                       uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
+#endif
-- 
2.34.1
