On Mon, 23 Jun 2025 16:54:20 +0200 "Fabio M. De Francesco" <fabio.m.de.france...@linux.intel.com> wrote:
> When Firmware First is enabled, BIOS handles errors first and then it makes > them available to the kernel via the Common Platform Error Record (CPER) > sections (UEFI 2.10 Appendix N). Linux parses the CPER sections via one of > two similar paths, either ELOG or GHES. The errors managed by ELOG are > signaled to the BIOS by the I/O Machine Check Architecture (I/O MCA). > > Currently, ELOG and GHES show some inconsistencies in how they report to > userspace via trace events. > > Therefore, make the two mentioned paths act similarly by tracing the CPER > CXL Protocol Error Section (UEFI v2.10, Appendix N.2.13). > > Cc: Dan Williams <dan.j.willi...@intel.com> > Reviewed-by: Kuppuswamy Sathyanarayanan > <sathyanarayanan.kuppusw...@linux.intel.com> > Signed-off-by: Fabio M. De Francesco <fabio.m.de.france...@linux.intel.com> > --- > drivers/acpi/acpi_extlog.c | 62 ++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/ras.c | 6 ++++ > include/cxl/event.h | 2 ++ > 3 files changed, 70 insertions(+) > > diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c > index cefe8d2d8affc..9a37b08aacfea 100644 > --- a/drivers/acpi/acpi_extlog.c > +++ b/drivers/acpi/acpi_extlog.c > @@ -12,6 +12,7 @@ > #include <linux/ratelimit.h> > #include <linux/edac.h> > #include <linux/ras.h> > +#include <cxl/event.h> > #include <acpi/ghes.h> > #include <asm/cpu.h> > #include <asm/mce.h> > @@ -160,6 +161,60 @@ static void extlog_print_pcie(struct cper_sec_pcie > *pcie_err, > pci_dev_put(pdev); > } > > +static void > +extlog_cxl_cper_handle_prot_err(struct cxl_cper_sec_prot_err *prot_err, > + int severity) > +{ > + struct cxl_cper_prot_err_work_data wd; > + u8 *dvsec_start, *cap_start; A bunch of this is identical to cxl_cper_post_prot_err(). Can we factor that stuff out for common use? 
> + > + if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { > + pr_warn_ratelimited("CXL CPER invalid agent type\n"); > + return; > + } > + > + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { > + pr_warn_ratelimited("CXL CPER invalid protocol error log\n"); > + return; > + } > + > + if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { > + pr_warn_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", > + prot_err->err_len); > + return; > + } > + > + if ((prot_err->agent_type == RCD || prot_err->agent_type == DEVICE || > + prot_err->agent_type == LD || prot_err->agent_type == FMLD) && > + !(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) > + pr_warn_ratelimited(FW_WARN > + "CXL CPER no device serial number\n"); Whilst some of this check isn't present in cxl_cper_post_prot_err(), it should be harmless. > + > + switch (prot_err->agent_type) { > + case RCD: > + case DEVICE: > + case LD: > + case FMLD: > + case RP: > + case DSP: > + case USP: > + memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err)); > + > + dvsec_start = (u8 *)(prot_err + 1); > + cap_start = dvsec_start + prot_err->dvsec_len; > + > + memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap)); > + wd.severity = cper_severity_to_aer(severity); > + break; > + default: > + pr_err_ratelimited("CXL CPER reserved agent type: %d\n", > + prot_err->agent_type); > + return; > + } > + > + cxl_cper_ras_handle_prot_err(&wd); > +}