From: Marek Majtyka <[email protected]>

Changes and fixes:
 - CMEM/SMEM driver trace capabilities added
 - CMEM/SMEM CPU0 affinity used for the IRQ threads and queued work items
 - SMEM code maintainability improved
 - CMEM multiple CA parity error event added
 - CMEM MPR dump is now also triggered on int_status bit [21]
 - CMEM clearing of the MR5 CA parity error flag added
 - CMEM IRQ storm fix
Signed-off-by: Marek Majtyka <[email protected]> --- drivers/edac/Kconfig | 7 ++ drivers/edac/axxia_edac-cmc_56xx.c | 222 ++++++++++++++++++++++++++++------ drivers/edac/axxia_edac-l2_cpu_56xx.c | 2 +- drivers/edac/axxia_edac-l3_56xx.c | 40 ++++-- drivers/edac/axxia_edac-mc_56xx.c | 178 ++++++++++++++++++++++----- include/trace/events/edac_cmc.h | 101 ++++++++++++++++ include/trace/events/edac_mc.h | 104 ++++++++++++++++ 7 files changed, 577 insertions(+), 77 deletions(-) create mode 100644 include/trace/events/edac_cmc.h create mode 100644 include/trace/events/edac_mc.h diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index aa91dda..797dd53 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -409,6 +409,13 @@ config EDAC_AXXIA_SYSMEM_6700 the System Memory error detection. System Memory error detection is interrupt driven. +config DEBUG_EDAC_AXXIA_SYSMEM + depends on ARCH_AXXIA + bool "AXXIA EDAC SYSMEM error injection interface." + help + Support for configuration of SYSMEM edac tracing functionality. + It works for both 5600 and 6700 board families. + config EDAC_AXXIA_CMEM_5600 depends on ARCH_AXXIA bool "AXXIA EDAC CMem Controller for 5600" diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c index bdc76b1..f29e6926 100644 --- a/drivers/edac/axxia_edac-cmc_56xx.c +++ b/drivers/edac/axxia_edac-cmc_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-mc.c * - * EDAC Driver for Intel's Axxia 5600 Configuration Memory Controller + * EDAC Driver for Intel's Axxia 5600/6700 Configuration Memory Controller * * Copyright (C) 2016 Intel Inc. 
* @@ -31,6 +31,9 @@ #include "edac_core.h" #include "edac_module.h" +#define CREATE_TRACE_POINTS +#include <trace/events/edac_cmc.h> + #define FMT "%s: syscon lookup failed hence using hardcoded register address\n" #define MPR_FMT2 "\n%3d %#010x %#010x" @@ -53,7 +56,9 @@ #define CM_MPR_PAGE 0x1 #define CM_56XX_DENALI_CTL_00 0x0 +#define CM_56XX_DENALI_CTL_33 0x84 #define CM_56XX_DENALI_CTL_34 0x88 +#define CM_56XX_DENALI_CTL_45 0xb4 #define CM_56XX_DENALI_CTL_74 0x128 #define CM_56XX_DENALI_CTL_80 0x140 @@ -90,6 +95,7 @@ #define INT_BIT_8 (0x00000100) #define INT_BIT_11 (0x00000800) #define INT_BIT_21 (0x00200000) +#define INT_BIT_23 (0x00800000) #define INT_BIT_25 (0x02000000) #define INT_BIT_30 (0x40000000) #define INT_BIT_31 (0x80000000) @@ -108,6 +114,7 @@ INT_BIT_7 |\ INT_BIT_11 |\ INT_BIT_21 |\ + INT_BIT_23 |\ INT_BIT_31)) #define CM_INT_MASK_FULL (~(\ @@ -120,6 +127,7 @@ INT_BIT_7 |\ INT_BIT_11 |\ INT_BIT_21 |\ + INT_BIT_23 |\ INT_BIT_25 |\ INT_BIT_30 |\ INT_BIT_31)) @@ -127,9 +135,7 @@ #define CM_INT_MASK_ALL (0x7fffffff) #define ALIVE_NOTIFICATION_PERIOD (90*1000) -static int log = 1; -module_param(log, int, 0644); -MODULE_PARM_DESC(log, "Log each error to kernel log."); +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0 }; static int force_restart = 1; module_param(force_restart, int, 0644); @@ -185,8 +191,9 @@ static atomic64_t mc_counter = ATOMIC_INIT(0); * Bit [1] = A memory access outside the defined PHYSICAL memory space * has occurred. * Bit [0] = The memory reset is valid on the DFI bus. - - * Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest. + * + * Of these 1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15, 16, 21, 24, 25, and 30 + * are of our interest. */ /* @@ -196,10 +203,10 @@ static atomic64_t mc_counter = ATOMIC_INIT(0); * one need to collect dumps for all available cs. Below given example * for two cs0/cs1. 
* - * CMEM MC cmmon_isr_sw cmmon_wq + * CMEM MC cmmon_isr_sw wq_alerts * | | | * | | | - * |ALERT_N - int_status bit [30] | + * |ALERT_N - int_status bit [30] or [21] | * |------------------>| | * | |schedule cmmon_wq | * | |------------------>| @@ -270,6 +277,19 @@ struct __packed cm_56xx_denali_ctl_00 #endif }; +struct __packed cm_56xx_denali_ctl_33 +{ +#ifdef CPU_BIG_ENDIAN + unsigned reserved : 6; + unsigned write : 1; + unsigned write_modereg : 25; +#else /* Little Endian */ + unsigned write_modereg : 25; + unsigned write : 1; + unsigned reserved : 6; +#endif +}; + /* Trigger MPR */ struct __packed cm_56xx_denali_ctl_34 { @@ -292,6 +312,24 @@ struct __packed cm_56xx_denali_ctl_34 #endif }; +/* + * this structure is the same for all registers(one definition used) + * cm_56xx_denali_ctl_45, cm_56xx_denali_ctl_48, + * cm_56xx_denali_ctl_53, cm_56xx_denali_ctl_56 + */ +struct __packed cm_56xx_denali_ctl_45 +{ +#ifdef CPU_BIG_ENDIAN + unsigned absolete1 : 6; + unsigned reserved : 7; + unsigned mrsingle_data_0 : 17; +#else /* Little Endian */ + unsigned mrsingle_data_0 : 17; + unsigned reserved : 7; + unsigned absolete1 : 6; +#endif +}; + #ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM #define CM_56XX_DENALI_CTL_62 0xf8 @@ -395,6 +433,7 @@ enum events { EV_PORT_ERROR, EV_WRAP_ERROR, EV_PARITY_ERROR, + EV_SEC_PARITY_ERROR, NR_EVENTS }; @@ -409,6 +448,7 @@ static char *block_name[] = { "port_error", "wrap_error", "parity_error", + "second_parity_error", "alert_n_cs0_dram0_ca_par_error", "alert_n_cs0_dram0_crc_error", "alert_n_cs0_dram1_ca_par_error", @@ -437,6 +477,7 @@ static const u32 event_mask[NR_EVENTS] = { [EV_PORT_ERROR] = INT_BIT_7, [EV_WRAP_ERROR] = INT_BIT_11, [EV_PARITY_ERROR] = INT_BIT_21, + [EV_SEC_PARITY_ERROR] = INT_BIT_23, }; static const struct event_logging { @@ -453,6 +494,7 @@ static const struct event_logging { [EV_PORT_ERROR] = {0, KERN_CRIT, "Port error"}, [EV_WRAP_ERROR] = {0, KERN_CRIT, "Wrap error"}, [EV_PARITY_ERROR] = {0, KERN_CRIT, "Parity error"}, + 
[EV_SEC_PARITY_ERROR] = {1, KERN_CRIT, "Second parity error"}, }; /* Private structure for common edac device */ @@ -670,7 +712,7 @@ handle_events(struct intel_edac_dev_info *edac_dev, set_val = readl( edac_dev->axi2ser3_region + SYSCON_PERSIST_SCRATCH); - /* set bit 3 in pscratch reg */ + /* set bit 7 in pscratch reg */ set_val = set_val | CMEM_PERSIST_SCRATCH_BIT; writel(set_val, @@ -721,6 +763,48 @@ store_mpr_dump(struct intel_edac_dev_info *edac_dev, int cs) MAX_DQ * MPR_PAGE_BYTES); } +static int clear_ca_parity_error(struct intel_edac_dev_info *dev_info, int cs) +{ + + struct cm_56xx_denali_ctl_45 denali_ctl_45; + struct cm_56xx_denali_ctl_33 denali_ctl_33; + + if (ncr_read(dev_info->cm_region, + CM_56XX_DENALI_CTL_45 + 0xc + 0x20 * cs, + 4, (u32 *) &denali_ctl_45)) + goto error_read; + + /* + * Clear always as we can't get info about state change + * from denali_ctl_45, which means this check would be faulty!!! + * if (denali_ctl_45.mrsingle_data_0 & 0x10) + */ + denali_ctl_45.mrsingle_data_0 &= 0x3FFEF; /* clear A4 bit */ + denali_ctl_33.write = 1; /* write */ + denali_ctl_33.write_modereg = 0x800005; /* MR5 write */ + denali_ctl_33.write_modereg |= (cs << 8); /* chip select */ + + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_45 + 0x20 * cs, + 4, (u32 *) &denali_ctl_45)) + goto error_write; + + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_33, + 4, (u32 *) &denali_ctl_33)) + goto error_write; + return 0; + +error_write: + printk_ratelimited("%s: Write error when clearing ca parity in mr5\n", + dev_name(&dev_info->pdev->dev)); + return 1; +error_read: + printk_ratelimited("%s: Read error when clearing ca parity in mr5\n", + dev_name(&dev_info->pdev->dev)); + return 1; +} + static inline void __attribute__((always_inline)) update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) { @@ -739,8 +823,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) (u8 (*)[MPR_PAGE_BYTES]) 
(&edac_dev->data->mpr.dram_0_page[0]); int i; - for (i = 0; i < MAX_DQ; ++i) + for (i = 0; i < edac_dev->data->dram_count; ++i) { inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]); + trace_edac_cmc_dump_processed(edac_dev->cm_region >> 16, + cs, i, (int) dram[i][3]); + } } @@ -751,6 +838,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) unsigned long flags; u32 regval; int i; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + u32 node = edac_dev->cm_region >> 16; +#endif mpr->mpr_page_id = page; @@ -761,11 +851,32 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_0_page[i] = regval & 0xff; + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 0, + (int) mpr->dram_0_page[i]); +#endif + mpr->dram_1_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 1, + (int) mpr->dram_1_page[i]); +#endif if (edac_dev->data->dram_count == MAX_DQ) { mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16); + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 2, + (int) mpr->dram_2_page[i]); +#endif + mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24); + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 3, + (int) mpr->dram_3_page[i]); +#endif } } raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags); @@ -773,6 +884,7 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) raw_spin_unlock_irqrestore(&edac_dev->data->mpr_data_lock, flags); update_alert_counters(edac_dev, cs); + clear_ca_parity_error(edac_dev, cs); return 0; error_read: @@ -804,13 +916,16 @@ cmmon_isr_sw(int interrupt, void *device) /* * NOTE: * ISR function is only reading int_status, and write into int_act - * registers. 
+ * and int_mask registers (as well as rte config load) * - * - first handle critical events, which might require restart + * - first handles driver initialization if not configured in uboot, + * once initialized it mask irq bit 8 from raising interrupt + * as this bit must be acknowledged by rte, + * - second handle critical events, which might require restart * (handle_events) and then to the job outside isr - * - second collect MPR dump if any exists and then trigger new if + * - third collect MPR dump if any exists and then trigger new if * needed - all outside isr, - * - third wake up job outside isr to trigger mpr dump procedure when + * - finally wake up job outside isr to trigger mpr dump procedure when * ALERT_N reported (bit [30] is on) */ @@ -818,8 +933,12 @@ cmmon_isr_sw(int interrupt, void *device) 4, (u32 *) &denali_ctl_84)) goto error_read; - if (denali_ctl_84.int_status & INT_BIT_8) { - if (dev_info->is_controller_configured == 0) { + trace_edac_cmc_int_status(dev_info->cm_region >> 16, + denali_ctl_84.int_status); + + if (dev_info->is_controller_configured == 0) { + /* first init case */ + if (denali_ctl_84.int_status & INT_BIT_8) { ret = initialize(dev_info); if (ret) goto error_init; @@ -829,28 +948,46 @@ cmmon_isr_sw(int interrupt, void *device) goto error_init; dev_info->is_controller_configured = 1; - } - if (dev_info->is_ddr4) - denali_ctl_86.int_mask = CM_INT_MASK_FULL; - else - denali_ctl_86.int_mask = CM_INT_MASK_BASE; + denali_ctl_85.int_ack = INT_BIT_8; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_85, + 4, (u32 *) &denali_ctl_85)) + goto error_write; + + denali_ctl_86.int_mask = CM_INT_MASK_ALL; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_86, + 4, (u32 *) &denali_ctl_86)) + goto error_write; - if (ncr_write(dev_info->cm_region, - CM_56XX_DENALI_CTL_86, - 4, (u32 *) &denali_ctl_86)) { - goto error_write; } + /* + * SAFETY CHECK + * One cannot go further if driver is not fully functional!!! 
+ */ return IRQ_HANDLED; - } - /* - * SAFETY CHECK - * one cannot go further if driver is not fully functional!!! - */ - if (dev_info->is_controller_configured == 0) - return IRQ_HANDLED; + } else { + /* reload config case */ + if (denali_ctl_84.int_status & INT_BIT_8) { + + denali_ctl_85.int_ack = INT_BIT_8; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_85, + 4, (u32 *) &denali_ctl_85)) + goto error_write; + + denali_ctl_86.int_mask = CM_INT_MASK_ALL; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_86, + 4, (u32 *) &denali_ctl_86)) + goto error_write; + + return IRQ_HANDLED; + } + } handle_events(dev_info, &denali_ctl_84); atomic_set(&dev_info->data->event_ready, 1); @@ -867,10 +1004,9 @@ cmmon_isr_sw(int interrupt, void *device) denali_ctl_85.int_ack |= INT_BIT_25; } - if (denali_ctl_84.int_status & INT_BIT_30) { + if (denali_ctl_84.int_status & (INT_BIT_30 | INT_BIT_21)) { atomic_inc(&dev_info->data->dump_in_progress); wake_up(&dev_info->data->dump_wq); - denali_ctl_85.int_ack |= INT_BIT_30; } } @@ -940,6 +1076,8 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev) 4, (u32 *) &denali_ctl_34)) goto error_write; + trace_edac_cmc_dump_triggered(dev_info->cm_region >> 16, i); + /* wait */ ret = wait_event_timeout(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_ready), @@ -1026,6 +1164,7 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) case EV_MULT_ILLEGAL: case EV_UNCORR_ECC: case EV_MULT_UNCORR_ECC: + case EV_SEC_PARITY_ERROR: edac_device_handle_multi_ue(edac_dev, 0, i, counter, edac_dev->ctl_name); @@ -1214,7 +1353,6 @@ static int initialize(struct intel_edac_dev_info *dev_info) pr_err("Could not get dram version. 
Is config loaded?\n"); return ERR_STAGE_1; } - /*dev_info->is_ddr4 = 1;*/ dev_info->finish_alerts = 0; dev_info->finish_events = 0; @@ -1304,14 +1442,14 @@ static int enable_workers(struct intel_edac_dev_info *dev_info) atomic_set(&dev_info->data->event_ready, 0); atomic_set(&dev_info->data->dump_in_progress, 0); - dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1, + dev_info->wq_events = alloc_workqueue("%s-events", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_events) return ERR_STAGE_3; if (dev_info->is_ddr4) { dev_info->wq_alerts = - alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-alerts", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_alerts) return ERR_STAGE_4; @@ -1322,8 +1460,9 @@ static int enable_workers(struct intel_edac_dev_info *dev_info) INIT_WORK(&dev_info->offload_events, axxia_events_work); if (dev_info->is_ddr4) - queue_work(dev_info->wq_alerts, &dev_info->offload_alerts); - queue_work(dev_info->wq_events, &dev_info->offload_events); + queue_work_on(0, dev_info->wq_alerts, + &dev_info->offload_alerts); + queue_work_on(0, dev_info->wq_events, &dev_info->offload_events); return 0; } @@ -1332,6 +1471,7 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info) { int irq = -1, rc = 0; struct cm_56xx_denali_ctl_86 denali_ctl_86; + struct irq_desc *desc; snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN, "%s-mon", dev_info->ctl_name); @@ -1382,6 +1522,10 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info) return ERR_STAGE_6; } + + desc = irq_to_desc(irq); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; } diff --git a/drivers/edac/axxia_edac-l2_cpu_56xx.c b/drivers/edac/axxia_edac-l2_cpu_56xx.c index ecab59c..a183c65 100644 --- a/drivers/edac/axxia_edac-l2_cpu_56xx.c +++ b/drivers/edac/axxia_edac-l2_cpu_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-l2_cpu_56xx.c * - * EDAC Driver for Intel's Axxia 5600 System Memory Controller + * EDAC Driver for 
Intel's Axxia 5600/6700 System Memory Controller * * Copyright (C) 2016 Intel Inc. * diff --git a/drivers/edac/axxia_edac-l3_56xx.c b/drivers/edac/axxia_edac-l3_56xx.c index eb25eed..f390450 100644 --- a/drivers/edac/axxia_edac-l3_56xx.c +++ b/drivers/edac/axxia_edac-l3_56xx.c @@ -21,6 +21,8 @@ #include <linux/of_platform.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/irq.h> #include <linux/platform_device.h> #include <linux/reboot.h> #include <linux/mfd/syscon.h> @@ -77,6 +79,8 @@ #define CCN_NODE_ERR_SYND_REG1 0x408 #define CCN_NODE_ERR_SYND_CLR 0x480 +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0}; + union dickens_hnf_err_syndrome_reg0 { struct __packed { #ifdef CPU_BIG_ENDIAN @@ -250,17 +254,16 @@ static irqreturn_t ccn_irq_thread(int irq, void *device) return IRQ_HANDLED; } -static irqreturn_t ccn_irq_handler(int irq, void *device) +static irqreturn_t collect_and_clean(struct intel_edac_dev_info *dev_info, + int report_error) { - struct intel_edac_dev_info *dev_info = device; void __iomem *ccn_base = dev_info->dickens_L3; - - irqreturn_t res = IRQ_NONE; u64 err_sig_val[3]; u64 err_type_value[4]; u64 err_or; u64 err_synd_reg0 = 0, err_synd_reg1 = 0; int i; + irqreturn_t res = IRQ_NONE; /* PMU overflow is a special case - for the future */ err_or = err_sig_val[0] = readq(ccn_base + CCN_MN_ERR_SIG_VAL_63_0); @@ -351,11 +354,21 @@ static irqreturn_t ccn_irq_handler(int irq, void *device) } } - if (err_or) + if (err_or && report_error) dev_err(&dev_info->pdev->dev, "Error reported in %016llx %016llx %016llx.\n", err_sig_val[2], err_sig_val[1], err_sig_val[0]); + return res; +} + +static irqreturn_t ccn_irq_handler(int irq, void *device) +{ + struct intel_edac_dev_info *dev_info = device; + irqreturn_t res = IRQ_NONE; + + res = collect_and_clean(dev_info, 1); + /* HERE all error data collected, but interrupt not deasserted */ return IRQ_WAKE_THREAD; } @@ -414,8 +427,8 @@ static int intel_edac_l3_probe(struct 
platform_device *pdev) struct intel_edac_dev_info *dev_info = NULL; struct device_node *np = pdev->dev.of_node; struct resource *r; - struct arm_smccc_res ret; + struct irq_desc *desc; dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL); if (!dev_info) @@ -470,9 +483,19 @@ static int intel_edac_l3_probe(struct platform_device *pdev) */ arm_smccc_smc(0xc4000027, CCN_MN_ERRINT_STATUS__PMU_EVENTS__DISABLE, 0, 0, 0, 0, 0, 0, &ret); + trace_edacl3_smc_results(&ret); + + if (ret.a0 != ARM_SMCCC_UNKNOWN) { + irqreturn_t res; - if (ret.a0 != ARM_SMCCC_UNKNOWN) dev_info->irq_used = 1; + /* clear all error from earlier boot stage */ + res = collect_and_clean(dev_info, 0); + arm_smccc_smc(0xc4000027, + CCN_MN_ERRINT_STATUS__INTREQ__DESSERT, + 0, 0, 0, 0, 0, 0, &ret); + trace_edacl3_smc_results(&ret); + } dev_info->edac_dev->pvt_info = dev_info; dev_info->edac_dev->dev = &dev_info->pdev->dev; @@ -501,6 +524,9 @@ static int intel_edac_l3_probe(struct platform_device *pdev) goto err2; } + desc = irq_to_desc(r->start); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; err2: edac_device_free_ctl_info(dev_info->edac_dev); diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c index c9429ee..fba04d1 100644 --- a/drivers/edac/axxia_edac-mc_56xx.c +++ b/drivers/edac/axxia_edac-mc_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-mc.c * - * EDAC Driver for Intel's Axxia 5600 System Memory Controller + * EDAC Driver for Intel's Axxia 5600/6700 System Memory Controller * * Copyright (C) 2016 Intel Inc. 
* @@ -29,6 +29,9 @@ #include "edac_core.h" #include "edac_module.h" +#define CREATE_TRACE_POINTS +#include <trace/events/edac_mc.h> + #define FMT "%s: syscon lookup failed hence using hardcoded register address\n" #define MPR_FMT9 "\n%3d %#010x %#010x %#010x %#010x"\ @@ -91,15 +94,39 @@ #define MPR_PAGE_BYTES 4 #define MPR_ERRORS 2 /* CRC, CA Parity error */ -#define SM_INT_MASK_LOW (0xfbbfef01) +#define INT_BIT_0 (0x00000001) +#define INT_BIT_1 (0x00000002) +#define INT_BIT_2 (0x00000004) +#define INT_BIT_3 (0x00000008) +#define INT_BIT_4 (0x00000010) +#define INT_BIT_5 (0x00000020) +#define INT_BIT_6 (0x00000040) +#define INT_BIT_7 (0x00000080) +#define INT_BIT_12 (0x00001000) +#define INT_BIT_22 (0x00400000) +#define INT_BIT_24 (0x01000000) +#define INT_BIT_26 (0x04000000) + + +#define SM_INT_MASK_LOW (~(\ + INT_BIT_1 |\ + INT_BIT_2 |\ + INT_BIT_3 |\ + INT_BIT_4 |\ + INT_BIT_5 |\ + INT_BIT_6 |\ + INT_BIT_7 |\ + INT_BIT_12 |\ + INT_BIT_22 |\ + INT_BIT_24 |\ + INT_BIT_26)) + #define SM_INT_MASK_ALL_LOW (0xffffffff) -#define SM_INT_MASK_HIGH (0x1) -#define SM_INT_MASK_ALL_HIGH (0x7) +#define SM_INT_MASK_HIGH (INT_BIT_0) +#define SM_INT_MASK_ALL_HIGH (INT_BIT_0|INT_BIT_1|INT_BIT_2) #define ALIVE_NOTIFICATION_PERIOD (90*1000) -static int log = 1; -module_param(log, int, 0644); -MODULE_PARM_DESC(log, "Log each error to kernel log."); +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0}; static int force_restart = 1; module_param(force_restart, int, 0644); @@ -158,7 +185,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0); * has occurred. * Bit [0] = The memory reset is valid on the DFI bus. * - * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest. + * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22, 24 and 26 are of interest. 
*/ /* @@ -559,15 +586,15 @@ static char *block_name[] = { static const u32 event_mask[NR_EVENTS] = { - [EV_ILLEGAL] = 0x00000002, - [EV_MULT_ILLEGAL] = 0x00000004, - [EV_CORR_ECC] = 0x00000008, - [EV_MULT_CORR_ECC] = 0x00000010, - [EV_UNCORR_ECC] = 0x00000020, - [EV_MULT_UNCORR_ECC] = 0x00000040, - [EV_PORT_ERROR] = 0x00000080, - [EV_WRAP_ERROR] = 0x00001000, - [EV_PARITY_ERROR] = 0x00400000, + [EV_ILLEGAL] = INT_BIT_1, + [EV_MULT_ILLEGAL] = INT_BIT_2, + [EV_CORR_ECC] = INT_BIT_3, + [EV_MULT_CORR_ECC] = INT_BIT_4, + [EV_UNCORR_ECC] = INT_BIT_5, + [EV_MULT_UNCORR_ECC] = INT_BIT_6, + [EV_PORT_ERROR] = INT_BIT_7, + [EV_WRAP_ERROR] = INT_BIT_12, + [EV_PARITY_ERROR] = INT_BIT_22, }; static const struct event_logging { @@ -833,8 +860,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) (u8 (*)[MPR_PAGE_BYTES]) (&edac_dev->data->mpr.dram_0_page[0]); int i; - for (i = 0; i < MAX_DQ; ++i) + for (i = 0; i < edac_dev->data->dram_count; ++i) { inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]); + trace_edac_mc_dump_processed(edac_dev->sm_region >> 16, + cs, i, (int) dram[i][3]); + } } @@ -845,6 +875,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) unsigned long flags; u32 regval; int i; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + u32 node = edac_dev->sm_region >> 16; +#endif mpr->mpr_page_id = page; @@ -855,9 +888,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_0_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 0, + (int) mpr->dram_0_page[i]); +#endif mpr->dram_1_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 1, + (int) mpr->dram_1_page[i]); +#endif mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 2, + (int) mpr->dram_2_page[i]); +#endif 
mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 3, + (int) mpr->dram_3_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_59 + (0x14 * i)), @@ -865,9 +914,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_4_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 4, + (int) mpr->dram_4_page[i]); +#endif mpr->dram_5_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 5, + (int) mpr->dram_5_page[i]); +#endif mpr->dram_6_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 6, + (int) mpr->dram_6_page[i]); +#endif mpr->dram_7_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 7, + (int) mpr->dram_7_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_60 + (0x14 * i)), @@ -875,11 +940,27 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_8_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 8, + (int) mpr->dram_8_page[i]); +#endif if (edac_dev->data->dram_count == MAX_DQ) { mpr->dram_9_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 9, + (int) mpr->dram_9_page[i]); +#endif mpr->dram_10_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 10, + (int) mpr->dram_10_page[i]); +#endif mpr->dram_11_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 11, + (int) mpr->dram_11_page[i]); +#endif if (ncr_read(edac_dev->sm_region, 
(SM_56XX_DENALI_CTL_60 + @@ -887,9 +968,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_12_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 12, + (int) mpr->dram_12_page[i]); +#endif mpr->dram_13_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 13, + (int) mpr->dram_13_page[i]); +#endif mpr->dram_14_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 14, + (int) mpr->dram_14_page[i]); +#endif mpr->dram_15_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 15, + (int) mpr->dram_15_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_61 + @@ -897,7 +994,15 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_16_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 16, + (int) mpr->dram_16_page[i]); +#endif mpr->dram_17_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 17, + (int) mpr->dram_17_page[i]); +#endif } } raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags); @@ -945,24 +1050,30 @@ smmon_isr_sw(int interrupt, void *device) 4, (u32 *) &denali_ctl_367)) goto error_read; - if (denali_ctl_367.int_status & 0x4) { + trace_edac_mc_int_status(dev_info->sm_region >> 16, 0, + denali_ctl_367.int_status); + + if (denali_ctl_367.int_status & INT_BIT_2) { if (ncr_read(dev_info->sm_region, SM_56XX_DENALI_CTL_366, 4, (u32 *) &denali_ctl_366)) goto error_read; + trace_edac_mc_int_status(dev_info->sm_region >> 16, 1, + denali_ctl_366.int_status); + handle_events(dev_info, &denali_ctl_366); atomic_set(&dev_info->data->event_ready, 1); 
wake_up(&dev_info->data->event_wq); denali_ctl_368.int_ack = - (denali_ctl_366.int_status & 0xf8ffffff); + (denali_ctl_366.int_status & (~(INT_BIT_26))); if (dev_info->is_ddr4) { - if (denali_ctl_366.int_status & 0x4000000) { + if (denali_ctl_366.int_status & INT_BIT_26) { atomic_set(&dev_info->data->dump_ready, 1); wake_up(&dev_info->data->dump_wq); - denali_ctl_368.int_ack |= 0x4000000; + denali_ctl_368.int_ack |= INT_BIT_26; } } if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_368, @@ -970,12 +1081,12 @@ smmon_isr_sw(int interrupt, void *device) goto error_write; } - if (denali_ctl_367.int_status & 0x2) { + if (denali_ctl_367.int_status & INT_BIT_1) { if (dev_info->is_ddr4) { atomic_inc(&dev_info->data->dump_in_progress); wake_up(&dev_info->data->dump_wq); } - denali_ctl_369.int_ack = 0x2; + denali_ctl_369.int_ack = INT_BIT_1; if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_369, 4, (u32 *) &denali_ctl_369)) goto error_write; @@ -1036,6 +1147,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev) SM_56XX_DENALI_CTL_57, 4, (u32 *) &denali_ctl_57)) goto error_write; + + trace_edac_mc_dump_triggered(dev_info->sm_region >> 16, i); + /* wait */ wait_event(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_ready)); @@ -1221,6 +1335,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) struct sm_56xx_denali_ctl_371 denali_ctl_371; int cs_count = MAX_CS; int dram_count = MAX_DQ; + struct irq_desc *desc; count = atomic64_inc_return(&mc_counter); if ((count - 1) == MEMORY_CONTROLLERS) @@ -1402,14 +1517,14 @@ static int intel_edac_mc_probe(struct platform_device *pdev) "%s-mon", dev_info->ctl_name); dev_info->wq_events = - alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-events", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_events) goto err_nosysfs; if (dev_info->is_ddr4) { dev_info->wq_alerts = - alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-alerts", 0, 1, 
(dev_info->ctl_name)); if (!dev_info->wq_alerts) @@ -1421,8 +1536,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev) INIT_WORK(&dev_info->offload_events, axxia_events_work); if (dev_info->is_ddr4) - queue_work(dev_info->wq_alerts, &dev_info->offload_alerts); - queue_work(dev_info->wq_events, &dev_info->offload_events); + queue_work_on(0, dev_info->wq_alerts, + &dev_info->offload_alerts); + queue_work_on(0, dev_info->wq_events, &dev_info->offload_events); irq = platform_get_irq(pdev, 0); if (irq < 0) { @@ -1436,8 +1552,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) if (dev_info->is_ddr4) denali_ctl_370.int_mask = SM_INT_MASK_LOW; else - denali_ctl_370.int_mask = SM_INT_MASK_LOW | - 0x04000000; + denali_ctl_370.int_mask = SM_INT_MASK_LOW | INT_BIT_26; if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_370, 4, (u32 *) &denali_ctl_370)) { @@ -1488,6 +1603,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev) } goto err_noirq; } + desc = irq_to_desc(irq); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; err_noirq: diff --git a/include/trace/events/edac_cmc.h b/include/trace/events/edac_cmc.h new file mode 100644 index 0000000..143d58f --- /dev/null +++ b/include/trace/events/edac_cmc.h @@ -0,0 +1,101 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM edac_cmc + +#if !defined(_TRACE_EDAC_CMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EDAC_CMC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + + +TRACE_EVENT(edac_cmc_int_status, + TP_PROTO(u32 node, u32 int_status), + + TP_ARGS(node, int_status), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, int_status) + ), + + TP_fast_assign( + __entry->node = node; + __entry->int_status = int_status; + ), + + TP_printk("CMEM(node=0x%x) int_status=0x%08x", + (u32) __entry->node, (u32) __entry->int_status) +); + +TRACE_EVENT(edac_cmc_dump_processed, + TP_PROTO(u32 node, u32 cs, u32 dram, u32 val), + + TP_ARGS(node, cs, dram, val), + + 
TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("CMEM(node=0x%x) cs=%d dram=%d value=0x%x", + (u32) __entry->node, (u32) __entry->cs, + (u32) __entry->dram, (u32) __entry->val) +); + +TRACE_EVENT(edac_cmc_dump_collected, + TP_PROTO(u32 node, u32 cs, u32 byte, u32 dram, u32 val), + + TP_ARGS(node, cs, byte, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, byte) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->byte = byte; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("CMEM(node=0x%x) cs=%d dram_%d_page[byte=%d]=0x%x", + (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram, + (u32) __entry->byte, (u32) __entry->val) +); + +TRACE_EVENT(edac_cmc_dump_triggered, + TP_PROTO(u32 node, u32 cs), + + TP_ARGS(node, cs), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + ), + + TP_printk("CMEM(node=0x%x) cs=%d", + (u32) __entry->node, (u32) __entry->cs) +); +#endif /* _TRACE_EDAC_CMC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/edac_mc.h b/include/trace/events/edac_mc.h new file mode 100644 index 0000000..fa09fda --- /dev/null +++ b/include/trace/events/edac_mc.h @@ -0,0 +1,104 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM edac_mc + +#if !defined(_TRACE_EDAC_MC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EDAC_MC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + + +TRACE_EVENT(edac_mc_int_status, + TP_PROTO(u32 node, u32 idx, u32 int_status), + + TP_ARGS(node, idx, int_status), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, idx) + __field(u32, int_status) + ), + + 
TP_fast_assign( + __entry->node = node; + __entry->idx = idx; + __entry->int_status = int_status; + ), + + TP_printk("SMEM(node=0x%x) int_status[%d]=0x%08x", + (u32) __entry->node, (u32) __entry->idx, + (u32) __entry->int_status) +); + +TRACE_EVENT(edac_mc_dump_processed, + TP_PROTO(u32 node, u32 cs, u32 dram, u32 val), + + TP_ARGS(node, cs, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("SMEM(node=0x%x) cs=%d dram=%d value=0x%x", + (u32) __entry->node, (u32) __entry->cs, + (u32) __entry->dram, (u32) __entry->val) +); + +TRACE_EVENT(edac_mc_dump_collected, + TP_PROTO(u32 node, u32 cs, u32 byte, u32 dram, u32 val), + + TP_ARGS(node, cs, byte, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, byte) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->byte = byte; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("SMEM(node=0x%x) cs=%d dram_%d_page[byte=%d]=0x%x", + (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram, + (u32) __entry->byte, (u32) __entry->val) +); + +TRACE_EVENT(edac_mc_dump_triggered, + TP_PROTO(u32 node, u32 cs), + + TP_ARGS(node, cs), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + ), + + TP_printk("SMEM(node=0x%x) cs=%d", + (u32) __entry->node, (u32) __entry->cs) +); +#endif /* _TRACE_EDAC_MC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- 2.7.4 -- _______________________________________________ linux-yocto mailing list [email protected] https://lists.yoctoproject.org/listinfo/linux-yocto
