Re: [PATCH v3 2/4] edac: Add support for Amazon's Annapurna Labs L1 EDAC

2019-09-03 Thread Hawa, Hanna




On 9/3/2019 10:24 AM, Robert Richter wrote:

On 15.07.19 16:24:07, Hanna Hawa wrote:

Adds support for Amazon's Annapurna Labs L1 EDAC driver to detect and
report L1 errors.

Signed-off-by: Hanna Hawa 
Reviewed-by: James Morse 
---
  MAINTAINERS   |   6 ++
  drivers/edac/Kconfig  |   8 +++
  drivers/edac/Makefile |   1 +
  drivers/edac/al_l1_edac.c | 156 ++
  4 files changed, 171 insertions(+)
  create mode 100644 drivers/edac/al_l1_edac.c



diff --git a/drivers/edac/al_l1_edac.c b/drivers/edac/al_l1_edac.c
new file mode 100644
index 000..70510ea
--- /dev/null
+++ b/drivers/edac/al_l1_edac.c


[...]


+static void al_l1_edac_cpumerrsr(void *arg)


Could this being named to something meaningful, such as
*_read_status() or so?


+{
+   struct edac_device_ctl_info *edac_dev = arg;
+   int cpu, i;
+   u32 ramid, repeat, other, fatal;
+   u64 val = read_sysreg_s(ARM_CA57_CPUMERRSR_EL1);
+   char msg[AL_L1_EDAC_MSG_MAX];
+   int space, count;
+   char *p;
+
+   if (!(FIELD_GET(ARM_CA57_CPUMERRSR_VALID, val)))
+   return;


[...]


+static void al_l1_edac_check(struct edac_device_ctl_info *edac_dev)
+{
+   on_each_cpu(al_l1_edac_cpumerrsr, edac_dev, 1);
+}
+
+static int al_l1_edac_probe(struct platform_device *pdev)
+{
+   struct edac_device_ctl_info *edac_dev;
+   struct device *dev = >dev;
+   int ret;
+
+   edac_dev = edac_device_alloc_ctl_info(0, (char *)dev_name(dev), 1, "L",


This type cast looks broken. dev_name() is a constant string already.

Other drivers do not use the dynamically generated dev_name() string
here, instead a fix string such as mod_name or ctl_name could be used.
edac_device_alloc_ctl_info() later generates a unique instance name
derived from name + index.


Ack, will fix and use DRV_NAME.



Regarding the type, this seems to be an API issue of edac_device_
alloc_ctl_info() that should actually use const char* in its
interface. So if needed (from what I wrote above it is not) the type
in the argument list needs to be changed instead.


+ 1, 1, NULL, 0,
+ edac_device_alloc_index());
+   if (IS_ERR(edac_dev))
+   return -ENOMEM;


Use the original error code instead.


Actually it return NULL in case of failure, it was changed in v5 to 
check if error/NULL.





+
+   edac_dev->edac_check = al_l1_edac_check;
+   edac_dev->dev = dev;
+   edac_dev->mod_name = DRV_NAME;
+   edac_dev->dev_name = dev_name(dev);
+   edac_dev->ctl_name = "L1 cache";


Should not contain spaces and maybe a bit more specific.


L1_cache_ecc_err? or L1_cache_err?




+   platform_set_drvdata(pdev, edac_dev);
+
+   ret = edac_device_add_device(edac_dev);
+   if (ret) {
+   dev_err(dev, "Failed to add L1 edac device\n");


Move this printk below to the error path and maybe print the error
code. You do not cover the -ENOMEM failure.


Ack.



-Robert


+   goto err;
+   }
+
+   return 0;
+err:
+   edac_device_free_ctl_info(edac_dev);
+
+   return ret;
+}


Re: [PATCH v3 2/4] edac: Add support for Amazon's Annapurna Labs L1 EDAC

2019-09-03 Thread Robert Richter
On 15.07.19 16:24:07, Hanna Hawa wrote:
> Adds support for Amazon's Annapurna Labs L1 EDAC driver to detect and
> report L1 errors.
> 
> Signed-off-by: Hanna Hawa 
> Reviewed-by: James Morse 
> ---
>  MAINTAINERS   |   6 ++
>  drivers/edac/Kconfig  |   8 +++
>  drivers/edac/Makefile |   1 +
>  drivers/edac/al_l1_edac.c | 156 
> ++
>  4 files changed, 171 insertions(+)
>  create mode 100644 drivers/edac/al_l1_edac.c

> diff --git a/drivers/edac/al_l1_edac.c b/drivers/edac/al_l1_edac.c
> new file mode 100644
> index 000..70510ea
> --- /dev/null
> +++ b/drivers/edac/al_l1_edac.c

[...]

> +static void al_l1_edac_cpumerrsr(void *arg)

Could this being named to something meaningful, such as
*_read_status() or so?

> +{
> + struct edac_device_ctl_info *edac_dev = arg;
> + int cpu, i;
> + u32 ramid, repeat, other, fatal;
> + u64 val = read_sysreg_s(ARM_CA57_CPUMERRSR_EL1);
> + char msg[AL_L1_EDAC_MSG_MAX];
> + int space, count;
> + char *p;
> +
> + if (!(FIELD_GET(ARM_CA57_CPUMERRSR_VALID, val)))
> + return;

[...]

> +static void al_l1_edac_check(struct edac_device_ctl_info *edac_dev)
> +{
> + on_each_cpu(al_l1_edac_cpumerrsr, edac_dev, 1);
> +}
> +
> +static int al_l1_edac_probe(struct platform_device *pdev)
> +{
> + struct edac_device_ctl_info *edac_dev;
> + struct device *dev = >dev;
> + int ret;
> +
> + edac_dev = edac_device_alloc_ctl_info(0, (char *)dev_name(dev), 1, "L",

This type cast looks broken. dev_name() is a constant string already.

Other drivers do not use the dynamically generated dev_name() string
here, instead a fix string such as mod_name or ctl_name could be used.
edac_device_alloc_ctl_info() later generates a unique instance name
derived from name + index.

Regarding the type, this seems to be an API issue of edac_device_
alloc_ctl_info() that should actually use const char* in its
interface. So if needed (from what I wrote above it is not) the type
in the argument list needs to be changed instead.

> +   1, 1, NULL, 0,
> +   edac_device_alloc_index());
> + if (IS_ERR(edac_dev))
> + return -ENOMEM;

Use the original error code instead.

> +
> + edac_dev->edac_check = al_l1_edac_check;
> + edac_dev->dev = dev;
> + edac_dev->mod_name = DRV_NAME;
> + edac_dev->dev_name = dev_name(dev);
> + edac_dev->ctl_name = "L1 cache";

Should not contain spaces and maybe a bit more specific.

> + platform_set_drvdata(pdev, edac_dev);
> +
> + ret = edac_device_add_device(edac_dev);
> + if (ret) {
> + dev_err(dev, "Failed to add L1 edac device\n");

Move this printk below to the error path and maybe print the error
code. You do not cover the -ENOMEM failure.

-Robert

> + goto err;
> + }
> +
> + return 0;
> +err:
> + edac_device_free_ctl_info(edac_dev);
> +
> + return ret;
> +}


Re: [PATCH v3 2/4] edac: Add support for Amazon's Annapurna Labs L1 EDAC

2019-08-01 Thread Hawa, Hanna



On 7/26/2019 7:49 PM, James Morse wrote:

Hi Hanna,

On 15/07/2019 14:24, Hanna Hawa wrote:

Adds support for Amazon's Annapurna Labs L1 EDAC driver to detect and
report L1 errors.



diff --git a/drivers/edac/al_l1_edac.c b/drivers/edac/al_l1_edac.c
new file mode 100644
index 000..70510ea
--- /dev/null
+++ b/drivers/edac/al_l1_edac.c
@@ -0,0 +1,156 @@



+#include 


You need  for on-each_cpu().


+#include "edac_device.h"
+#include "edac_module.h"


You need  for the sys_reg() macro. The ARCH_ALPINE dependency 
doesn't stop
this from being built on 32bit arm, where this sys_reg() won't work/exist.


Will fix.



[...]


+static void al_l1_edac_cpumerrsr(void *arg)
+{
+   struct edac_device_ctl_info *edac_dev = arg;
+   int cpu, i;
+   u32 ramid, repeat, other, fatal;
+   u64 val = read_sysreg_s(ARM_CA57_CPUMERRSR_EL1);
+   char msg[AL_L1_EDAC_MSG_MAX];
+   int space, count;
+   char *p;
+   if (!(FIELD_GET(ARM_CA57_CPUMERRSR_VALID, val)))
+   return;
+   space = sizeof(msg);
+   p = msg;
+   count = snprintf(p, space, "CPU%d L1 %serror detected", cpu,
+(fatal) ? "Fatal " : "");
+   p += count;
+   space -= count;


snprintf() will return the number of characters it would have generated, even 
if that is
more than space. If this happen, space becomes negative, p points outside msg[] 
and msg[]
isn't NULL terminated...

It looks like you want scnprintf(), which returns the number of characters 
written to buf
instead. (I don't see how 256 characters would be printed by this code)


Will use scnprintf() instead.





+   switch (ramid) {
+   case ARM_CA57_L1_I_TAG_RAM:
+   count = snprintf(p, space, " RAMID='L1-I Tag RAM'");
+   break;
+   case ARM_CA57_L1_I_DATA_RAM:
+   count = snprintf(p, space, " RAMID='L1-I Data RAM'");
+   break;
+   case ARM_CA57_L1_D_TAG_RAM:
+   count = snprintf(p, space, " RAMID='L1-D Tag RAM'");
+   break;
+   case ARM_CA57_L1_D_DATA_RAM:
+   count = snprintf(p, space, " RAMID='L1-D Data RAM'");
+   break;
+   case ARM_CA57_L2_TLB_RAM:
+   count = snprintf(p, space, " RAMID='L2 TLB RAM'");
+   break;
+   default:
+   count = snprintf(p, space, " RAMID='unknown'");
+   break;
+   }
+
+   p += count;
+   space -= count;
+   count = snprintf(p, space,
+" repeat=%d, other=%d (CPUMERRSR_EL1=0x%llx)",
+repeat, other, val);
+
+   for (i = 0; i < repeat; i++) {
+   if (fatal)
+   edac_device_handle_ue(edac_dev, 0, 0, msg);
+   else
+   edac_device_handle_ce(edac_dev, 0, 0, msg);
+   }
+
+   write_sysreg_s(0, ARM_CA57_CPUMERRSR_EL1);


Writing 0 just after you've read the value would minimise the window where 
repeat could
have increased behind your back, or another error was counted as other, when it 
could have
been reported more accurately.


Good point, I will move the write after checking the valid bit.





+}




+static int al_l1_edac_probe(struct platform_device *pdev)
+{
+   struct edac_device_ctl_info *edac_dev;
+   struct device *dev = >dev;
+   int ret;
+
+   edac_dev = edac_device_alloc_ctl_info(0, (char *)dev_name(dev), 1, "L",
+ 1, 1, NULL, 0,
+ edac_device_alloc_index());
+   if (IS_ERR(edac_dev))


edac_device_alloc_ctl_info() returns NULL, or dev_ctl, which comes from 
kzalloc(). I think
you need to check for NULL here, IS_ERR() only catches the -errno range. (there 
is an
IS_ERR_OR_NULL() if you really needed both)


Will fix.





+   return -ENOMEM;



With the header-includes and edac_device_alloc_ctl_info() NULL check:
Reviewed-by: James Morse 


Thanks James.

Thanks,
Hanna



Thanks,

James



Re: [PATCH v3 2/4] edac: Add support for Amazon's Annapurna Labs L1 EDAC

2019-07-26 Thread James Morse
Hi Hanna,

On 15/07/2019 14:24, Hanna Hawa wrote:
> Adds support for Amazon's Annapurna Labs L1 EDAC driver to detect and
> report L1 errors.

> diff --git a/drivers/edac/al_l1_edac.c b/drivers/edac/al_l1_edac.c
> new file mode 100644
> index 000..70510ea
> --- /dev/null
> +++ b/drivers/edac/al_l1_edac.c
> @@ -0,0 +1,156 @@

> +#include 

You need  for on-each_cpu().

> +#include "edac_device.h"
> +#include "edac_module.h"

You need  for the sys_reg() macro. The ARCH_ALPINE dependency 
doesn't stop
this from being built on 32bit arm, where this sys_reg() won't work/exist.

[...]

> +static void al_l1_edac_cpumerrsr(void *arg)
> +{
> + struct edac_device_ctl_info *edac_dev = arg;
> + int cpu, i;
> + u32 ramid, repeat, other, fatal;
> + u64 val = read_sysreg_s(ARM_CA57_CPUMERRSR_EL1);
> + char msg[AL_L1_EDAC_MSG_MAX];
> + int space, count;
> + char *p;
> + if (!(FIELD_GET(ARM_CA57_CPUMERRSR_VALID, val)))
> + return;
> + space = sizeof(msg);
> + p = msg;
> + count = snprintf(p, space, "CPU%d L1 %serror detected", cpu,
> +  (fatal) ? "Fatal " : "");
> + p += count;
> + space -= count;

snprintf() will return the number of characters it would have generated, even 
if that is
more than space. If this happen, space becomes negative, p points outside msg[] 
and msg[]
isn't NULL terminated...

It looks like you want scnprintf(), which returns the number of characters 
written to buf
instead. (I don't see how 256 characters would be printed by this code)


> + switch (ramid) {
> + case ARM_CA57_L1_I_TAG_RAM:
> + count = snprintf(p, space, " RAMID='L1-I Tag RAM'");
> + break;
> + case ARM_CA57_L1_I_DATA_RAM:
> + count = snprintf(p, space, " RAMID='L1-I Data RAM'");
> + break;
> + case ARM_CA57_L1_D_TAG_RAM:
> + count = snprintf(p, space, " RAMID='L1-D Tag RAM'");
> + break;
> + case ARM_CA57_L1_D_DATA_RAM:
> + count = snprintf(p, space, " RAMID='L1-D Data RAM'");
> + break;
> + case ARM_CA57_L2_TLB_RAM:
> + count = snprintf(p, space, " RAMID='L2 TLB RAM'");
> + break;
> + default:
> + count = snprintf(p, space, " RAMID='unknown'");
> + break;
> + }
> +
> + p += count;
> + space -= count;
> + count = snprintf(p, space,
> +  " repeat=%d, other=%d (CPUMERRSR_EL1=0x%llx)",
> +  repeat, other, val);
> +
> + for (i = 0; i < repeat; i++) {
> + if (fatal)
> + edac_device_handle_ue(edac_dev, 0, 0, msg);
> + else
> + edac_device_handle_ce(edac_dev, 0, 0, msg);
> + }
> +
> + write_sysreg_s(0, ARM_CA57_CPUMERRSR_EL1);

Writing 0 just after you've read the value would minimise the window where 
repeat could
have increased behind your back, or another error was counted as other, when it 
could have
been reported more accurately.


> +}


> +static int al_l1_edac_probe(struct platform_device *pdev)
> +{
> + struct edac_device_ctl_info *edac_dev;
> + struct device *dev = >dev;
> + int ret;
> +
> + edac_dev = edac_device_alloc_ctl_info(0, (char *)dev_name(dev), 1, "L",
> +   1, 1, NULL, 0,
> +   edac_device_alloc_index());
> + if (IS_ERR(edac_dev))

edac_device_alloc_ctl_info() returns NULL, or dev_ctl, which comes from 
kzalloc(). I think
you need to check for NULL here, IS_ERR() only catches the -errno range. (there 
is an
IS_ERR_OR_NULL() if you really needed both)


> + return -ENOMEM;


With the header-includes and edac_device_alloc_ctl_info() NULL check:
Reviewed-by: James Morse 


Thanks,

James


[PATCH v3 2/4] edac: Add support for Amazon's Annapurna Labs L1 EDAC

2019-07-15 Thread Hanna Hawa
Adds support for Amazon's Annapurna Labs L1 EDAC driver to detect and
report L1 errors.

Signed-off-by: Hanna Hawa 
---
 MAINTAINERS   |   6 ++
 drivers/edac/Kconfig  |   8 +++
 drivers/edac/Makefile |   1 +
 drivers/edac/al_l1_edac.c | 156 ++
 4 files changed, 171 insertions(+)
 create mode 100644 drivers/edac/al_l1_edac.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 77eae44..fd29ea6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,12 @@ F: drivers/tty/serial/altera_jtaguart.c
 F: include/linux/altera_uart.h
 F: include/linux/altera_jtaguart.h
 
+AMAZON ANNAPURNA LABS L1 EDAC
+M: Hanna Hawa 
+S: Maintained
+F: drivers/edac/al_l1_edac.c
+F: Documentation/devicetree/bindings/edac/amazon,al-l1-edac.txt
+
 AMAZON ANNAPURNA LABS THERMAL MMIO DRIVER
 M: Talel Shenhar 
 S: Maintained
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 200c04c..58b92bc 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -74,6 +74,14 @@ config EDAC_GHES
 
  In doubt, say 'Y'.
 
+config EDAC_AL_L1
+   bool "Amazon's Annapurna Labs L1 EDAC"
+   depends on ARCH_ALPINE
+   help
+ Support for L1 error detection and correction
+ for Amazon's Annapurna Labs SoCs.
+ This driver detects errors of L1 caches.
+
 config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64)"
depends on AMD_NB && EDAC_DECODE_MCE
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 165ca65e..caa2dc9 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_EDAC_GHES)   += ghes_edac.o
 edac_mce_amd-y := mce_amd.o
 obj-$(CONFIG_EDAC_DECODE_MCE)  += edac_mce_amd.o
 
+obj-$(CONFIG_EDAC_AL_L1)   += al_l1_edac.o
 obj-$(CONFIG_EDAC_AMD76X)  += amd76x_edac.o
 obj-$(CONFIG_EDAC_CPC925)  += cpc925_edac.o
 obj-$(CONFIG_EDAC_I5000)   += i5000_edac.o
diff --git a/drivers/edac/al_l1_edac.c b/drivers/edac/al_l1_edac.c
new file mode 100644
index 000..70510ea
--- /dev/null
+++ b/drivers/edac/al_l1_edac.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+
+#include 
+
+#include "edac_device.h"
+#include "edac_module.h"
+
+#define DRV_NAME   "al_l1_edac"
+
+/* Same bit assignments of CPUMERRSR_EL1 in ARM CA57/CA72 */
+#define ARM_CA57_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2)
+#define ARM_CA57_CPUMERRSR_RAM_ID  GENMASK(30, 24)
+#define  ARM_CA57_L1_I_TAG_RAM 0x00
+#define  ARM_CA57_L1_I_DATA_RAM0x01
+#define  ARM_CA57_L1_D_TAG_RAM 0x08
+#define  ARM_CA57_L1_D_DATA_RAM0x09
+#define  ARM_CA57_L2_TLB_RAM   0x18
+#define ARM_CA57_CPUMERRSR_VALID   BIT(31)
+#define ARM_CA57_CPUMERRSR_REPEAT  GENMASK_ULL(39, 32)
+#define ARM_CA57_CPUMERRSR_OTHER   GENMASK_ULL(47, 40)
+#define ARM_CA57_CPUMERRSR_FATAL   BIT_ULL(63)
+
+#define AL_L1_EDAC_MSG_MAX 256
+
+static void al_l1_edac_cpumerrsr(void *arg)
+{
+   struct edac_device_ctl_info *edac_dev = arg;
+   int cpu, i;
+   u32 ramid, repeat, other, fatal;
+   u64 val = read_sysreg_s(ARM_CA57_CPUMERRSR_EL1);
+   char msg[AL_L1_EDAC_MSG_MAX];
+   int space, count;
+   char *p;
+
+   if (!(FIELD_GET(ARM_CA57_CPUMERRSR_VALID, val)))
+   return;
+
+   cpu = smp_processor_id();
+   ramid = FIELD_GET(ARM_CA57_CPUMERRSR_RAM_ID, val);
+   repeat = FIELD_GET(ARM_CA57_CPUMERRSR_REPEAT, val);
+   other = FIELD_GET(ARM_CA57_CPUMERRSR_OTHER, val);
+   fatal = FIELD_GET(ARM_CA57_CPUMERRSR_FATAL, val);
+
+   space = sizeof(msg);
+   p = msg;
+   count = snprintf(p, space, "CPU%d L1 %serror detected", cpu,
+(fatal) ? "Fatal " : "");
+   p += count;
+   space -= count;
+
+   switch (ramid) {
+   case ARM_CA57_L1_I_TAG_RAM:
+   count = snprintf(p, space, " RAMID='L1-I Tag RAM'");
+   break;
+   case ARM_CA57_L1_I_DATA_RAM:
+   count = snprintf(p, space, " RAMID='L1-I Data RAM'");
+   break;
+   case ARM_CA57_L1_D_TAG_RAM:
+   count = snprintf(p, space, " RAMID='L1-D Tag RAM'");
+   break;
+   case ARM_CA57_L1_D_DATA_RAM:
+   count = snprintf(p, space, " RAMID='L1-D Data RAM'");
+   break;
+   case ARM_CA57_L2_TLB_RAM:
+   count = snprintf(p, space, " RAMID='L2 TLB RAM'");
+   break;
+   default:
+   count = snprintf(p, space, " RAMID='unknown'");
+   break;
+   }
+
+   p += count;
+   space -= count;
+   count = snprintf(p, space,
+