Re: [PATCH 06/19] EDAC, mc: Remove per layer counters

2019-10-14 Thread Robert Richter
On 11.10.19 07:40:31, Mauro Carvalho Chehab wrote:
> Em Thu, 10 Oct 2019 20:25:16 +
> Robert Richter  escreveu:
> 
> > Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> > turns out that only the leaves in the memory hierarchy are consumed
> > (in sysfs), but not the intermediate layers, e.g.:
> > 
> >  count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> > 
> > These unused counters only add complexity, remove them. The error
> > counter values are directly stored in struct dimm_info now.
> 
> Hmm... not sure if this patch is correct. I remember that there are some
> border cases on some drivers (maybe the 3-layer drivers used together
> with RDIMM memory controllers?) where some errors are not associated
> to an specific dimm, but, instead, are related to a problem at the memory
> bus.
> 
> Also, depending on how the memory controllers are organized[1], the ECC
> logic groups memory on DIMM pairs. So, when an error occur, it may be
> either at DIMM1 or DIMM2.
> 
> [1] On Intel, this happens with pre-Nehalem memory controllers.
> 
> Due to that, storing errors at the dimm struct sounds wrong, as the
> error may affect multiple DIMMs or even the entire layer.

That was my first thought too, but the counter values are not used at
all. The only exception is to provide *per-dimm* counters here:

 {ce,ue}_per_layer[n_layers-1][dimm->idx]. 

 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc_sysfs.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n567
 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc_sysfs.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n584

The case you mentioned above is if the mc only sends parts of the
error location (with a top, mid or low layer missing). The dimm cannot
be identified then. In this case edac_mc_handle_error() tries to find
a unique row (+ channel infomation if available and lists all possible
dimm labels in e->label. See:

 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc.c?id=4f5cafb5cb8471e54afdc9054d973535614f7675#n1153

Thus, we see a counter increment for row (and also channel if it can
be identified), but this is counted in mci->csrows array only that is
not removed by this patch.

That said, the {ue,ce}_per_layer[] arrays can be removed by keeping
the same driver functionality, esp. the case you mentioned above.

-Robert


Re: [PATCH 06/19] EDAC, mc: Remove per layer counters

2019-10-11 Thread Mauro Carvalho Chehab
Em Thu, 10 Oct 2019 20:25:16 +
Robert Richter  escreveu:

> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> turns out that only the leaves in the memory hierarchy are consumed
> (in sysfs), but not the intermediate layers, e.g.:
> 
>  count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> 
> These unused counters only add complexity, remove them. The error
> counter values are directly stored in struct dimm_info now.

Hmm... not sure if this patch is correct. I remember that there are some
border cases on some drivers (maybe the 3-layer drivers used together
with RDIMM memory controllers?) where some errors are not associated
to an specific dimm, but, instead, are related to a problem at the memory
bus.

Also, depending on how the memory controllers are organized[1], the ECC
logic groups memory on DIMM pairs. So, when an error occur, it may be
either at DIMM1 or DIMM2.

[1] On Intel, this happens with pre-Nehalem memory controllers.

Due to that, storing errors at the dimm struct sounds wrong, as the
error may affect multiple DIMMs or even the entire layer.

> 
> Signed-off-by: Robert Richter 
> ---
>  drivers/edac/edac_mc.c   | 104 ---
>  drivers/edac/edac_mc_sysfs.c |  20 +++
>  drivers/edac/ghes_edac.c |   5 +-
>  include/linux/edac.h |   7 +--
>  4 files changed, 46 insertions(+), 90 deletions(-)
> 
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 45b02bb31964..c1e142643006 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>   struct csrow_info *csr;
>   struct rank_info *chan;
>   struct dimm_info *dimm;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>   unsigned int pos[EDAC_MAX_LAYERS];
> - unsigned int idx, size, tot_dimms = 1, count = 1;
> - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> + unsigned int idx, size, tot_dimms = 1;
> + unsigned int tot_csrows = 1, tot_channels = 1;
>   void *pvt, *p, *ptr = NULL;
>   int i, j, row, chn, n, len;
>   bool per_rank = false;
> @@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>* stringent as what the compiler would provide if we could simply
>* hardcode everything into a single struct.
>*/
> - mci = edac_align_ptr(, sizeof(*mci), 1);
> - layer = edac_align_ptr(, sizeof(*layer), n_layers);
> - for (i = 0; i < n_layers; i++) {
> - count *= layers[i].size;
> - edac_dbg(4, "errcount layer %d size %d\n", i, count);
> - ce_per_layer[i] = edac_align_ptr(, sizeof(u32), count);
> - ue_per_layer[i] = edac_align_ptr(, sizeof(u32), count);
> - tot_errcount += 2 * count;
> - }
> -
> - edac_dbg(4, "allocating %d error counters\n", tot_errcount);
> - pvt = edac_align_ptr(, sz_pvt, 1);
> - size = ((unsigned long)pvt) + sz_pvt;
> + mci = edac_align_ptr(, sizeof(*mci), 1);
> + layer   = edac_align_ptr(, sizeof(*layer), n_layers);
> + pvt = edac_align_ptr(, sz_pvt, 1);
> + size= ((unsigned long)pvt) + sz_pvt;
>  
>   edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d 
> csrows/channels)\n",
>size,
> @@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
>* rather than an imaginary chunk of memory located at address 0.
>*/
>   layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned 
> long)layer));
> - for (i = 0; i < n_layers; i++) {
> - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
> long)ce_per_layer[i]));
> - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
> long)ue_per_layer[i]));
> - }
>   pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
>  
>   /* setup index and various internal pointers */
> @@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
>  EXPORT_SYMBOL_GPL(edac_layer_name);
>  
>  static void edac_inc_ce_error(struct mem_ctl_info *mci,
> -   bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
>  {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>  
>   mci->ce_mc += count;
>  
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ce_count += count;
> + else
>   mci->ce_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ce_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
>  }
>  

[PATCH 06/19] EDAC, mc: Remove per layer counters

2019-10-10 Thread Robert Richter
Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
turns out that only the leaves in the memory hierarchy are consumed
(in sysfs), but not the intermediate layers, e.g.:

 count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];

These unused counters only add complexity, remove them. The error
counter values are directly stored in struct dimm_info now.

Signed-off-by: Robert Richter 
---
 drivers/edac/edac_mc.c   | 104 ---
 drivers/edac/edac_mc_sysfs.c |  20 +++
 drivers/edac/ghes_edac.c |   5 +-
 include/linux/edac.h |   7 +--
 4 files changed, 46 insertions(+), 90 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 45b02bb31964..c1e142643006 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
struct csrow_info *csr;
struct rank_info *chan;
struct dimm_info *dimm;
-   u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
unsigned int pos[EDAC_MAX_LAYERS];
-   unsigned int idx, size, tot_dimms = 1, count = 1;
-   unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+   unsigned int idx, size, tot_dimms = 1;
+   unsigned int tot_csrows = 1, tot_channels = 1;
void *pvt, *p, *ptr = NULL;
int i, j, row, chn, n, len;
bool per_rank = false;
@@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
 * stringent as what the compiler would provide if we could simply
 * hardcode everything into a single struct.
 */
-   mci = edac_align_ptr(, sizeof(*mci), 1);
-   layer = edac_align_ptr(, sizeof(*layer), n_layers);
-   for (i = 0; i < n_layers; i++) {
-   count *= layers[i].size;
-   edac_dbg(4, "errcount layer %d size %d\n", i, count);
-   ce_per_layer[i] = edac_align_ptr(, sizeof(u32), count);
-   ue_per_layer[i] = edac_align_ptr(, sizeof(u32), count);
-   tot_errcount += 2 * count;
-   }
-
-   edac_dbg(4, "allocating %d error counters\n", tot_errcount);
-   pvt = edac_align_ptr(, sz_pvt, 1);
-   size = ((unsigned long)pvt) + sz_pvt;
+   mci = edac_align_ptr(, sizeof(*mci), 1);
+   layer   = edac_align_ptr(, sizeof(*layer), n_layers);
+   pvt = edac_align_ptr(, sz_pvt, 1);
+   size= ((unsigned long)pvt) + sz_pvt;
 
edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d 
csrows/channels)\n",
 size,
@@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
 * rather than an imaginary chunk of memory located at address 0.
 */
layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned 
long)layer));
-   for (i = 0; i < n_layers; i++) {
-   mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
long)ce_per_layer[i]));
-   mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
long)ue_per_layer[i]));
-   }
pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
 
/* setup index and various internal pointers */
@@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
 EXPORT_SYMBOL_GPL(edac_layer_name);
 
 static void edac_inc_ce_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
  const int pos[EDAC_MAX_LAYERS],
  const u16 count)
 {
-   int i, index = 0;
+   struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
 
mci->ce_mc += count;
 
-   if (!enable_per_layer_report) {
+   if (dimm)
+   dimm->ce_count += count;
+   else
mci->ce_noinfo_count += count;
-   return;
-   }
-
-   for (i = 0; i < mci->n_layers; i++) {
-   if (pos[i] < 0)
-   break;
-   index += pos[i];
-   mci->ce_per_layer[i][index] += count;
-
-   if (i < mci->n_layers - 1)
-   index *= mci->layers[i + 1].size;
-   }
 }
 
 static void edac_inc_ue_error(struct mem_ctl_info *mci,
-   bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
 {
-   int i, index = 0;
+   struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
 
mci->ue_mc += count;
 
-   if (!enable_per_layer_report) {
+   if (dimm)
+   dimm->ue_count += count;
+   else
mci->ue_noinfo_count += count;
-   return;
-   }
-
-   for (i = 0; i < mci->n_layers; i++) {
-   if (pos[i] < 0)
-   break;
-   index += pos[i];
-   mci->ue_per_layer[i][index] += count;
-
-   if (i