Re: [PATCH V3 01/10] acpi: apei: read ack upon ghes record consumption

2016-10-13 Thread Baicar, Tyler

Hello Punit,

Thank you for the feedback! Responses below

On 10/12/2016 9:39 AM, Punit Agrawal wrote:

Hi Tyler,

A few comments below.

Tyler Baicar  writes:


A RAS (Reliability, Availability, Serviceability) controller
may be a separate processor running in parallel with OS
execution, and may generate error records for consumption by
the OS. If the RAS controller produces multiple error records,
then they may be overwritten before the OS has consumed them.

The Generic Hardware Error Source (GHES) v2 structure
introduces the capability for the OS to acknowledge the
consumption of the error record generated by the RAS
controller. A RAS controller supporting GHESv2 shall wait for
the acknowledgment before writing a new error record, thus
eliminating the race condition.

Signed-off-by: Jonathan (Zhixiong) Zhang 
Signed-off-by: Richard Ruigrok 
Signed-off-by: Tyler Baicar 
Signed-off-by: Naveen Kaje 
---
  drivers/acpi/apei/ghes.c | 41 +
  drivers/acpi/apei/hest.c |  7 +--
  include/acpi/ghes.h  |  1 +
  3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 60746ef..3021f0e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -45,6 +45,7 @@
  #include 
  #include 
  
+#include 

  #include 
  #include 
  #include 
@@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
*generic)
struct ghes *ghes;
unsigned int error_block_length;
int rc;
+   struct acpi_hest_header *hest_hdr;
  
  	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);

if (!ghes)
return ERR_PTR(-ENOMEM);
+
+   hest_hdr = (struct acpi_hest_header *)generic;
+   if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) {
+   ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic;
+   rc = apei_map_generic_address(
+   >generic_v2->read_ack_register);
+   if (rc)
+   goto err_unmap;
+   } else
+   ghes->generic_v2 = NULL;

Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already?
Yes, the documentation says kzalloc returns memory set to zero, so I 
will remove this else statement.

+
ghes->generic = generic;
rc = apei_map_generic_address(>error_status_address);
if (rc)
@@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
*generic)
  
  err_unmap:

apei_unmap_generic_address(>error_status_address);
+   if (ghes->generic_v2)
+   apei_unmap_generic_address(
+   >generic_v2->read_ack_register);
  err_free:
kfree(ghes);
return ERR_PTR(rc);
@@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
  {
kfree(ghes->estatus);
apei_unmap_generic_address(>generic->error_status_address);
+   if (ghes->generic_v2)
+   apei_unmap_generic_address(
+   >generic_v2->read_ack_register);
  }
  
  static inline int ghes_severity(int severity)

@@ -648,6 +667,22 @@ static void ghes_estatus_cache_add(
rcu_read_unlock();
  }
  
+static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)

+{
+   int rc;
+   u64 val = 0;
+
+   rc = apei_read(, _v2->read_ack_register);
+   if (rc)
+   return rc;
+   val &= generic_v2->read_ack_preserve <<
+   generic_v2->read_ack_register.bit_offset;
+   val |= generic_v2->read_ack_write;

Reading the spec, it is not clear whether you need the left shift
above.

Having said that, if you do it for read_ack_preserve, do you also need
to left shift read_ack_write by read_ack_register.bit_offset?
Good catch, it looks like the read_ack_write should also get this shift. 
I'm using a shift of 0 so I didn't catch this in testing :)

+   rc = apei_write(val, _v2->read_ack_register);
+
+   return rc;
+}
+
  static int ghes_proc(struct ghes *ghes)
  {
int rc;
@@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes)
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
}
ghes_do_proc(ghes, ghes->estatus);
+
+   if (ghes->generic_v2) {
+   rc = ghes_do_read_ack(ghes->generic_v2);
+   if (rc)
+   return rc;
+   }
  out:
ghes_clear_estatus(ghes);
return 0;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 792a0d9..ef725a9 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = 
{
[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
[ACPI_HEST_TYPE_GENERIC_ERROR] = 

Re: [PATCH V3 01/10] acpi: apei: read ack upon ghes record consumption

2016-10-13 Thread Baicar, Tyler

Hello Punit,

Thank you for the feedback! Responses below

On 10/12/2016 9:39 AM, Punit Agrawal wrote:

Hi Tyler,

A few comments below.

Tyler Baicar  writes:


A RAS (Reliability, Availability, Serviceability) controller
may be a separate processor running in parallel with OS
execution, and may generate error records for consumption by
the OS. If the RAS controller produces multiple error records,
then they may be overwritten before the OS has consumed them.

The Generic Hardware Error Source (GHES) v2 structure
introduces the capability for the OS to acknowledge the
consumption of the error record generated by the RAS
controller. A RAS controller supporting GHESv2 shall wait for
the acknowledgment before writing a new error record, thus
eliminating the race condition.

Signed-off-by: Jonathan (Zhixiong) Zhang 
Signed-off-by: Richard Ruigrok 
Signed-off-by: Tyler Baicar 
Signed-off-by: Naveen Kaje 
---
  drivers/acpi/apei/ghes.c | 41 +
  drivers/acpi/apei/hest.c |  7 +--
  include/acpi/ghes.h  |  1 +
  3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 60746ef..3021f0e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -45,6 +45,7 @@
  #include 
  #include 
  
+#include 

  #include 
  #include 
  #include 
@@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
*generic)
struct ghes *ghes;
unsigned int error_block_length;
int rc;
+   struct acpi_hest_header *hest_hdr;
  
  	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);

if (!ghes)
return ERR_PTR(-ENOMEM);
+
+   hest_hdr = (struct acpi_hest_header *)generic;
+   if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) {
+   ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic;
+   rc = apei_map_generic_address(
+   >generic_v2->read_ack_register);
+   if (rc)
+   goto err_unmap;
+   } else
+   ghes->generic_v2 = NULL;

Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already?
Yes, the documentation says kzalloc returns memory set to zero, so I 
will remove this else statement.

+
ghes->generic = generic;
rc = apei_map_generic_address(>error_status_address);
if (rc)
@@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
*generic)
  
  err_unmap:

apei_unmap_generic_address(>error_status_address);
+   if (ghes->generic_v2)
+   apei_unmap_generic_address(
+   >generic_v2->read_ack_register);
  err_free:
kfree(ghes);
return ERR_PTR(rc);
@@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
  {
kfree(ghes->estatus);
apei_unmap_generic_address(>generic->error_status_address);
+   if (ghes->generic_v2)
+   apei_unmap_generic_address(
+   >generic_v2->read_ack_register);
  }
  
  static inline int ghes_severity(int severity)

@@ -648,6 +667,22 @@ static void ghes_estatus_cache_add(
rcu_read_unlock();
  }
  
+static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)

+{
+   int rc;
+   u64 val = 0;
+
+   rc = apei_read(, _v2->read_ack_register);
+   if (rc)
+   return rc;
+   val &= generic_v2->read_ack_preserve <<
+   generic_v2->read_ack_register.bit_offset;
+   val |= generic_v2->read_ack_write;

Reading the spec, it is not clear whether you need the left shift
above.

Having said that, if you do it for read_ack_preserve, do you also need
to left shift read_ack_write by read_ack_register.bit_offset?
Good catch, it looks like the read_ack_write should also get this shift. 
I'm using a shift of 0 so I didn't catch this in testing :)

+   rc = apei_write(val, _v2->read_ack_register);
+
+   return rc;
+}
+
  static int ghes_proc(struct ghes *ghes)
  {
int rc;
@@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes)
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
}
ghes_do_proc(ghes, ghes->estatus);
+
+   if (ghes->generic_v2) {
+   rc = ghes_do_read_ack(ghes->generic_v2);
+   if (rc)
+   return rc;
+   }
  out:
ghes_clear_estatus(ghes);
return 0;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 792a0d9..ef725a9 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = 
{
[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+   [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
  };
  
  

Re: [PATCH V3 01/10] acpi: apei: read ack upon ghes record consumption

2016-10-12 Thread Punit Agrawal
Hi Tyler,

A few comments below.

Tyler Baicar  writes:

> A RAS (Reliability, Availability, Serviceability) controller
> may be a separate processor running in parallel with OS
> execution, and may generate error records for consumption by
> the OS. If the RAS controller produces multiple error records,
> then they may be overwritten before the OS has consumed them.
>
> The Generic Hardware Error Source (GHES) v2 structure
> introduces the capability for the OS to acknowledge the
> consumption of the error record generated by the RAS
> controller. A RAS controller supporting GHESv2 shall wait for
> the acknowledgment before writing a new error record, thus
> eliminating the race condition.
>
> Signed-off-by: Jonathan (Zhixiong) Zhang 
> Signed-off-by: Richard Ruigrok 
> Signed-off-by: Tyler Baicar 
> Signed-off-by: Naveen Kaje 
> ---
>  drivers/acpi/apei/ghes.c | 41 +
>  drivers/acpi/apei/hest.c |  7 +--
>  include/acpi/ghes.h  |  1 +
>  3 files changed, 47 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 60746ef..3021f0e 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -45,6 +45,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
> *generic)
>   struct ghes *ghes;
>   unsigned int error_block_length;
>   int rc;
> + struct acpi_hest_header *hest_hdr;
>  
>   ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>   if (!ghes)
>   return ERR_PTR(-ENOMEM);
> +
> + hest_hdr = (struct acpi_hest_header *)generic;
> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) {
> + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic;
> + rc = apei_map_generic_address(
> + >generic_v2->read_ack_register);
> + if (rc)
> + goto err_unmap;
> + } else
> + ghes->generic_v2 = NULL;

Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already?

> +
>   ghes->generic = generic;
>   rc = apei_map_generic_address(>error_status_address);
>   if (rc)
> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
> *generic)
>  
>  err_unmap:
>   apei_unmap_generic_address(>error_status_address);
> + if (ghes->generic_v2)
> + apei_unmap_generic_address(
> + >generic_v2->read_ack_register);
>  err_free:
>   kfree(ghes);
>   return ERR_PTR(rc);
> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
>  {
>   kfree(ghes->estatus);
>   apei_unmap_generic_address(>generic->error_status_address);
> + if (ghes->generic_v2)
> + apei_unmap_generic_address(
> + >generic_v2->read_ack_register);
>  }
>  
>  static inline int ghes_severity(int severity)
> @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add(
>   rcu_read_unlock();
>  }
>  
> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)
> +{
> + int rc;
> + u64 val = 0;
> +
> + rc = apei_read(, _v2->read_ack_register);
> + if (rc)
> + return rc;
> + val &= generic_v2->read_ack_preserve <<
> + generic_v2->read_ack_register.bit_offset;
> + val |= generic_v2->read_ack_write;

Reading the spec, it is not clear whether you need the left shift
above.

Having said that, if you do it for read_ack_preserve, do you also need
to left shift read_ack_write by read_ack_register.bit_offset?

> + rc = apei_write(val, _v2->read_ack_register);
> +
> + return rc;
> +}
> +
>  static int ghes_proc(struct ghes *ghes)
>  {
>   int rc;
> @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes)
>   ghes_estatus_cache_add(ghes->generic, ghes->estatus);
>   }
>   ghes_do_proc(ghes, ghes->estatus);
> +
> + if (ghes->generic_v2) {
> + rc = ghes_do_read_ack(ghes->generic_v2);
> + if (rc)
> + return rc;
> + }
>  out:
>   ghes_clear_estatus(ghes);
>   return 0;
> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
> index 792a0d9..ef725a9 100644
> --- a/drivers/acpi/apei/hest.c
> +++ b/drivers/acpi/apei/hest.c
> @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] 
> = {
>   [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
>   [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
>   [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
> + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
>  };
>  
>  static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
> @@ -146,7 +147,8 @@ static int __init 

Re: [PATCH V3 01/10] acpi: apei: read ack upon ghes record consumption

2016-10-12 Thread Punit Agrawal
Hi Tyler,

A few comments below.

Tyler Baicar  writes:

> A RAS (Reliability, Availability, Serviceability) controller
> may be a separate processor running in parallel with OS
> execution, and may generate error records for consumption by
> the OS. If the RAS controller produces multiple error records,
> then they may be overwritten before the OS has consumed them.
>
> The Generic Hardware Error Source (GHES) v2 structure
> introduces the capability for the OS to acknowledge the
> consumption of the error record generated by the RAS
> controller. A RAS controller supporting GHESv2 shall wait for
> the acknowledgment before writing a new error record, thus
> eliminating the race condition.
>
> Signed-off-by: Jonathan (Zhixiong) Zhang 
> Signed-off-by: Richard Ruigrok 
> Signed-off-by: Tyler Baicar 
> Signed-off-by: Naveen Kaje 
> ---
>  drivers/acpi/apei/ghes.c | 41 +
>  drivers/acpi/apei/hest.c |  7 +--
>  include/acpi/ghes.h  |  1 +
>  3 files changed, 47 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 60746ef..3021f0e 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -45,6 +45,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
> *generic)
>   struct ghes *ghes;
>   unsigned int error_block_length;
>   int rc;
> + struct acpi_hest_header *hest_hdr;
>  
>   ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>   if (!ghes)
>   return ERR_PTR(-ENOMEM);
> +
> + hest_hdr = (struct acpi_hest_header *)generic;
> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) {
> + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic;
> + rc = apei_map_generic_address(
> + >generic_v2->read_ack_register);
> + if (rc)
> + goto err_unmap;
> + } else
> + ghes->generic_v2 = NULL;

Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already?

> +
>   ghes->generic = generic;
>   rc = apei_map_generic_address(>error_status_address);
>   if (rc)
> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic 
> *generic)
>  
>  err_unmap:
>   apei_unmap_generic_address(>error_status_address);
> + if (ghes->generic_v2)
> + apei_unmap_generic_address(
> + >generic_v2->read_ack_register);
>  err_free:
>   kfree(ghes);
>   return ERR_PTR(rc);
> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
>  {
>   kfree(ghes->estatus);
>   apei_unmap_generic_address(>generic->error_status_address);
> + if (ghes->generic_v2)
> + apei_unmap_generic_address(
> + >generic_v2->read_ack_register);
>  }
>  
>  static inline int ghes_severity(int severity)
> @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add(
>   rcu_read_unlock();
>  }
>  
> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)
> +{
> + int rc;
> + u64 val = 0;
> +
> + rc = apei_read(, _v2->read_ack_register);
> + if (rc)
> + return rc;
> + val &= generic_v2->read_ack_preserve <<
> + generic_v2->read_ack_register.bit_offset;
> + val |= generic_v2->read_ack_write;

Reading the spec, it is not clear whether you need the left shift
above.

Having said that, if you do it for read_ack_preserve, do you also need
to left shift read_ack_write by read_ack_register.bit_offset?

> + rc = apei_write(val, _v2->read_ack_register);
> +
> + return rc;
> +}
> +
>  static int ghes_proc(struct ghes *ghes)
>  {
>   int rc;
> @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes)
>   ghes_estatus_cache_add(ghes->generic, ghes->estatus);
>   }
>   ghes_do_proc(ghes, ghes->estatus);
> +
> + if (ghes->generic_v2) {
> + rc = ghes_do_read_ack(ghes->generic_v2);
> + if (rc)
> + return rc;
> + }
>  out:
>   ghes_clear_estatus(ghes);
>   return 0;
> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
> index 792a0d9..ef725a9 100644
> --- a/drivers/acpi/apei/hest.c
> +++ b/drivers/acpi/apei/hest.c
> @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] 
> = {
>   [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
>   [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
>   [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
> + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
>  };
>  
>  static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
> @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct 
> acpi_hest_header *hest_hdr, void
>  {
>   int *count = data;
>  
> - if (hest_hdr->type