Re: [Patch v2] efi: cper: Add support for printing Firmware Error Record Reference

2020-05-12 Thread Punit Agrawal
Ard Biesheuvel  writes:

> On Tue, 12 May 2020 at 06:55, Punit Agrawal
>  wrote:
>>
>> While debugging a boot failure, the following unknown error record was
>> seen in the boot logs.
>>
>> <...>
>> BERT: Error records from previous boot:
>> [Hardware Error]: event severity: fatal
>> [Hardware Error]:  Error 0, type: fatal
>> [Hardware Error]:   section type: unknown, 
>> 81212a96-09ed-4996-9471-8d729c8e69ed
>> [Hardware Error]:   section length: 0x290
>> [Hardware Error]:   : 0001   00020002  
>> 
>> [Hardware Error]:   0010: 00020002 001f 0320   
>>  ...
>> [Hardware Error]:   0020:      
>> 
>> [Hardware Error]:   0030:      
>> 
>> <...>
>>
>> On further investigation, it was found that the error record with
>> UUID (81212a96-09ed-4996-9471-8d729c8e69ed) has been defined in the
>> UEFI Specification at least since v2.4 and has recently had additional
>> fields defined in v2.7 Section N.2.10 Firmware Error Record Reference.
>>
>> Add support for parsing and printing the defined fields to give users
>> a chance to figure out what went wrong.
>>
>> Signed-off-by: Punit Agrawal 
>> Cc: Ard Biesheuvel 
>> Cc: "Rafael J. Wysocki" 
>> Cc: Borislav Petkov 
>> Cc: James Morse 
>> Cc: linux-a...@vger.kernel.org
>> Cc: linux-...@vger.kernel.org
>> ---
>> Hi Ard,
>>
>> I've updated the patch based on your feedback.
>>
>> As you noted, some aspects of the spec make it a bit tricky to support
>> all revisions in a nice way (e.g., size check) but this version should
>> fix existing issues.
>>
>> Thanks,
>> Punit
>>
>> v1[0] -> v2:
>> * Simplified error record structure definition
>> * Fixed size check
>> * Added comment to clarify offset calculation for dumped data
>> * Style fixes for multiline if blocks
>
> Thanks. I will queue this as a fix.

Thanks!

Just for my understanding - are you planning to send this for v5.7 or
v5.8? There's no rush, so I am fine either ways.

[...]



Re: [Patch v2] efi: cper: Add support for printing Firmware Error Record Reference

2020-05-12 Thread Ard Biesheuvel
On Tue, 12 May 2020 at 06:55, Punit Agrawal
 wrote:
>
> While debugging a boot failure, the following unknown error record was
> seen in the boot logs.
>
> <...>
> BERT: Error records from previous boot:
> [Hardware Error]: event severity: fatal
> [Hardware Error]:  Error 0, type: fatal
> [Hardware Error]:   section type: unknown, 
> 81212a96-09ed-4996-9471-8d729c8e69ed
> [Hardware Error]:   section length: 0x290
> [Hardware Error]:   : 0001   00020002  
> 
> [Hardware Error]:   0010: 00020002 001f 0320   
>  ...
> [Hardware Error]:   0020:      
> 
> [Hardware Error]:   0030:      
> 
> <...>
>
> On further investigation, it was found that the error record with
> UUID (81212a96-09ed-4996-9471-8d729c8e69ed) has been defined in the
> UEFI Specification at least since v2.4 and has recently had additional
> fields defined in v2.7 Section N.2.10 Firmware Error Record Reference.
>
> Add support for parsing and printing the defined fields to give users
> a chance to figure out what went wrong.
>
> Signed-off-by: Punit Agrawal 
> Cc: Ard Biesheuvel 
> Cc: "Rafael J. Wysocki" 
> Cc: Borislav Petkov 
> Cc: James Morse 
> Cc: linux-a...@vger.kernel.org
> Cc: linux-...@vger.kernel.org
> ---
> Hi Ard,
>
> I've updated the patch based on your feedback.
>
> As you noted, some aspects of the spec make it a bit tricky to support
> all revisions in a nice way (e.g., size check) but this version should
> fix existing issues.
>
> Thanks,
> Punit
>
> v1[0] -> v2:
> * Simplified error record structure definition
> * Fixed size check
> * Added comment to clarify offset calculation for dumped data
> * Style fixes for multiline if blocks
>

Thanks. I will queue this as a fix.


> [0] 
> https://lkml.kernel.org/lkml/20200427085242.2380614-1-punit1.agra...@toshiba.co.jp/
> ---
>  drivers/firmware/efi/cper.c | 62 +
>  include/linux/cper.h|  9 ++
>  2 files changed, 71 insertions(+)
>
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index 9d2512913d25..f564e15fbc7e 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -407,6 +407,58 @@ static void cper_print_pcie(const char *pfx, const 
> struct cper_sec_pcie *pcie,
> }
>  }
>
> +static const char * const fw_err_rec_type_strs[] = {
> +   "IPF SAL Error Record",
> +   "SOC Firmware Error Record Type1 (Legacy CrashLog Support)",
> +   "SOC Firmware Error Record Type2",
> +};
> +
> +static void cper_print_fw_err(const char *pfx,
> + struct acpi_hest_generic_data *gdata,
> + const struct cper_sec_fw_err_rec_ref *fw_err)
> +{
> +   void *buf = acpi_hest_get_payload(gdata);
> +   u32 offset, length = gdata->error_data_length;
> +
> +   printk("%s""Firmware Error Record Type: %s\n", pfx,
> +  fw_err->record_type < ARRAY_SIZE(fw_err_rec_type_strs) ?
> +  fw_err_rec_type_strs[fw_err->record_type] : "unknown");
> +   printk("%s""Revision: %d\n", pfx, fw_err->revision);
> +
> +   /* Record Type based on UEFI 2.7 */
> +   if (fw_err->revision == 0) {
> +   printk("%s""Record Identifier: %08llx\n", pfx,
> +  fw_err->record_identifier);
> +   } else if (fw_err->revision == 2) {
> +   printk("%s""Record Identifier: %pUl\n", pfx,
> +  &fw_err->record_identifier_guid);
> +   }
> +
> +   /*
> +* The FW error record may contain trailing data beyond the
> +* structure defined by the specification. As the fields
> +* defined (and hence the offset of any trailing data) vary
> +* with the revision, set the offset to account for this
> +* variation.
> +*/
> +   if (fw_err->revision == 0) {
> +   /* record_identifier_guid not defined */
> +   offset = offsetof(struct cper_sec_fw_err_rec_ref,
> + record_identifier_guid);
> +   } else if (fw_err->revision == 1) {
> +   /* record_identifier not defined */
> +   offset = offsetof(struct cper_sec_fw_err_rec_ref,
> + record_identifier);
> +   } else {
> +   offset = sizeof(*fw_err);
> +   }
> +
> +   buf += offset;
> +   length -= offset;
> +
> +   print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, buf, length, true);
> +}
> +
>  static void cper_print_tstamp(const char *pfx,
>struct acpi_hest_generic_data_v300 *gdata)
>  {
> @@ -494,6 +546,16 @@ cper_estatus_print_section(const char *pfx, struct 
> acpi_hest_generic_data *gdata
> else
> goto err_section_too_small;
>  #endif

[Patch v2] efi: cper: Add support for printing Firmware Error Record Reference

2020-05-11 Thread Punit Agrawal
While debugging a boot failure, the following unknown error record was
seen in the boot logs.

<...>
BERT: Error records from previous boot:
[Hardware Error]: event severity: fatal
[Hardware Error]:  Error 0, type: fatal
[Hardware Error]:   section type: unknown, 
81212a96-09ed-4996-9471-8d729c8e69ed
[Hardware Error]:   section length: 0x290
[Hardware Error]:   : 0001   00020002  

[Hardware Error]:   0010: 00020002 001f 0320    
...
[Hardware Error]:   0020:      

[Hardware Error]:   0030:      

<...>

On further investigation, it was found that the error record with
UUID (81212a96-09ed-4996-9471-8d729c8e69ed) has been defined in the
UEFI Specification at least since v2.4 and has recently had additional
fields defined in v2.7 Section N.2.10 Firmware Error Record Reference.

Add support for parsing and printing the defined fields to give users
a chance to figure out what went wrong.

Signed-off-by: Punit Agrawal 
Cc: Ard Biesheuvel 
Cc: "Rafael J. Wysocki" 
Cc: Borislav Petkov 
Cc: James Morse 
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
---
Hi Ard,

I've updated the patch based on your feedback.

As you noted, some aspects of the spec make it a bit tricky to support
all revisions in a nice way (e.g., size check) but this version should
fix existing issues.

Thanks,
Punit

v1[0] -> v2:
* Simplified error record structure definition
* Fixed size check
* Added comment to clarify offset calculation for dumped data
* Style fixes for multiline if blocks

[0] 
https://lkml.kernel.org/lkml/20200427085242.2380614-1-punit1.agra...@toshiba.co.jp/
---
 drivers/firmware/efi/cper.c | 62 +
 include/linux/cper.h|  9 ++
 2 files changed, 71 insertions(+)

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 9d2512913d25..f564e15fbc7e 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -407,6 +407,58 @@ static void cper_print_pcie(const char *pfx, const struct 
cper_sec_pcie *pcie,
}
 }
 
+static const char * const fw_err_rec_type_strs[] = {
+   "IPF SAL Error Record",
+   "SOC Firmware Error Record Type1 (Legacy CrashLog Support)",
+   "SOC Firmware Error Record Type2",
+};
+
+static void cper_print_fw_err(const char *pfx,
+ struct acpi_hest_generic_data *gdata,
+ const struct cper_sec_fw_err_rec_ref *fw_err)
+{
+   void *buf = acpi_hest_get_payload(gdata);
+   u32 offset, length = gdata->error_data_length;
+
+   printk("%s""Firmware Error Record Type: %s\n", pfx,
+  fw_err->record_type < ARRAY_SIZE(fw_err_rec_type_strs) ?
+  fw_err_rec_type_strs[fw_err->record_type] : "unknown");
+   printk("%s""Revision: %d\n", pfx, fw_err->revision);
+
+   /* Record Type based on UEFI 2.7 */
+   if (fw_err->revision == 0) {
+   printk("%s""Record Identifier: %08llx\n", pfx,
+  fw_err->record_identifier);
+   } else if (fw_err->revision == 2) {
+   printk("%s""Record Identifier: %pUl\n", pfx,
+  &fw_err->record_identifier_guid);
+   }
+
+   /*
+* The FW error record may contain trailing data beyond the
+* structure defined by the specification. As the fields
+* defined (and hence the offset of any trailing data) vary
+* with the revision, set the offset to account for this
+* variation.
+*/
+   if (fw_err->revision == 0) {
+   /* record_identifier_guid not defined */
+   offset = offsetof(struct cper_sec_fw_err_rec_ref,
+ record_identifier_guid);
+   } else if (fw_err->revision == 1) {
+   /* record_identifier not defined */
+   offset = offsetof(struct cper_sec_fw_err_rec_ref,
+ record_identifier);
+   } else {
+   offset = sizeof(*fw_err);
+   }
+
+   buf += offset;
+   length -= offset;
+
+   print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, buf, length, true);
+}
+
 static void cper_print_tstamp(const char *pfx,
   struct acpi_hest_generic_data_v300 *gdata)
 {
@@ -494,6 +546,16 @@ cper_estatus_print_section(const char *pfx, struct 
acpi_hest_generic_data *gdata
else
goto err_section_too_small;
 #endif
+   } else if (guid_equal(sec_type, &CPER_SEC_FW_ERR_REC_REF)) {
+   struct cper_sec_fw_err_rec_ref *fw_err = 
acpi_hest_get_payload(gdata);
+
+   printk("%ssection_type: Firmware Error Record Reference\n",
+  newpfx);
+   /* The minimal FW Error Record contains 16 bytes */
+