Re: [RESEND PATCH v6 8/8] hw/mem/cxl_type3: Add CXL RAS Error Injection Support.

2023-03-14 Thread Jonathan Cameron via



> > diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> > index 7e5ad65c1d..d589f78202 100644
> > --- a/include/hw/cxl/cxl_device.h
> > +++ b/include/hw/cxl/cxl_device.h
> > @@ -232,6 +232,14 @@ REG64(CXL_MEM_DEV_STS, 0)
> >  FIELD(CXL_MEM_DEV_STS, MBOX_READY, 4, 1)
> >  FIELD(CXL_MEM_DEV_STS, RESET_NEEDED, 5, 3)
> >  
> > +typedef struct CXLError {
> > +QTAILQ_ENTRY(CXLError) node;
> > +int type; /* Error code as per FE definition */
> > +uint32_t header[32];  
> Instead of using 32 here, would it be better to use
> CXL_RAS_ERR_HEADER_NUM?

Yes, that would be better.  Please send a patch.

> > +} CXLError;



Re: [RESEND PATCH v6 8/8] hw/mem/cxl_type3: Add CXL RAS Error Injection Support.

2023-03-07 Thread Michael S. Tsirkin
On Tue, Mar 07, 2023 at 07:26:41PM +, Fan Ni wrote:
> > +typedef struct CXLError {
> > +QTAILQ_ENTRY(CXLError) node;
> > +int type; /* Error code as per FE definition */
> > +uint32_t header[32];
> Instead of using 32 here, would it be better to use
> CXL_RAS_ERR_HEADER_NUM?

merged as is, fix on top pls.

-- 
MST




Re: [RESEND PATCH v6 8/8] hw/mem/cxl_type3: Add CXL RAS Error Injection Support.

2023-03-07 Thread Fan Ni
On Thu, Mar 02, 2023 at 01:37:09PM +, Jonathan Cameron wrote:
> CXL uses PCI AER Internal errors to signal to the host that an error has
> occurred. The host can then read more detailed status from the CXL RAS
> capability.
> 
> For uncorrectable errors: support multiple injection in one operation
> as this is needed to reliably test multiple header logging support in an
> OS. The equivalent feature doesn't exist for correctable errors, so only
> one error need be injected at a time.
> 
> Note:
>  - Header content needs to be manually specified in a fashion that
>matches the specification for what can be in the header for each
>error type.
> 
> Injection via QMP:
> { "execute": "qmp_capabilities" }
> ...
> { "execute": "cxl-inject-uncorrectable-errors",
>   "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "errors": [
> {
> "type": "cache-address-parity",
> "header": [ 3, 4]
> },
> {
> "type": "cache-data-parity",
> "header": 
> [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
> },
> {
> "type": "internal",
> "header": [ 1, 2, 4]
> }
> ]
>   }}
> ...
> { "execute": "cxl-inject-correctable-error",
> "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "type": "physical"
> } }
> 
> Signed-off-by: Jonathan Cameron 
> ---

Reviewed-by: Fan Ni 

One minor thing, see below in "typedef struct CXLError".

> v6: (Thanks to Philippe Mathieu-Daudé)
> - Add Since entries in cxl.json
> - Add error prints in the stub functions so that if they are called without
>   CONFIG_CXL_MEM_DEVICE then we get a useful print rather than just silently
>   eating them.
> 
> ---
>  hw/cxl/cxl-component-utils.c   |   4 +-
>  hw/mem/cxl_type3.c | 281 +
>  hw/mem/cxl_type3_stubs.c   |  17 ++
>  hw/mem/meson.build |   2 +
>  include/hw/cxl/cxl_component.h |  26 +++
>  include/hw/cxl/cxl_device.h|  11 ++
>  qapi/cxl.json  | 128 +++
>  qapi/meson.build   |   1 +
>  qapi/qapi-schema.json  |   1 +
>  9 files changed, 470 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
> index 737b4764b9..b665d4f565 100644
> --- a/hw/cxl/cxl-component-utils.c
> +++ b/hw/cxl/cxl-component-utils.c
> @@ -142,16 +142,18 @@ static void ras_init_common(uint32_t *reg_state, 
> uint32_t *write_msk)
>   * be handled as RO.
>   */
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, 0);
> +stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_STATUS, 0x1cfff);
>  /* Bits 12-13 and 17-31 reserved in CXL 2.0 */
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
>  stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
>  stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
>  stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, 0);
> +stl_le_p(write_msk + R_CXL_RAS_COR_ERR_STATUS, 0x7f);
>  stl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK, 0x7f);
>  stl_le_p(write_msk + R_CXL_RAS_COR_ERR_MASK, 0x7f);
>  /* CXL switches and devices must set */
> -stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x00);
> +stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x200);
>  }
>  
>  static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk,
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index 6cdd988d1d..abe60b362c 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -1,6 +1,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qemu/error-report.h"
> +#include "qapi/qapi-commands-cxl.h"
>  #include "hw/mem/memory-device.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/pci/pci.h"
> @@ -323,6 +324,66 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int 
> which)
>  ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
>  }
>  
> +static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
> +{
> +switch (qmp_err) {
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_BE_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_ECC:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_ECC;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_BE_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_ECC:
> +return 

Re: [RESEND PATCH v6 8/8] hw/mem/cxl_type3: Add CXL RAS Error Injection Support.

2023-03-07 Thread Michael S. Tsirkin
On Thu, Mar 02, 2023 at 01:37:09PM +, Jonathan Cameron wrote:
> CXL uses PCI AER Internal errors to signal to the host that an error has
> occurred. The host can then read more detailed status from the CXL RAS
> capability.
> 
> For uncorrectable errors: support multiple injection in one operation
> as this is needed to reliably test multiple header logging support in an
> OS. The equivalent feature doesn't exist for correctable errors, so only
> one error need be injected at a time.
> 
> Note:
>  - Header content needs to be manually specified in a fashion that
>matches the specification for what can be in the header for each
>error type.
> 
> Injection via QMP:
> { "execute": "qmp_capabilities" }
> ...
> { "execute": "cxl-inject-uncorrectable-errors",
>   "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "errors": [
> {
> "type": "cache-address-parity",
> "header": [ 3, 4]
> },
> {
> "type": "cache-data-parity",
> "header": 
> [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
> },
> {
> "type": "internal",
> "header": [ 1, 2, 4]
> }
> ]
>   }}
> ...
> { "execute": "cxl-inject-correctable-error",
> "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "type": "physical"
> } }
> 
> Signed-off-by: Jonathan Cameron 

I will assume the silence of QAPI maintainers implies acceptance.

> ---
> v6: (Thanks to Philippe Mathieu-Daudé)
> - Add Since entries in cxl.json
> - Add error prints in the stub functions so that if they are called without
>   CONFIG_CXL_MEM_DEVICE then we get a useful print rather than just silently
>   eating them.
> 
> ---
>  hw/cxl/cxl-component-utils.c   |   4 +-
>  hw/mem/cxl_type3.c | 281 +
>  hw/mem/cxl_type3_stubs.c   |  17 ++
>  hw/mem/meson.build |   2 +
>  include/hw/cxl/cxl_component.h |  26 +++
>  include/hw/cxl/cxl_device.h|  11 ++
>  qapi/cxl.json  | 128 +++
>  qapi/meson.build   |   1 +
>  qapi/qapi-schema.json  |   1 +
>  9 files changed, 470 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
> index 737b4764b9..b665d4f565 100644
> --- a/hw/cxl/cxl-component-utils.c
> +++ b/hw/cxl/cxl-component-utils.c
> @@ -142,16 +142,18 @@ static void ras_init_common(uint32_t *reg_state, 
> uint32_t *write_msk)
>   * be handled as RO.
>   */
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, 0);
> +stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_STATUS, 0x1cfff);
>  /* Bits 12-13 and 17-31 reserved in CXL 2.0 */
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
>  stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
>  stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
>  stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
>  stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, 0);
> +stl_le_p(write_msk + R_CXL_RAS_COR_ERR_STATUS, 0x7f);
>  stl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK, 0x7f);
>  stl_le_p(write_msk + R_CXL_RAS_COR_ERR_MASK, 0x7f);
>  /* CXL switches and devices must set */
> -stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x00);
> +stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x200);
>  }
>  
>  static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk,
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index 6cdd988d1d..abe60b362c 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -1,6 +1,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qemu/error-report.h"
> +#include "qapi/qapi-commands-cxl.h"
>  #include "hw/mem/memory-device.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/pci/pci.h"
> @@ -323,6 +324,66 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int 
> which)
>  ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
>  }
>  
> +static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
> +{
> +switch (qmp_err) {
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_BE_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_ECC:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_ECC;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_BE_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_ECC:
> +return CXL_RAS_UNC_ERR_MEM_DATA_ECC;
> +case