On 9/1/2025 4:25 AM, Cédric Le Goater wrote:
On 8/25/25 23:24, Farhan Ali wrote:
Add an s390x specific callback for vfio error handling. For s390x pci
devices,
we have platform specific error information. We need to retrieve this
error
information for passthrough devices. This is done via a memory region
which
exposes that information.
Once this error information is retrieved we can then inject an error
into
the guest, and let the guest drive the recovery.
Signed-off-by: Farhan Ali <al...@linux.ibm.com>
---
hw/s390x/s390-pci-bus.c | 5 +++
hw/s390x/s390-pci-vfio.c | 76 ++++++++++++++++++++++++++++++++
include/hw/s390x/s390-pci-bus.h | 1 +
include/hw/s390x/s390-pci-vfio.h | 2 +
4 files changed, 84 insertions(+)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index f87d2748b6..af42eb9938 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -158,6 +158,8 @@ static void
s390_pci_perform_unplug(S390PCIBusDevice *pbdev)
{
HotplugHandler *hotplug_ctrl;
+ qemu_mutex_destroy(&pbdev->err_handler_lock);
+
if (pbdev->pft == ZPCI_PFT_ISM) {
notifier_remove(&pbdev->shutdown_notifier);
}
@@ -1140,6 +1142,7 @@ static void s390_pcihost_plug(HotplugHandler
*hotplug_dev, DeviceState *dev,
pbdev->iommu->pbdev = pbdev;
pbdev->state = ZPCI_FS_DISABLED;
set_pbdev_info(pbdev);
+ qemu_mutex_init(&pbdev->err_handler_lock);
if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
/*
@@ -1164,6 +1167,8 @@ static void s390_pcihost_plug(HotplugHandler
*hotplug_dev, DeviceState *dev,
pbdev->iommu->dma_limit = s390_pci_start_dma_count(s,
pbdev);
/* Fill in CLP information passed via the vfio region */
s390_pci_get_clp_info(pbdev);
+ /* Setup error handler for error recovery */
+ s390_pci_setup_err_handler(pbdev);
This can fail. Please add an 'Error **' parameter and change the returned
value to bool.
I wanted to avoid hard failing here as we can have mismatch in kernel
and QEMU support for the feature. For example we can have a newer QEMU
version with the feature running on an older kernel. So wanted to treat
any error in setting up the error handler would be more of an info/warn
message.
if (!pbdev->interp) {
/* Do vfio passthrough but intercept for I/O */
pbdev->fh |= FH_SHM_VFIO;
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index aaf91319b4..87ecd06a81 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -10,6 +10,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/error-report.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
@@ -103,6 +104,60 @@ void s390_pci_end_dma_count(S390pciState *s,
S390PCIDMACount *cnt)
}
}
+static int s390_pci_get_feature_err(VFIOPCIDevice *vfio_pci,
+ struct
vfio_device_feature_zpci_err *err)
Please add an 'Error **' parameter and change the returned value to bool.
Ack, will change this.
+{
+ int ret;
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+ sizeof(struct
vfio_device_feature_zpci_err),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct
vfio_device_feature *)buf;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_GET |
VFIO_DEVICE_FEATURE_ZPCI_ERROR;
+ ret = vfio_pci->vbasedev.io_ops->device_feature(&vfio_pci->vbasedev,
+ feature);
Please introduce vfio helpers to hide the internal indirection :
->vbasedev.io_ops->device_feature(...)
Should we define the helpers in include/hw/vfio/vfio-device.h and should
we define a generic helper like vfio_device_get_feature(VFIODevice
*vdev, struct vfio_device_feature *feat) ?
+
+ if (ret) {
+ error_report("Failed feature get
VFIO_DEVICE_FEATURE_ZPCI_ERROR"
+ " (rc=%d)", ret);
+ return ret;
+ }
+
+ memcpy(err, (struct vfio_device_feature_zpci_err *) feature->data,
+ sizeof(struct vfio_device_feature_zpci_err));
+ return 0;
+}
+
+static void s390_pci_err_handler(VFIOPCIDevice *vfio_pci)
+{
+ S390PCIBusDevice *pbdev;
+ struct vfio_device_feature_zpci_err err;
+ int ret;
+
+ pbdev = s390_pci_find_dev_by_target(s390_get_phb(),
+ DEVICE(&vfio_pci->pdev)->id);
+
+ QEMU_LOCK_GUARD(&pbdev->err_handler_lock);
+
+ ret = s390_pci_get_feature_err(vfio_pci, &err);
+ if (ret) {
+ return;
+ }
+
+ pbdev->state = ZPCI_FS_ERROR;
+ s390_pci_generate_error_event(err.pec, pbdev->fh, pbdev->fid, 0,
0);
+
+ while (err.pending_errors) {
+ ret = s390_pci_get_feature_err(vfio_pci, &err);
+ if (ret) {
+ return;
+ }
+ s390_pci_generate_error_event(err.pec, pbdev->fh,
pbdev->fid, 0, 0);
+ }
+ return;
+}
+
static void s390_pci_read_base(S390PCIBusDevice *pbdev,
struct vfio_device_info *info)
{
@@ -369,3 +424,24 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev)
s390_pci_read_util(pbdev, info);
s390_pci_read_pfip(pbdev, info);
}
+
+void s390_pci_setup_err_handler(S390PCIBusDevice *pbdev)
+{
+ int ret;
+ VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev,
VFIOPCIDevice, pdev);
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct
vfio_device_feature *)buf;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_PROBE |
VFIO_DEVICE_FEATURE_ZPCI_ERROR;
+
+ ret = vfio_pci->vbasedev.io_ops->device_feature(&vfio_pci->vbasedev,
+ feature);
Please introduce vfio helpers to hide the internal indirection :
->vbasedev.io_ops->device_feature(...)
+
+ if (ret) {
Shouldn't we test the return value to decide if the error is
an unimplemented feature or an unexpected error ?
Yeah, I think it makes sense separate out error for unimplemented
feature (ENOTTY) vs any other unexpected error. Will change this.
Thanks,
C.
+ info_report("Automated error recovery not available for
passthrough device");
+ return;
+ }
+ vfio_pci->arch_err_handler = s390_pci_err_handler;
+}
diff --git a/include/hw/s390x/s390-pci-bus.h
b/include/hw/s390x/s390-pci-bus.h
index 04944d4fed..3795e0bbfc 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -364,6 +364,7 @@ struct S390PCIBusDevice {
bool forwarding_assist;
bool aif;
bool rtr_avail;
+ QemuMutex err_handler_lock;
QTAILQ_ENTRY(S390PCIBusDevice) link;
};
diff --git a/include/hw/s390x/s390-pci-vfio.h
b/include/hw/s390x/s390-pci-vfio.h
index ae1b126ff7..66b274293c 100644
--- a/include/hw/s390x/s390-pci-vfio.h
+++ b/include/hw/s390x/s390-pci-vfio.h
@@ -22,6 +22,7 @@ S390PCIDMACount
*s390_pci_start_dma_count(S390pciState *s,
void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh);
void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
+void s390_pci_setup_err_handler(S390PCIBusDevice *pbdev);
#else
static inline bool s390_pci_update_dma_avail(int fd, unsigned int
*avail)
{
@@ -39,6 +40,7 @@ static inline bool
s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh)
return false;
}
static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { }
+static inline void s390_pci_setup_err_handler(S390PCIBusDevice
*pbdev) { }
#endif
#endif