date:20220216

[PATCH v6 15/19] vfio-user: handle device interrupts

2022-02-16 Thread Jagannathan Raman

Forward remote device's interrupts to the guest

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 include/hw/pci/pci.h  |   6 ++
 include/hw/remote/vfio-user-obj.h |   6 ++
 hw/pci/msi.c  |  13 +++-
 hw/pci/msix.c |  12 +++-
 hw/remote/machine.c   |  11 +--
 hw/remote/vfio-user-obj.c | 107 ++
 stubs/vfio-user-obj.c |   6 ++
 MAINTAINERS   |   1 +
 hw/remote/trace-events|   1 +
 stubs/meson.build |   1 +
 10 files changed, 158 insertions(+), 6 deletions(-)
 create mode 100644 include/hw/remote/vfio-user-obj.h
 create mode 100644 stubs/vfio-user-obj.c

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c3f3c90473..d42d526a48 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
 typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
 pcibus_t addr, pcibus_t size, int type);
 typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
+typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
+typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
 
 typedef struct PCIIORegion {
 pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
@@ -323,6 +325,10 @@ struct PCIDevice {
 /* Space to store MSIX table & pending bit array */
 uint8_t *msix_table;
 uint8_t *msix_pba;
+
+PCIMSINotify *msi_notify;
+PCIMSIxNotify *msix_notify;
+
 /* MemoryRegion container for msix exclusive BAR setup */
 MemoryRegion msix_exclusive_bar;
 /* Memory Regions for MSIX table and pending bit entries. */
diff --git a/include/hw/remote/vfio-user-obj.h 
b/include/hw/remote/vfio-user-obj.h
new file mode 100644
index 0000000000..87ab78b875
--- /dev/null
+++ b/include/hw/remote/vfio-user-obj.h
@@ -0,0 +1,6 @@
+#ifndef VFIO_USER_OBJ_H
+#define VFIO_USER_OBJ_H
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus);
+
+#endif
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 47d2b0f33c..93f5e400cc 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -51,6 +51,8 @@
  */
 bool msi_nonbroken;
 
+static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
+
 /* If we get rid of cap allocator, we won't need this. */
 static inline uint8_t msi_cap_sizeof(uint16_t flags)
 {
@@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
 dev->msi_cap = config_offset;
 dev->cap_present |= QEMU_PCI_CAP_MSI;
 
+dev->msi_notify = pci_msi_notify;
+
 pci_set_word(dev->config + msi_flags_off(dev), flags);
 pci_set_word(dev->wmask + msi_flags_off(dev),
  PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
@@ -307,7 +311,7 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int 
vector)
 return mask & (1U << vector);
 }
 
-void msi_notify(PCIDevice *dev, unsigned int vector)
+static void pci_msi_notify(PCIDevice *dev, unsigned int vector)
 {
 uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
 bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
@@ -332,6 +336,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
 msi_send_message(dev, msg);
 }
 
+void msi_notify(PCIDevice *dev, unsigned int vector)
+{
+if (dev->msi_notify) {
+dev->msi_notify(dev, vector);
+}
+}
+
 void msi_send_message(PCIDevice *dev, MSIMessage msg)
 {
 MemTxAttrs attrs = {};
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ae9331cd0b..1c71e67f53 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -31,6 +31,8 @@
 #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
 #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
 
+static void pci_msix_notify(PCIDevice *dev, unsigned vector);
+
 MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
 {
 uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
@@ -334,6 +336,7 @@ int msix_init(struct PCIDevice *dev, unsigned short 
nentries,
 dev->msix_table = g_malloc0(table_size);
 dev->msix_pba = g_malloc0(pba_size);
 dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
+dev->msix_notify = pci_msix_notify;
 
 msix_mask_all(dev, nentries);
 
@@ -485,7 +488,7 @@ int msix_enabled(PCIDevice *dev)
 }
 
 /* Send an MSI-X message */
-void msix_notify(PCIDevice *dev, unsigned vector)
+static void pci_msix_notify(PCIDevice *dev, unsigned vector)
 {
 MSIMessage msg;
 
@@ -503,6 +506,13 @@ void msix_notify(PCIDevice *dev, unsigned vector)
 msi_send_message(dev, msg);
 }
 
+void msix_notify(PCIDevice *dev, unsigned vector)
+{
+if (dev->msix_notify) {
+dev->msix_notify(dev, vector);
+}
+}
+
 void msix_reset(PCIDevice *dev)
 {
 if (!msix_present(dev)) {
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index db4ae30710..a8b4a3aef3 100644
--- a/hw/remote/machine.c
+++

[PATCH v6 05/19] remote/machine: add vfio-user property

2022-02-16 Thread Jagannathan Raman

Add vfio-user to x-remote machine. It is a boolean, which indicates if
the machine supports vfio-user protocol. The machine configures the bus
differently for vfio-user and multiprocess protocols, so this property
informs it on how to configure the bus.

This property should be short lived. Once vfio-user fully replaces
multiprocess, this property could be removed.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 include/hw/remote/machine.h |  2 ++
 hw/remote/machine.c | 23 +++
 2 files changed, 25 insertions(+)

diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
index 2a2a33c4b2..8d0fa98d33 100644
--- a/include/hw/remote/machine.h
+++ b/include/hw/remote/machine.h
@@ -22,6 +22,8 @@ struct RemoteMachineState {
 
 RemotePCIHost *host;
 RemoteIOHubState iohub;
+
+bool vfio_user;
 };
 
 /* Used to pass to co-routine device and ioc. */
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index 0c5bd4f923..a9a75e170f 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -59,6 +59,25 @@ static void remote_machine_init(MachineState *machine)
 qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
 }
 
+static bool remote_machine_get_vfio_user(Object *obj, Error **errp)
+{
+RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+return s->vfio_user;
+}
+
+static void remote_machine_set_vfio_user(Object *obj, bool value, Error **errp)
+{
+RemoteMachineState *s = REMOTE_MACHINE(obj);
+
+if (phase_check(PHASE_MACHINE_CREATED)) {
+error_setg(errp, "Error enabling vfio-user - machine already created");
+return;
+}
+
+s->vfio_user = value;
+}
+
 static void remote_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
@@ -68,6 +87,10 @@ static void remote_machine_class_init(ObjectClass *oc, void 
*data)
 mc->desc = "Experimental remote machine";
 
 hc->unplug = qdev_simple_device_unplug_cb;
+
+object_class_property_add_bool(oc, "vfio-user",
+   remote_machine_get_vfio_user,
+   remote_machine_set_vfio_user);
 }
 
 static const TypeInfo remote_machine = {
-- 
2.20.1

[PATCH v6 09/19] vfio-user: find and init PCI device

2022-02-16 Thread Jagannathan Raman

Find the PCI device with specified id. Initialize the device context
with the QEMU PCI device

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Reviewed-by: Stefan Hajnoczi 
---
 hw/remote/vfio-user-obj.c | 59 +++
 1 file changed, 59 insertions(+)

diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 496e6c8038..9c76913545 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -44,6 +44,8 @@
 #include "qemu/notify.h"
 #include "sysemu/sysemu.h"
 #include "libvfio-user.h"
+#include "hw/qdev-core.h"
+#include "hw/pci/pci.h"
 
 #define TYPE_VFU_OBJECT "x-vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -89,6 +91,10 @@ struct VfuObject {
 Notifier machine_done;
 
 vfu_ctx_t *vfu_ctx;
+
+PCIDevice *pci_dev;
+
+Error *unplug_blocker;
 };
 
 static void vfu_object_init_ctx(VfuObject *o, Error **errp);
@@ -163,6 +169,9 @@ static void vfu_object_machine_done(Notifier *notifier, 
void *data)
 static void vfu_object_init_ctx(VfuObject *o, Error **errp)
 {
 ERRP_GUARD();
+DeviceState *dev = NULL;
+vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+int ret;
 
 if (o->vfu_ctx || !o->socket || !o->device ||
 !phase_check(PHASE_MACHINE_READY)) {
@@ -181,6 +190,48 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
 error_setg(errp, "vfu: Failed to create context - %s", 
strerror(errno));
 return;
 }
+
+dev = qdev_find_recursive(sysbus_get_default(), o->device);
+if (dev == NULL) {
+error_setg(errp, "vfu: Device %s not found", o->device);
+goto fail;
+}
+
+if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+error_setg(errp, "vfu: %s not a PCI device", o->device);
+goto fail;
+}
+
+o->pci_dev = PCI_DEVICE(dev);
+
+if (pci_is_express(o->pci_dev)) {
+pci_type = VFU_PCI_TYPE_EXPRESS;
+}
+
+ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
+if (ret < 0) {
+error_setg(errp,
+   "vfu: Failed to attach PCI device %s to context - %s",
+   o->device, strerror(errno));
+goto fail;
+}
+
+error_setg(&o->unplug_blocker,
+   "vfu: %s for %s must be deleted before unplugging",
+   TYPE_VFU_OBJECT, o->device);
+qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+
+return;
+
+fail:
+vfu_destroy_ctx(o->vfu_ctx);
+if (o->unplug_blocker && o->pci_dev) {
+qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+error_free(o->unplug_blocker);
+o->unplug_blocker = NULL;
+}
+o->vfu_ctx = NULL;
+o->pci_dev = NULL;
 }
 
 static void vfu_object_init(Object *obj)
@@ -221,6 +272,14 @@ static void vfu_object_finalize(Object *obj)
 
 o->device = NULL;
 
+if (o->unplug_blocker && o->pci_dev) {
+qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+error_free(o->unplug_blocker);
+o->unplug_blocker = NULL;
+}
+
+o->pci_dev = NULL;
+
 if (!k->nr_devs && k->auto_shutdown) {
 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
 }
-- 
2.20.1

[PATCH v6 11/19] vfio-user: handle PCI config space accesses

2022-02-16 Thread Jagannathan Raman

Define and register handlers for PCI config space accesses

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Reviewed-by: Stefan Hajnoczi 
---
 hw/remote/vfio-user-obj.c | 45 +++
 hw/remote/trace-events|  2 ++
 2 files changed, 47 insertions(+)

diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 384ec4612d..4c4280d603 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -47,6 +47,7 @@
 #include "qapi/qapi-events-misc.h"
 #include "qemu/notify.h"
 #include "qemu/thread.h"
+#include "qemu/main-loop.h"
 #include "sysemu/sysemu.h"
 #include "libvfio-user.h"
 #include "hw/qdev-core.h"
@@ -217,6 +218,39 @@ retry_attach:
 qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
 }
 
+static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
+ size_t count, loff_t offset,
+ const bool is_write)
+{
+VfuObject *o = vfu_get_private(vfu_ctx);
+uint32_t pci_access_width = sizeof(uint32_t);
+size_t bytes = count;
+uint32_t val = 0;
+char *ptr = buf;
+int len;
+
+while (bytes > 0) {
+len = (bytes > pci_access_width) ? pci_access_width : bytes;
+if (is_write) {
+memcpy(&val, ptr, len);
+pci_host_config_write_common(o->pci_dev, offset,
+ pci_config_size(o->pci_dev),
+ val, len);
+trace_vfu_cfg_write(offset, val);
+} else {
+val = pci_host_config_read_common(o->pci_dev, offset,
+  pci_config_size(o->pci_dev), 
len);
+memcpy(ptr, &val, len);
+trace_vfu_cfg_read(offset, val);
+}
+offset += len;
+ptr += len;
+bytes -= len;
+}
+
+return count;
+}
+
 /*
  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
  * properties. It also depends on devices instantiated in QEMU. These
@@ -293,6 +327,17 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
TYPE_VFU_OBJECT, o->device);
 qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
 
+ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
+   pci_config_size(o->pci_dev), &vfu_object_cfg_access,
+   VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
+   NULL, 0, -1, 0);
+if (ret < 0) {
+error_setg(errp,
+   "vfu: Failed to setup config space handlers for %s- %s",
+   o->device, strerror(errno));
+goto fail;
+}
+
 ret = vfu_realize_ctx(o->vfu_ctx);
 if (ret < 0) {
 error_setg(errp, "vfu: Failed to realize device %s- %s",
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 7da12f0d96..2ef7884346 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -5,3 +5,5 @@ mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to 
receive %d size %d,
 
 # vfio-user-obj.c
 vfu_prop(const char *prop, const char *val) "vfu: setting %s as %s"
+vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u -> 0x%x"
+vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u <- 0x%x"
-- 
2.20.1

[PATCH v6 04/19] remote/machine: add HotplugHandler for remote machine

2022-02-16 Thread Jagannathan Raman

Allow hotplugging of PCI(e) devices to remote machine

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hw/remote/machine.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index 952105eab5..0c5bd4f923 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -21,6 +21,7 @@
 #include "qapi/error.h"
 #include "hw/pci/pci_host.h"
 #include "hw/remote/iohub.h"
+#include "hw/qdev-core.h"
 
 static void remote_machine_init(MachineState *machine)
 {
@@ -54,14 +55,19 @@ static void remote_machine_init(MachineState *machine)
 
 pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
  &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+
+qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
 }
 
 static void remote_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
+HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
 
 mc->init = remote_machine_init;
 mc->desc = "Experimental remote machine";
+
+hc->unplug = qdev_simple_device_unplug_cb;
 }
 
 static const TypeInfo remote_machine = {
@@ -69,6 +75,10 @@ static const TypeInfo remote_machine = {
 .parent = TYPE_MACHINE,
 .instance_size = sizeof(RemoteMachineState),
 .class_init = remote_machine_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_HOTPLUG_HANDLER },
+{ }
+}
 };
 
 static void remote_machine_register_types(void)
-- 
2.20.1

[PATCH v6 00/19] vfio-user server in QEMU

2022-02-16 Thread Jagannathan Raman

Hi,

This is v6 of the server side changes to enable vfio-user in QEMU.

Thank you very much for your feedback for the last revision which
helped to streamline the overall design. We've made the following
changes to this revision:

[PATCH v6 03/19] qdev: unplug blocker for devices
  - removed test which prevented an unplug blocker
from getting added if migration was in progress
  - added comments to function

[PATCH v6 04/19] remote/machine: add HotplugHandler for remote machine
  - changed commit message prefix from vfio-user to "remote/machine"

[PATCH v6 05/19] remote/machine: add vfio-user property
  - new in this series

[PATCH v6 07/19] vfio-user: define vfio-user-server object
  - fixed typo noted in the review
  - moved error message before setting "o->socket = NULL" in
vfu_object_set_socket()
  - added "vfio-user=on" to the usage comment at the top of file

[PATCH v6 08/19] vfio-user: instantiate vfio-user context
  - added error message to the object set property message when
server is already running

[PATCH v6 09/19] vfio-user: find and init PCI device
  - added more detailed error message for device unplug blocker

[PATCH v6 10/19] vfio-user: run vfio-user context
  - send ID of device in VFU_CLIENT_HANGUP instead of path
  - disable FD handler in object finalize

[PATCH v6 12/19] vfio-user: IOMMU support for remote device
  - new in this series

[PATCH v6 13/19] vfio-user: handle DMA mappings
  - Setup IOMMU for remote machine if vfio-user is enabled
  - Map/Unmap the DMA regions in the IOMMU address space in
dma_register()/dma_unregister() using
pci_device_iommu_address_space() function

[PATCH v6 14/19] vfio-user: handle PCI BAR accesses
  - vfu_object_bar_rw() - directly access the bar region
instead of accessing via address_space_rw()
  - register handler for PCI ROM region
  - set read only flags for read only MemoryRegions with
vfu_setup_region()

[PATCH v6 15/19] vfio-user: handle device interrupts
  - setup separate PCI bus map_irq and set_irq for
vfio-user during remote machine init
  - index hash table using PCI bus device function numbers

[PATCH v6 16/19] softmmu/vl: defer backend init
  - new in this series

[PATCH v6 17/19] vfio-user: register handlers to facilitate migration
  - enable streaming for migration data instead of pre-determining
the migration data size at boot
  - dropped migrated_devs static variable to track the number of
devices migrated
  - added helper functions to independently start and stop block and
network devices
  - updated qemu_remote_savevm() to migrate data of all the
devices under the target device

[PATCH v6 18/19] vfio-user: handle reset of remote device
  - new in this series

[PATCH v6 19/19] vfio-user: avocado tests for vfio-user
  - use QMP command for hotplug instead of HMP command
  - confirm the state of source and destination VMs after migration
  - testing megasas device instead of lsi53c895a as lsi53c895a
doesn't seem to support IOMMU, which is enabled by default
on the server

We dropped the following patches from the previous revision:
  - pci: isolated address space for PCI bus
  - pci: create and free isolated PCI buses
  - vfio-user: set qdev bus callbacks for remote machine

We are looking forward to your comments.

Thank you very much!

Jagannathan Raman (19):
  configure, meson: override C compiler for cmake
  tests/avocado: Specify target VM argument to helper routines
  qdev: unplug blocker for devices
  remote/machine: add HotplugHandler for remote machine
  remote/machine: add vfio-user property
  vfio-user: build library
  vfio-user: define vfio-user-server object
  vfio-user: instantiate vfio-user context
  vfio-user: find and init PCI device
  vfio-user: run vfio-user context
  vfio-user: handle PCI config space accesses
  vfio-user: IOMMU support for remote device
  vfio-user: handle DMA mappings
  vfio-user: handle PCI BAR accesses
  vfio-user: handle device interrupts
  softmmu/vl: defer backend init
  vfio-user: register handlers to facilitate migration
  vfio-user: handle reset of remote device
  vfio-user: avocado tests for vfio-user

 configure  |   21 +-
 meson.build|   44 +-
 qapi/misc.json |   23 +
 qapi/qom.json  |   20 +-
 include/block/block.h  |1 +
 include/exec/memory.h  |3 +
 include/hw/pci/pci.h   |6 +
 include/hw/qdev-core.h |   35 +
 include/hw/remote/iommu.h  |   18 +
 include/hw/remote/machine.h|2 +
 include/hw/remote/vfio-user-obj.h  |6 +
 include/migration/vmstate.h|2 +
 include/sysemu/sysemu.h|4 +
 migration/savevm.h |2 +
 block.c|5 +
 block/block-backend.c  |

[PATCH v6 01/19] configure, meson: override C compiler for cmake

2022-02-16 Thread Jagannathan Raman

The compiler path that cmake gets from meson is corrupted. It results in
the following error:
| -- The C compiler identification is unknown
| CMake Error at CMakeLists.txt:35 (project):
| The CMAKE_C_COMPILER:
| /opt/rh/devtoolset-9/root/bin/cc;-m64;-mcx16
| is not a full path to an existing compiler tool.

Explicitly specify the C compiler for cmake to avoid this error

Signed-off-by: Jagannathan Raman 
Acked-by: Paolo Bonzini 
---
 configure | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configure b/configure
index 3a29eff5cc..9a326eda1e 100755
--- a/configure
+++ b/configure
@@ -3726,6 +3726,8 @@ if test "$skip_meson" = no; then
   echo "cpp_args = [$(meson_quote $CXXFLAGS $EXTRA_CXXFLAGS)]" >> $cross
   echo "c_link_args = [$(meson_quote $CFLAGS $LDFLAGS $EXTRA_CFLAGS 
$EXTRA_LDFLAGS)]" >> $cross
   echo "cpp_link_args = [$(meson_quote $CXXFLAGS $LDFLAGS $EXTRA_CXXFLAGS 
$EXTRA_LDFLAGS)]" >> $cross
+  echo "[cmake]" >> $cross
+  echo "CMAKE_C_COMPILER = [$(meson_quote $cc $CPU_CFLAGS)]" >> $cross
   echo "[binaries]" >> $cross
   echo "c = [$(meson_quote $cc $CPU_CFLAGS)]" >> $cross
   test -n "$cxx" && echo "cpp = [$(meson_quote $cxx $CPU_CFLAGS)]" >> $cross
-- 
2.20.1

Re: [PATCH 3/3] x86: Switch to q35 as the default machine type

2022-02-16 Thread Thomas Huth


On 16/02/2022 18.57, Dr. David Alan Gilbert wrote:

* Daniel P. Berrangé (berra...@redhat.com) wrote:

On Wed, Feb 16, 2022 at 05:40:44PM +0000, Dr. David Alan Gilbert wrote:

* Thomas Huth (th...@redhat.com) wrote:

On 16/02/2022 12.01, Dr. David Alan Gilbert wrote:

* Gerd Hoffmann (kra...@redhat.com) wrote:

Hi,

Given the semantic differences from 'i440fx', changing the default
machine type has effects that are equivalent to breaking command
line syntax compatibility, which is something we've always tried
to avoid.


And if we are fine breaking backward compatibility I'd rather *not* pick
a default, effectively making -M $something mandatory, similar to arm.


Oh, that's probably easy to do;  what are other people's thoughts on
that?


I agree with Gerd. Getting rid of a default machine on x86 is likely better
than silently changing it to q35. But I'd maybe say that this should go
through the deprecation process first?


So just adding something like the following under 'System emulator
machines':

x86 default machine type


x86 currently defaults to the very old ```pc``` machine type
which is based on the very old ```i440fx``` chipset.  This default
will be removed and the user will be required to specify a machine
type explicitly using -M; users are encouraged to switch to the
not quite as old ```q35``` machine types.


(This option is going to take a lot more work switching all the
test cases over; in my world here I'd only changed the tests that broke
on q35, now everything is going to need to specify a type).


which is still nothing compared to how many users will be impacted
and the docs all over the internet we'll never be able to change, which
give illustrations using qemu command lines without '-M pc'


What's your preference - it sounds like you'd prefer to leave the
default as 'pc' ?

aarch's message is:
qemu-system-aarch64: No machine specified, and there is no default
Use -machine help to list supported machines

We could add a:
'Use -machine pc for the old default behaviour'


Sounds like a good idea, but then I'd also do:

'Use -M pc for the old default behaviour, or -M q35 for a more modern 
machine' or so.


 Thomas

Re: [PATCH v8 3/3] qapi/monitor: allow VNC display id in set/expire_password

2022-02-16 Thread Fabian Ebner

Am 09.02.22 um 15:07 schrieb Markus Armbruster:
> Fabian Ebner  writes:
> 
>> From: Stefan Reiter 
>>
>> It is possible to specify more than one VNC server on the command line,
>> either with an explicit ID or the auto-generated ones à la "default",
>> "vnc2", "vnc3", ...
>>
>> It is not possible to change the password on one of these extra VNC
>> displays though. Fix this by adding a "display" parameter to the
>> "set_password" and "expire_password" QMP and HMP commands.
>>
>> For HMP, the display is specified using the "-d" value flag.
>>
>> For QMP, the schema is updated to explicitly express the supported
>> variants of the commands with protocol-discriminated unions.
>>
>> Suggested-by: Markus Armbruster 
> 
> Did I suggest this feature?  I don't remember...  Most likely, I merely
> suggested using a union.  Mind if I drop this tag?
> 

Yes, Stefan might've put the tag because of the suggested approach. I'll
drop it.

>> Signed-off-by: Stefan Reiter 
>> [FE: update "Since: " from 6.2 to 7.0
>>  set {has_}connected for VNC in hmp_set_password]
>> Signed-off-by: Fabian Ebner 
>> ---
>>
>> v7 -> v8:
>> * add missing # in the description for @ExpirePasswordOptions
>> * other changes are already mentioned above
>>
>>  hmp-commands.hx|  24 +-
>>  monitor/hmp-cmds.c |  39 
>>  monitor/qmp-cmds.c |  34 ++
>>  qapi/ui.json   | 110 -
>>  4 files changed, 145 insertions(+), 62 deletions(-)
>>
>> diff --git a/hmp-commands.hx b/hmp-commands.hx
>> index 70a9136ac2..cc2f4bdeba 100644
>> --- a/hmp-commands.hx
>> +++ b/hmp-commands.hx
>> @@ -1514,33 +1514,35 @@ ERST
>>  
>>  {
>>  .name   = "set_password",
>> -.args_type  = "protocol:s,password:s,connected:s?",
>> -.params = "protocol password action-if-connected",
>> +.args_type  = "protocol:s,password:s,display:-dV,connected:s?",
>> +.params = "protocol password [-d display] 
>> [action-if-connected]",
>>  .help   = "set spice/vnc password",
>>  .cmd= hmp_set_password,
>>  },
>>  
>>  SRST
>> -``set_password [ vnc | spice ] password [ action-if-connected ]``
>> -  Change spice/vnc password.  *action-if-connected* specifies what
>> -  should happen in case a connection is established: *fail* makes the
>> -  password change fail.  *disconnect* changes the password and
>> +``set_password [ vnc | spice ] password [ -d display ] [ 
>> action-if-connected ]``
> 
> This is the first flag with an argument in HMP.  The alternative is
> another optional argument.
> 
> PRO optional argument: no need for PATCH 1.
> 
> PRO flag with argument: can specify the display without
> action-if-connected.
> 
> Dave, this is your call to make.
> 

I'll go ahead with v9 once the decision is made.

8<

>> diff --git a/qapi/ui.json b/qapi/ui.json
>> index e112409211..089f05c702 100644
>> --- a/qapi/ui.json
>> +++ b/qapi/ui.json
>> @@ -38,20 +38,61 @@
>>'data': [ 'keep', 'fail', 'disconnect' ] }
>>  
>>  ##
>> -# @set_password:
>> +# @SetPasswordOptions:
>>  #
>> -# Sets the password of a remote display session.
>> +# General options for set_password.
> 
> Actually, all the options there are.  Let's drop "General".
> 

Ok.

>>  #
>>  # @protocol: - 'vnc' to modify the VNC server password
>>  #- 'spice' to modify the Spice server password
>>  #
>>  # @password: the new password
>>  #
>> -# @connected: how to handle existing clients when changing the
>> -# password.  If nothing is specified, defaults to 'keep'
>> -# 'fail' to fail the command if clients are connected
>> -# 'disconnect' to disconnect existing clients
>> -# 'keep' to maintain existing clients
>> +# Since: 7.0
>> +#
>> +##
>> +{ 'union': 'SetPasswordOptions',
>> +  'base': { 'protocol': 'DisplayProtocol',
>> +'password': 'str' },
>> +  'discriminator': 'protocol',
>> +  'data': { 'vnc': 'SetPasswordOptionsVnc',
>> +'spice': 'SetPasswordOptionsSpice' } }
>> +
>> +##
>> +# @SetPasswordOptionsSpice:
>> +#
>> +# Options for set_password specific to the SPICE protocol.
>> +#
>> +# @connected: How to handle existing clients when changing the
>> +# password. If nothing is specified, defaults to 'keep'.
>> +#
>> +# Since: 7.0
>> +#
>> +##
>> +{ 'struct': 'SetPasswordOptionsSpice',
>> +  'data': { '*connected': 'SetPasswordAction' } }
>> +
>> +##
>> +# @SetPasswordOptionsVnc:
>> +#
>> +# Options for set_password specific to the VNC protocol.
>> +#
>> +# @display: The id of the display where the password should be changed.
>> +#   Defaults to the first.
>> +#
>> +# @connected: How to handle existing clients when changing the
>> +# password.
> 
> Neglects to document the default, unlike SetPasswordOptionsSpice above.
> 

Will add it in v9.

>> +#
>> +# Since: 7.0
>> +#
>> +##
>> +{ 'struct': 'SetPasswordOptionsVnc',
>> +  'data': {

Re: [PATCH 2/2] Allow VIRTIO_F_IN_ORDER to be negotiated for vdpa devices

2022-02-16 Thread Michael S. Tsirkin

On Tue, Feb 15, 2022 at 12:52:31PM +0530, Gautam Dawar wrote:
> This patch adds the ability to negotiate VIRTIO_F_IN_ORDER bit
> for vhost-vdpa backend when the underlying device supports this
> feature.
> This would aid in reaping performance benefits with HW devices
> that implement this feature. At the same time, it shouldn't have
> any negative impact as vhost-vdpa backend doesn't involve any
> userspace virtqueue operations.
> 
> Signed-off-by: Gautam Dawar 

Having features that hardware implements but qemu does not
means we can't migrate between them.
So I'd rather see a userspace implementation.

> ---
>  hw/net/virtio-net.c | 10 ++
>  net/vhost-vdpa.c|  1 +
>  2 files changed, 11 insertions(+)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index cf8ab0f8af..a1089d06f6 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -3507,11 +3507,21 @@ static void virtio_net_device_realize(DeviceState 
> *dev, Error **errp)
>  nc->rxfilter_notify_enabled = 1;
>  
> if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
> +uint64_t features = BIT_ULL(VIRTIO_F_IN_ORDER);
>  struct virtio_net_config netcfg = {};
> +
>  memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
>  vhost_net_set_config(get_vhost_net(nc->peer),
>  (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_MASTER);
> +
> + /*
> + * For vhost-vdpa, if underlying device supports IN_ORDER feature,
> + * make it available for negotiation.
> + */
> + features = vhost_net_get_features(get_vhost_net(nc->peer), features);
> + n->host_features |= features;
>  }
> +
>  QTAILQ_INIT(&n->rsc_chains);
>  n->qdev = dev;
>  
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 25dd6dd975..2886cba5ec 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = {
>  VIRTIO_NET_F_CTRL_VQ,
>  VIRTIO_F_IOMMU_PLATFORM,
>  VIRTIO_F_RING_PACKED,
> +VIRTIO_F_IN_ORDER,
>  VIRTIO_NET_F_RSS,
>  VIRTIO_NET_F_HASH_REPORT,
>  VIRTIO_NET_F_GUEST_ANNOUNCE,
> -- 
> 2.30.1

Re: QEMU's Haiku CI image

2022-02-16 Thread Thomas Huth


On 16/02/2022 20.21, Daniel P. Berrangé wrote:

[...] The main issue is that for non-Linux,
we don't have full automation for building the VM templates. We need
someone to prepare the image by getting it able to run and expose
SSH, whereupon we can provision the build-deps.


That's easy: In QEMU build folder, type:

 make vm-build-netbsd
 make vm-build-openbsd
 make vm-build-haiku.x86_64

... and then you can find the images in the ~/.cache/qemu-vm/images/ folder.

 Thomas

Re: [PATCH v4 2/2] target/riscv: Enable Zicbo[m,z,p] instructions

2022-02-16 Thread Weiwei Li



在 2022/2/17 上午11:59, Christoph Müllner 写道:



On Thu, Feb 17, 2022 at 3:15 AM Weiwei Li > wrote:



在 2022/2/16 下午11:48, Christoph Muellner 写道:
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 39ffb883fc..04500fe352 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -764,6 +764,10 @@ static Property riscv_cpu_properties[] = {
>       DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters,
true),
>       DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
>       DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
> +    DEFINE_PROP_BOOL("zicbom", RISCVCPU, cfg.ext_icbom, true),
> +    DEFINE_PROP_BOOL("zicboz", RISCVCPU, cfg.ext_icboz, true),
> +    DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU,
cfg.cbom_blocksize, 64),
> +    DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU,
cfg.cboz_blocksize, 64),
Why use two different cache block size here? Is there any new spec
update for this?


No, we are talking about the same specification.

Section 2.7 states the following:
"""
The initial set of CMO extensions requires the following information 
to be discovered by software:

* The size of the cache block for management and prefetch instructions
* The size of the cache block for zero instructions
* CBIE support at each privilege level
"""

So at least the spec authors did differentiate between the two block 
sizes as well.



OK. This seems a little unreasonable from my personal point of view.


>       DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
>       DEFINE_PROP_BOOL("Zfhmin", RISCVCPU, cfg.ext_zfhmin, false),
>       DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
> +
> +/* helper_zicbom_access
> + *
> + * Check access permissions (LOAD, STORE or FETCH as specified
in section
> + * 2.5.2 of the CMO specification) for Zicbom, raising either store
> + * page-fault (non-virtualised) or store guest-page fault
(virtualised).
> + */
> +static void helper_zicbom_access(CPURISCVState *env,
target_ulong address,
> +                                 uintptr_t ra)
> +{
> +    int ret;
> +    void* phost;
> +    int mmu_idx = cpu_mmu_index(env, false);
> +
> +    /* Get the size of the cache block for management
instructions. */
> +    RISCVCPU *cpu = env_archcpu(env);
> +    uint16_t cbomlen = cpu->cfg.cbom_blocksize;
> +
> +    /* Mask off low-bits to align-down to the cache-block. */
> +    address &= ~(cbomlen - 1);
> +
> +    /* A cache-block management instruction is permitted to access
> +     * the specified cache block whenever a load instruction, store
> +     * instruction, or instruction fetch is permitted to access the
> +     * corresponding physical addresses.
> +     */
> +    ret = probe_access_range_flags(env, address, cbomlen,
MMU_DATA_LOAD,
> +                                   mmu_idx, true, &phost, ra);
> +    if (ret == TLB_INVALID_MASK)
> +        ret = probe_access_range_flags(env, address, cbomlen,
MMU_INST_FETCH,
> +                                       mmu_idx, true, &phost, ra);
> +    if (ret == TLB_INVALID_MASK)
> +        probe_access_range_flags(env, address, cbomlen,
MMU_DATA_STORE,
> +                                 mmu_idx, false, &phost, ra);
> +}
> +


I think it's a little different here. Probe_access_range_flags may
trigger different exceptions for different access_type. For example:

If  the page for the address  is executable and readable but not
writable,  and the access cannot pass the pmp check for all
access_type,

it may trigger access fault for load/fetch access, and trigger page
fault for  store access.


Just to be clear:
The patch does not trigger any fault for LOAD or FETCH because 
nonfault is set

to true (6th argument of probe_access_range_flags()).
Only the last call to probe_access_range_flags() raises an exception.

Section 2.5.2 states the following:
"""
If access to the cache block is not permitted, a cache-block management
instruction raises a store page fault or store guest-page fault 
exception if address translation does not permit any

access or raises a store access fault exception otherwise.
"""

In your scenario we have (1...allowed; 0...not allowed):
* read: perm:1, pmp:0
* fetch: perm:1, pmp:0
* write: perm:0, pmp:0

Address translation would allow read and fetch access, but PMP blocks 
that.
So the "does not permit any"-part is wrong, therefore we should raise 
a store page fault.


There is debate between us here. I think the opposite of "any" here is 
"permit one of access type" not "permit all access types".


Also, your code above skips the checks for fetch and write when read
access is permitted (the only difference from my example is that read
also passes the PMP check).


So if Address

Re: [PATCH 1/2] linux headers: update against Linux 5.17-rc4

2022-02-16 Thread Jason Wang

On Tue, Feb 15, 2022 at 3:23 PM Gautam Dawar  wrote:
>
> This update is done to bring in the definition of VIRTIO_F_IN_ORDER
> from Linux kernel's include/uapi/linux/virtio_config.h.
> A patch was recently published to add VIRTIO_F_IN_ORDER's definition
> in the Linux kernel on top of version 5.17-rc4
>
> Signed-off-by: Gautam Dawar 

I may miss something, this might only work if the kernel patch has
been merged. AFAIK, it's not merged yet.

Thanks

> ---
>  include/standard-headers/asm-x86/kvm_para.h   |   1 +
>  include/standard-headers/drm/drm_fourcc.h |  11 ++
>  include/standard-headers/linux/ethtool.h  |   1 +
>  include/standard-headers/linux/fuse.h |  60 +++-
>  include/standard-headers/linux/pci_regs.h | 142 +-
>  .../standard-headers/linux/virtio_config.h|   6 +
>  include/standard-headers/linux/virtio_gpio.h  |  72 +
>  include/standard-headers/linux/virtio_i2c.h   |  47 ++
>  include/standard-headers/linux/virtio_iommu.h |   8 +-
>  .../standard-headers/linux/virtio_pcidev.h|  65 
>  include/standard-headers/linux/virtio_scmi.h  |  24 +++
>  linux-headers/asm-generic/unistd.h|   5 +-
>  linux-headers/asm-mips/unistd_n32.h   |   2 +
>  linux-headers/asm-mips/unistd_n64.h   |   2 +
>  linux-headers/asm-mips/unistd_o32.h   |   2 +
>  linux-headers/asm-powerpc/unistd_32.h |   2 +
>  linux-headers/asm-powerpc/unistd_64.h |   2 +
>  linux-headers/asm-riscv/bitsperlong.h |  14 ++
>  linux-headers/asm-riscv/mman.h|   1 +
>  linux-headers/asm-riscv/unistd.h  |  44 ++
>  linux-headers/asm-s390/unistd_32.h|   2 +
>  linux-headers/asm-s390/unistd_64.h|   2 +
>  linux-headers/asm-x86/kvm.h   |  19 ++-
>  linux-headers/asm-x86/unistd_32.h |   1 +
>  linux-headers/asm-x86/unistd_64.h |   1 +
>  linux-headers/asm-x86/unistd_x32.h|   1 +
>  linux-headers/linux/kvm.h |  18 +++
>  27 files changed, 479 insertions(+), 76 deletions(-)
>  create mode 100644 include/standard-headers/linux/virtio_gpio.h
>  create mode 100644 include/standard-headers/linux/virtio_i2c.h
>  create mode 100644 include/standard-headers/linux/virtio_pcidev.h
>  create mode 100644 include/standard-headers/linux/virtio_scmi.h
>  create mode 100644 linux-headers/asm-riscv/bitsperlong.h
>  create mode 100644 linux-headers/asm-riscv/mman.h
>  create mode 100644 linux-headers/asm-riscv/unistd.h
>
> diff --git a/include/standard-headers/asm-x86/kvm_para.h 
> b/include/standard-headers/asm-x86/kvm_para.h
> index 204cfb8640..f0235e58a1 100644
> --- a/include/standard-headers/asm-x86/kvm_para.h
> +++ b/include/standard-headers/asm-x86/kvm_para.h
> @@ -8,6 +8,7 @@
>   * should be used to determine that a VM is running under KVM.
>   */
>  #define KVM_CPUID_SIGNATURE0x4000
> +#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
>
>  /* This CPUID returns two feature bitmaps in eax, edx. Before enabling
>   * a particular paravirtualization, the appropriate feature bit should
> diff --git a/include/standard-headers/drm/drm_fourcc.h 
> b/include/standard-headers/drm/drm_fourcc.h
> index 2c025cb4fe..4888f85f69 100644
> --- a/include/standard-headers/drm/drm_fourcc.h
> +++ b/include/standard-headers/drm/drm_fourcc.h
> @@ -313,6 +313,13 @@ extern "C" {
>   */
>  #define DRM_FORMAT_P016fourcc_code('P', '0', '1', '6') /* 
> 2x2 subsampled Cr:Cb plane 16 bits per channel */
>
> +/* 2 plane YCbCr420.
> + * 3 10 bit components and 2 padding bits packed into 4 bytes.
> + * index 0 = Y plane, [31:0] x:Y2:Y1:Y0 2:10:10:10 little endian
> + * index 1 = Cr:Cb plane, [63:0] x:Cr2:Cb2:Cr1:x:Cb1:Cr0:Cb0 
> [2:10:10:10:2:10:10:10] little endian
> + */
> +#define DRM_FORMAT_P030fourcc_code('P', '0', '3', '0') /* 
> 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
> +
>  /* 3 plane non-subsampled (444) YCbCr
>   * 16 bits per component, but only 10 bits are used and 6 bits are padded
>   * index 0: Y plane, [15:0] Y:x [10:6] little endian
> @@ -853,6 +860,10 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t 
> modifier)
>   * and UV.  Some SAND-using hardware stores UV in a separate tiled
>   * image from Y to reduce the column height, which is not supported
>   * with these modifiers.
> + *
> + * The DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT modifier is also
> + * supported for DRM_FORMAT_P030 where the columns remain as 128 bytes
> + * wide, but as this is a 10 bpp format that translates to 96 pixels.
>   */
>
>  #define DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(v) \
> diff --git a/include/standard-headers/linux/ethtool.h 
> b/include/standard-headers/linux/ethtool.h
> index 688eb8dc39..38d5a4cd6e 100644
> --- a/include/standard-headers/linux/ethtool.h
> +++ b/include/standard-headers/linux/ethtool.h
> @@ -231,6 +231,7 @@ enum tunable_id {
> ETHTOOL_RX_COPYBREAK,
>

Re: [PATCH 2/2] Allow VIRTIO_F_IN_ORDER to be negotiated for vdpa devices

2022-02-16 Thread Jason Wang

On Tue, Feb 15, 2022 at 3:23 PM Gautam Dawar  wrote:
>
> This patch adds the ability to negotiate VIRTIO_F_IN_ORDER bit
> for vhost-vdpa backend when the underlying device supports this
> feature.
> This would aid in reaping performance benefits with HW devices
> that implement this feature. At the same time, it shouldn't have
> any negative impact as vhost-vdpa backend doesn't involve any
> userspace virtqueue operations.
>
> Signed-off-by: Gautam Dawar 
> ---
>  hw/net/virtio-net.c | 10 ++
>  net/vhost-vdpa.c|  1 +
>  2 files changed, 11 insertions(+)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index cf8ab0f8af..a1089d06f6 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -3507,11 +3507,21 @@ static void virtio_net_device_realize(DeviceState 
> *dev, Error **errp)
>  nc->rxfilter_notify_enabled = 1;
>
> if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
> +uint64_t features = BIT_ULL(VIRTIO_F_IN_ORDER);
>  struct virtio_net_config netcfg = {};
> +
>  memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
>  vhost_net_set_config(get_vhost_net(nc->peer),
>  (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_MASTER);
> +
> +   /*
> + * For vhost-vdpa, if underlying device supports IN_ORDER feature,
> + * make it available for negotiation.
> + */
> +   features = vhost_net_get_features(get_vhost_net(nc->peer), features);
> +   n->host_features |= features;

This looks like a hack, considering we will finally support in_order.
I wonder if it's better to

1) introduce command line parameters "in_order"
2) fail without vhost-vdpa

?

Thanks

>  }
> +
>  QTAILQ_INIT(&n->rsc_chains);
>  n->qdev = dev;
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 25dd6dd975..2886cba5ec 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = {
>  VIRTIO_NET_F_CTRL_VQ,
>  VIRTIO_F_IOMMU_PLATFORM,
>  VIRTIO_F_RING_PACKED,
> +VIRTIO_F_IN_ORDER,
>  VIRTIO_NET_F_RSS,
>  VIRTIO_NET_F_HASH_REPORT,
>  VIRTIO_NET_F_GUEST_ANNOUNCE,
> --
> 2.30.1
>

Re: Call for GSoC and Outreachy project ideas for summer 2022

2022-02-16 Thread Alice Frosi

On Fri, Jan 28, 2022 at 6:04 PM Stefan Hajnoczi  wrote:
>
> Dear QEMU, KVM, and rust-vmm communities,
> QEMU will apply for Google Summer of Code 2022
> (https://summerofcode.withgoogle.com/) and has been accepted into
> Outreachy May-August 2022 (https://www.outreachy.org/). You can now
> submit internship project ideas for QEMU, KVM, and rust-vmm!
>
> If you have experience contributing to QEMU, KVM, or rust-vmm you can
> be a mentor. It's a great way to give back and you get to work with
> people who are just starting out in open source.
>
> Please reply to this email by February 21st with your project ideas.
>
> Good project ideas are suitable for remote work by a competent
> programmer who is not yet familiar with the codebase. In
> addition, they are:
> - Well-defined - the scope is clear
> - Self-contained - there are few dependencies
> - Uncontroversial - they are acceptable to the community
> - Incremental - they produce deliverables along the way
>
> Feel free to post ideas even if you are unable to mentor the project.
> It doesn't hurt to share the idea!
>

I'd like to propose this idea:

Title: Create encrypted storage using VM-based container runtimes

Cryptsetup requires root privileges in order to be able to encrypt
storage with luks. However, privileged containers are generally
discouraged for security reasons. A possible solution to avoid extra
privileges is using VM-based container runtimes (e.g. crun with libkrun
or kata-containers) and running inside the Virtual Machine the tools
for the storage encryption.

This internship focuses on a PoC for integrating and extending crun with
libkrun in order to be able to create encrypted storage. The initial
step will focus on creating encrypted images to demonstrate the
feasibility and the necessary changes in the stack. If the timeframe
allows it, an interesting follow-up of the first step is the
encryption of persistent storage using block-based PVCs.

Language: C, rust, golang
Skills: containers and virtualization would be a big plus
I won't put a level but the intern needs to be willing to dig into
different source codes like crun (written in C), libkrun (written in
Rust) and possibly podman or other kubernetes/containers projects
(written in go)
Mentor: Alice Frosi, Co-mentor: Sergio Lopez Pascual

Let me know if the idea sounds feasible to you!

Many thanks,

Alice

Re: [PATCH v3 0/7] malta: Fix PCI IRQ levels to be preserved during migration, cleanup

2022-02-16 Thread Michael S. Tsirkin

On Wed, Feb 16, 2022 at 11:45:12PM +0100, Bernhard Beschow wrote:
> Tested with [1]:
> 
>   qemu-system-mipsel -M malta -kernel vmlinux-3.2.0-4-4kc-malta -hda \
>   debian_wheezy_mipsel_standard.qcow2 -append "root=/dev/sda1 console=tty0"
> 
> It was possible to log in as root and `poweroff` the machine.
> 
> Moreover, I ran:
> 
>   :$ make check
>   Ok: 569
>   Expected Fail:  0
>   Fail:   0
>   Unexpected Pass:0
>   Skipped:178
>   Timeout:0
> 
> [1] https://people.debian.org/~aurel32/qemu/mips/
> 

Who's merging this? I assume mips guys?

> v3:
>   The migration bug now gets fixed in gt64xxx_pci before any cleanup. As
> suggested by PMM the patch is based on commit e735b55a8c11.
>   The code movement patch now moves the already fixed code. I might be a bit
> too conservative here by removing Philippe's Reviewed-By tag.
>   As suggested by BALATON Zoltan, the redundant i8259[] attribute is now
> resolved immediately after the code movement. As a side effect, it also
> removes moved code which doesn't adhere to the coding style (local loop
> variable).
>   To address BALATON Zoltan's comment and to reduce the number of required
> Reviewed-By's, only piix4_set_irq() is modified to expect own DeviceState
> parameter. Up to v2, all remaining set_irq() functions were changed this
> way.
>   The patch resolving piix4's singleton variable got split into two patches:
> One which resolves the singleton variable and one which replaces magic
> constants. The split patches should be more comprehensible.
>   Suggested by BALATON Zoltan, I took a chance to resolve gt64120_register(),
> a method akin to the legacy init functions we're trying to get rid of.
> 
> v2:
>   isa/piix4: Fix PCI IRQ levels to be preserved in VMState
>   isa/piix4: Resolve redundant i8259[] attribute
> 
> Bernhard Beschow (7):
>   hw/mips/gt64xxx_pci: Fix PCI IRQ levels to be preserved during
> migration
>   malta: Move PCI interrupt handling from gt64xxx_pci to piix4
>   hw/isa/piix4: Resolve redundant i8259[] attribute
>   hw/isa/piix4: Pass PIIX4State as opaque parameter for piix4_set_irq()
>   hw/isa/piix4: Resolve global instance variable
>   hw/isa/piix4: Replace some magic IRQ constants
>   hw/mips/gt64xxx_pci: Resolve gt64120_register()
> 
>  hw/isa/piix4.c| 54 +--
>  hw/mips/gt64xxx_pci.c | 80 +++
>  hw/mips/malta.c   | 17 
>  include/hw/mips/mips.h|  3 --
>  include/hw/southbridge/piix.h |  2 -
>  5 files changed, 65 insertions(+), 91 deletions(-)
> 
> -- 
> 2.35.1
> 
> 
>

Re: [PATCH v3 6/7] hw/isa/piix4: Replace some magic IRQ constants

2022-02-16 Thread Michael S. Tsirkin

On Wed, Feb 16, 2022 at 11:45:18PM +0100, Bernhard Beschow wrote:
> This is a follow-up on patch "malta: Move PCI interrupt handling from
> gt64xxx_pci to piix4". gt64xxx_pci used magic constants, and probably
> didn't want to use piix4-specific constants. Now that the interrupt
> handling resides in piix4, its constants can be used.
> 
> Signed-off-by: Bernhard Beschow 

Acked-by: Michael S. Tsirkin 

> ---
>  hw/isa/piix4.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
> index 2e9b5ccada..f876c71750 100644
> --- a/hw/isa/piix4.c
> +++ b/hw/isa/piix4.c
> @@ -61,10 +61,10 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
> level)
>  /* now we change the pic irq level according to the piix irq mappings */
>  /* XXX: optimize */
>  pic_irq = s->dev.config[PIIX_PIRQCA + irq_num];
> -if (pic_irq < 16) {
> +if (pic_irq < ISA_NUM_IRQS) {
>  /* The pic level is the logical OR of all the PCI irqs mapped to it. 
> */
>  pic_level = 0;
> -for (i = 0; i < 4; i++) {
> +for (i = 0; i < PIIX_NUM_PIRQS; i++) {
>  if (pic_irq == s->dev.config[PIIX_PIRQCA + i]) {
>  pic_level |= pci_bus_get_irq_level(bus, i);
>  }
> @@ -315,7 +315,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
> **isa_bus, I2CBus **smbus)
> NULL, 0, NULL);
>  }
>  
> -pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, 4);
> +pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, 
> PIIX_NUM_PIRQS);
>  
>  return dev;
>  }
> -- 
> 2.35.1
> 
> 
>

Re: [PATCH v3 4/7] hw/isa/piix4: Pass PIIX4State as opaque parameter for piix4_set_irq()

2022-02-16 Thread Michael S. Tsirkin

On Wed, Feb 16, 2022 at 11:45:16PM +0100, Bernhard Beschow wrote:
> Passing PIIX4State rather than just the qemu_irq allows for resolving
> the global piix4_dev variable.
> 
> Signed-off-by: Bernhard Beschow 
> Reviewed-by: Peter Maydell 
> Reviewed-by: Philippe Mathieu-Daudé 

Acked-by: Michael S. Tsirkin 

> ---
>  hw/isa/piix4.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
> index 179968b18e..caa2002e2c 100644
> --- a/hw/isa/piix4.c
> +++ b/hw/isa/piix4.c
> @@ -57,7 +57,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(PIIX4State, PIIX4_PCI_DEVICE)
>  static void piix4_set_irq(void *opaque, int irq_num, int level)
>  {
>  int i, pic_irq, pic_level;
> -qemu_irq *pic = opaque;
> +PIIX4State *s = opaque;
>  PCIBus *bus = pci_get_bus(piix4_dev);
>  
>  /* now we change the pic irq level according to the piix irq mappings */
> @@ -71,7 +71,7 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
> level)
>  pic_level |= pci_bus_get_irq_level(bus, i);
>  }
>  }
> -qemu_set_irq(pic[pic_irq], pic_level);
> +qemu_set_irq(s->isa[pic_irq], pic_level);
>  }
>  }
>  
> @@ -319,7 +319,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
> **isa_bus, I2CBus **smbus)
> NULL, 0, NULL);
>  }
>  
> -pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->isa, 4);
> +pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, 4);
>  
>  return dev;
>  }
> -- 
> 2.35.1
> 
> 
>

Re: [PATCH v3 3/7] hw/isa/piix4: Resolve redundant i8259[] attribute

2022-02-16 Thread Michael S. Tsirkin

On Wed, Feb 16, 2022 at 11:45:15PM +0100, Bernhard Beschow wrote:
> This is a follow-up on patch "malta: Move PCI interrupt handling from
> gt64xxx_pci to piix4" where i8259[] was moved from MaltaState to
> PIIX4State to make the code movement more obvious. However, i8259[]
> seems redundant to *isa, so remove it.
> 
> Signed-off-by: Bernhard Beschow 

Acked-by: Michael S. Tsirkin 

> ---
>  hw/isa/piix4.c | 7 +--
>  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
> index 196b56e69c..179968b18e 100644
> --- a/hw/isa/piix4.c
> +++ b/hw/isa/piix4.c
> @@ -45,7 +45,6 @@ struct PIIX4State {
>  PCIDevice dev;
>  qemu_irq cpu_intr;
>  qemu_irq *isa;
> -qemu_irq i8259[ISA_NUM_IRQS];
>  
>  RTCState rtc;
>  /* Reset Control Register */
> @@ -320,11 +319,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
> **isa_bus, I2CBus **smbus)
> NULL, 0, NULL);
>  }
>  
> -pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->i8259, 4);
> -
> -for (int i = 0; i < ISA_NUM_IRQS; i++) {
> -s->i8259[i] = qdev_get_gpio_in_named(dev, "isa", i);
> -}
> +pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->isa, 4);
>  
>  return dev;
>  }
> -- 
> 2.35.1
> 
> 
>

Re: [PATCH v3 5/7] hw/isa/piix4: Resolve global instance variable

2022-02-16 Thread Michael S. Tsirkin

On Wed, Feb 16, 2022 at 11:45:17PM +0100, Bernhard Beschow wrote:
> Now that piix4_set_irq's opaque parameter references own PIIX4State,
> piix4_dev becomes redundant.
> 
> Signed-off-by: Bernhard Beschow 
> Reviewed-by: Philippe Mathieu-Daudé 

Acked-by: Michael S. Tsirkin 

> ---
>  hw/isa/piix4.c| 10 +++---
>  include/hw/southbridge/piix.h |  2 --
>  2 files changed, 3 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
> index caa2002e2c..2e9b5ccada 100644
> --- a/hw/isa/piix4.c
> +++ b/hw/isa/piix4.c
> @@ -39,8 +39,6 @@
>  #include "sysemu/runstate.h"
>  #include "qom/object.h"
>  
> -PCIDevice *piix4_dev;
> -
>  struct PIIX4State {
>  PCIDevice dev;
>  qemu_irq cpu_intr;
> @@ -58,16 +56,16 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
> level)
>  {
>  int i, pic_irq, pic_level;
>  PIIX4State *s = opaque;
> -PCIBus *bus = pci_get_bus(piix4_dev);
> +PCIBus *bus = pci_get_bus(&s->dev);
>  
>  /* now we change the pic irq level according to the piix irq mappings */
>  /* XXX: optimize */
> -pic_irq = piix4_dev->config[PIIX_PIRQCA + irq_num];
> +pic_irq = s->dev.config[PIIX_PIRQCA + irq_num];
>  if (pic_irq < 16) {
>  /* The pic level is the logical OR of all the PCI irqs mapped to it. 
> */
>  pic_level = 0;
>  for (i = 0; i < 4; i++) {
> -if (pic_irq == piix4_dev->config[PIIX_PIRQCA + i]) {
> +if (pic_irq == s->dev.config[PIIX_PIRQCA + i]) {
>  pic_level |= pci_bus_get_irq_level(bus, i);
>  }
>  }
> @@ -219,8 +217,6 @@ static void piix4_realize(PCIDevice *dev, Error **errp)
>  return;
>  }
>  isa_init_irq(ISA_DEVICE(&s->rtc), &s->rtc.irq, RTC_ISA_IRQ);
> -
> -piix4_dev = dev;
>  }
>  
>  static void piix4_init(Object *obj)
> diff --git a/include/hw/southbridge/piix.h b/include/hw/southbridge/piix.h
> index 6387f2b612..f63f83e5c6 100644
> --- a/include/hw/southbridge/piix.h
> +++ b/include/hw/southbridge/piix.h
> @@ -70,8 +70,6 @@ typedef struct PIIXState PIIX3State;
>  DECLARE_INSTANCE_CHECKER(PIIX3State, PIIX3_PCI_DEVICE,
>   TYPE_PIIX3_PCI_DEVICE)
>  
> -extern PCIDevice *piix4_dev;
> -
>  PIIX3State *piix3_create(PCIBus *pci_bus, ISABus **isa_bus);
>  
>  DeviceState *piix4_create(PCIBus *pci_bus, ISABus **isa_bus, I2CBus **smbus);
> -- 
> 2.35.1

Re: [PATCH v2 3/8] x86: Grant AMX permission for guest

2022-02-16 Thread Yang Zhong

On Wed, Feb 16, 2022 at 10:04:29PM -0800, Yang Zhong wrote:
> Kernel allocates 4K xstate buffer by default. For XSAVE features
> which require large state component (e.g. AMX), Linux kernel
> dynamically expands the xstate buffer only after the process has
> acquired the necessary permissions. Those are called dynamically-
> enabled XSAVE features (or dynamic xfeatures).
> 
> There are separate permissions for native tasks and guests.
> 
> Qemu should request the guest permissions for dynamic xfeatures
> which will be exposed to the guest. This only needs to be done
> once before the first vcpu is created.
> 
> KVM implemented one new ARCH_GET_XCOMP_SUPP system attribute API to
> get host side supported_xcr0 and Qemu can decide if it can request
> dynamically enabled XSAVE features permission.
> https://lore.kernel.org/all/20220126152210.3044876-1-pbonz...@redhat.com/
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Yang Zhong 
> Signed-off-by: Jing Liu 
> ---
>  target/i386/cpu.h |  7 +++
>  target/i386/cpu.c | 43 +++
>  target/i386/kvm/kvm-cpu.c | 12 +--
>  target/i386/kvm/kvm.c | 20 ++
>  4 files changed, 76 insertions(+), 6 deletions(-)
> 
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 06d2d6bccf..d4ad0f56bd 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -549,6 +549,13 @@ typedef enum X86Seg {
>  #define XSTATE_ZMM_Hi256_MASK   (1ULL << XSTATE_ZMM_Hi256_BIT)
>  #define XSTATE_Hi16_ZMM_MASK(1ULL << XSTATE_Hi16_ZMM_BIT)
>  #define XSTATE_PKRU_MASK(1ULL << XSTATE_PKRU_BIT)
> +#define XSTATE_XTILE_CFG_MASK   (1ULL << XSTATE_XTILE_CFG_BIT)
> +#define XSTATE_XTILE_DATA_MASK  (1ULL << XSTATE_XTILE_DATA_BIT)
> +#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK \
> + | XSTATE_XTILE_DATA_MASK)
> +
> +#define ARCH_GET_XCOMP_GUEST_PERM   0x1024
> +#define ARCH_REQ_XCOMP_GUEST_PERM   0x1025
>  
>  #define ESA_FEATURE_ALIGN64_BIT 1
>  
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index ea7e8f9081..377d993438 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -43,6 +43,8 @@
>  #include "disas/capstone.h"
>  #include "cpu-internal.h"
>  
> +#include 
> +
>  /* Helpers for building CPUID[2] descriptors: */
>  
>  struct CPUID2CacheDescriptorInfo {
> @@ -6000,12 +6002,47 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
> FeatureWord w)
>  }
>  }
>  
> +static void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
> +{
> +KVMState *s = kvm_state;
> +uint64_t bitmask;
> +long rc;
> +
> +if ((mask & XSTATE_XTILE_DATA_MASK) == XSTATE_XTILE_DATA_MASK) {
> +bitmask = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
> +if (!(bitmask & XSTATE_XTILE_DATA_MASK)) {

   Paolo, last time you suggested below changes for here:

   rc = kvm_arch_get_supported_cpuid(s, 0xd, 0,
  (xdata_bit < 32 ? R_EAX : R_EDX));
   if (!(rc & BIT(xdata_bit & 31))) {
  ...
   }   

  Since I use "mask" as the parameter here, I had to use R_EAX directly.
  Please review; if it needs to be changed to something like
  "(xdata_bit < 32 ? R_EAX : R_EDX)", I will do so in the next version,
  thanks!

  Yang


> +warn_report("no amx support from supported_xcr0, "
> +"bitmask:0x%lx", bitmask);
> +return;
> +}
> +
> +rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
> +  XSTATE_XTILE_DATA_BIT);
> +if (rc) {
> +/*
> + * The older kernel version(<5.15) can't support
> + * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
> + */
> +return;
> +}
> +
> +rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
> +if (rc) {
> +warn_report("prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
> +} else if (!(bitmask & XFEATURE_XTILE_MASK)) {
> +warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
> +"and bitmask=0x%lx", bitmask);
> +}
> +}
> +}
> +
>  /* Calculate XSAVE components based on the configured CPU feature flags */
>  static void x86_cpu_enable_xsave_components(X86CPU *cpu)
>  {
>  CPUX86State *env = &cpu->env;
>  int i;
>  uint64_t mask;
> +static bool request_perm;
>  
>  if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) {
>  env->features[FEAT_XSAVE_COMP_LO] = 0;
> @@ -6021,6 +6058,12 @@ static void x86_cpu_enable_xsave_components(X86CPU 
> *cpu)
>  }
>  }
>  
> +/* Only request permission for first vcpu */
> +if (kvm_enabled() && !request_perm) {
> +kvm_request_xsave_components(cpu, mask);
> +request_perm = true;
> +}
> +
>  env->features[FEAT_XSAVE_COMP_LO] = mask;
>  env->features[FEAT_XSAVE_COMP_HI] = mask

Re: [PATCH v5 01/18] configure, meson: override C compiler for cmake

2022-02-16 Thread Jag Raman



> On Jan 20, 2022, at 8:27 AM, Paolo Bonzini  wrote:
> 
> On 1/19/22 22:41, Jagannathan Raman wrote:
>> The compiler path that cmake gets from meson is corrupted. It results in
>> the following error:
>> | -- The C compiler identification is unknown
>> | CMake Error at CMakeLists.txt:35 (project):
>> | The CMAKE_C_COMPILER:
>> | /opt/rh/devtoolset-9/root/bin/cc;-m64;-mcx16
>> | is not a full path to an existing compiler tool.
>> Explicitly specify the C compiler for cmake to avoid this error
>> Signed-off-by: Jagannathan Raman 
>> Acked-by: Paolo Bonzini 
> 
> This should not be needed anymore, as the bug in Meson has been fixed.

Hi Paolo,

I’m able to see the bug with latest QEMU. The fix doesn’t appear to be
available with meson version 0.59.3, which is what QEMU is
presently using.

Thank you!
--
Jag

> 
> Paolo
> 
>>  configure | 2 ++
>>  1 file changed, 2 insertions(+)
>> diff --git a/configure b/configure
>> index e1a31fb332..6a865f8713 100755
>> --- a/configure
>> +++ b/configure
>> @@ -3747,6 +3747,8 @@ if test "$skip_meson" = no; then
>>echo "cpp_args = [$(meson_quote $CXXFLAGS $EXTRA_CXXFLAGS)]" >> $cross
>>echo "c_link_args = [$(meson_quote $CFLAGS $LDFLAGS $EXTRA_CFLAGS 
>> $EXTRA_LDFLAGS)]" >> $cross
>>echo "cpp_link_args = [$(meson_quote $CXXFLAGS $LDFLAGS $EXTRA_CXXFLAGS 
>> $EXTRA_LDFLAGS)]" >> $cross
>> +  echo "[cmake]" >> $cross
>> +  echo "CMAKE_C_COMPILER = [$(meson_quote $cc $CPU_CFLAGS)]" >> $cross
>>echo "[binaries]" >> $cross
>>echo "c = [$(meson_quote $cc $CPU_CFLAGS)]" >> $cross
>>test -n "$cxx" && echo "cpp = [$(meson_quote $cxx $CPU_CFLAGS)]" >> $cross
>

[PATCH v2 8/8] linux-header: Sync the linux headers

2022-02-16 Thread Yang Zhong

This patch will be dropped once QEMU syncs the Linux 5.17 headers.
All linux-headers changes are collected here only so that maintainers
can easily remove them once those patches are queued.

Signed-off-by: Yang Zhong 
---
 linux-headers/asm-x86/kvm.h | 17 +
 linux-headers/linux/kvm.h   |  4 
 2 files changed, 21 insertions(+)

diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index 5a776a08f7..17735430db 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -375,7 +375,21 @@ struct kvm_debugregs {
 
 /* for KVM_CAP_XSAVE */
 struct kvm_xsave {
+   /*
+* KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+* as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+* respectively, when invoked on the vm file descriptor.
+*
+* The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+* will always be at least 4096. Currently, it is only greater
+* than 4096 if a dynamic feature has been enabled with
+* ``arch_prctl()``, but this may change in the future.
+*
+* The offsets of the state save areas in struct kvm_xsave follow
+* the contents of CPUID leaf 0xD on the host.
+*/
__u32 region[1024];
+   __u32 extra[0];
 };
 
 #define KVM_MAX_XCRS   16
@@ -438,6 +452,9 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE0x0001
 
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP   0
+
 struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 02c5e7b7bb..54ce7e6d90 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1130,6 +1130,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_BINARY_STATS_FD 203
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_XSAVE2  208
+#define KVM_CAP_SYS_ATTRIBUTES 209
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1677,6 +1679,8 @@ struct kvm_xen_hvm_attr {
 #define KVM_GET_SREGS2 _IOR(KVMIO,  0xcc, struct kvm_sregs2)
 #define KVM_SET_SREGS2 _IOW(KVMIO,  0xcd, struct kvm_sregs2)
 
+#define KVM_GET_XSAVE2   _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 struct kvm_xen_vcpu_attr {
__u16 type;
__u16 pad[3];

[PATCH v2 7/8] x86: Support XFD and AMX xsave data migration

2022-02-16 Thread Yang Zhong

From: Zeng Guang 

XFD (eXtended Feature Disable) allows a feature to be
enabled in the xsave state while preventing specific
user threads from using that feature.

Save and restore the XFD MSRs if CPUID.D.1.EAX[4]
enumerates them as valid. Likewise, migrate the MSRs
and the related xsave state as needed.

Signed-off-by: Zeng Guang 
Signed-off-by: Wei Wang 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.h |  9 +
 target/i386/kvm/kvm.c | 18 ++
 target/i386/machine.c | 42 ++
 3 files changed, 69 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index de9da38e42..509c16323a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -505,6 +505,9 @@ typedef enum X86Seg {
 
 #define MSR_VM_HSAVE_PA 0xc0010117
 
+#define MSR_IA32_XFD0x01c4
+#define MSR_IA32_XFD_ERR0x01c5
+
 #define MSR_IA32_BNDCFGS0x0d90
 #define MSR_IA32_XSS0x0da0
 #define MSR_IA32_UMWAIT_CONTROL 0xe1
@@ -873,6 +876,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_1_EAX_AVX_VNNI  (1U << 4)
 /* AVX512 BFloat16 Instruction */
 #define CPUID_7_1_EAX_AVX512_BF16   (1U << 5)
+/* XFD Extend Feature Disabled */
+#define CPUID_D_1_EAX_XFD   (1U << 4)
 
 /* Packets which contain IP payload have LIP values */
 #define CPUID_14_0_ECX_LIP  (1U << 31)
@@ -1617,6 +1622,10 @@ typedef struct CPUX86State {
 uint64_t msr_rtit_cr3_match;
 uint64_t msr_rtit_addrs[MAX_RTIT_ADDRS];
 
+/* Per-VCPU XFD MSRs */
+uint64_t msr_xfd;
+uint64_t msr_xfd_err;
+
 /* exception/interrupt handling */
 int error_code;
 int exception_is_int;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index ff064e3d8f..3dd24b6b0e 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -3275,6 +3275,13 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
   env->msr_ia32_sgxlepubkeyhash[3]);
 }
 
+if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
+kvm_msr_entry_add(cpu, MSR_IA32_XFD,
+  env->msr_xfd);
+kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
+  env->msr_xfd_err);
+}
+
 /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
  *   kvm_put_msr_feature_control. */
 }
@@ -3667,6 +3674,11 @@ static int kvm_get_msrs(X86CPU *cpu)
 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
 }
 
+if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
+kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
+kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
+}
+
 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
 if (ret < 0) {
 return ret;
@@ -3963,6 +3975,12 @@ static int kvm_get_msrs(X86CPU *cpu)
 env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
msrs[i].data;
 break;
+case MSR_IA32_XFD:
+env->msr_xfd = msrs[i].data;
+break;
+case MSR_IA32_XFD_ERR:
+env->msr_xfd_err = msrs[i].data;
+break;
 }
 }
 
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 6202f47793..1f9d0c46f1 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1483,6 +1483,46 @@ static const VMStateDescription vmstate_pdptrs = {
 }
 };
 
+static bool xfd_msrs_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = >env;
+
+return !!(env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD);
+}
+
+static const VMStateDescription vmstate_msr_xfd = {
+.name = "cpu/msr_xfd",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = xfd_msrs_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(env.msr_xfd, X86CPU),
+VMSTATE_UINT64(env.msr_xfd_err, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static bool amx_xtile_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = >env;
+
+return !!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE);
+}
+
+static const VMStateDescription vmstate_amx_xtile = {
+.name = "cpu/intel_amx_xtile",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = amx_xtile_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINT8_ARRAY(env.xtilecfg, X86CPU, 64),
+VMSTATE_UINT8_ARRAY(env.xtiledata, X86CPU, 8192),
+VMSTATE_END_OF_LIST()
+}
+};
+
 const VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
 .version_id = 12,
@@ -1622,6 +1662,8 @@ const VMStateDescription vmstate_x86_cpu = {
 _msr_tsx_ctrl,
 _msr_intel_sgx,
 _pdptrs,
+_msr_xfd,
+_amx_xtile,
 NULL
 }
 };

[PATCH v2 6/8] x86: add support for KVM_CAP_XSAVE2 and AMX state migration

2022-02-16 Thread Yang Zhong

From: Jing Liu 

When dynamic xfeatures (e.g. AMX) are used by the guest, the xsave
area would be larger than 4KB. KVM_GET_XSAVE2 and KVM_SET_XSAVE
under KVM_CAP_XSAVE2 work with an xsave buffer larger than 4KB.
Always use the new ioctls under KVM_CAP_XSAVE2 when KVM supports it.

Signed-off-by: Jing Liu 
Signed-off-by: Zeng Guang 
Signed-off-by: Wei Wang 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.h  |  4 
 target/i386/kvm/kvm.c  | 42 --
 target/i386/xsave_helper.c | 33 ++
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index f7fc2e97a6..de9da38e42 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1528,6 +1528,10 @@ typedef struct CPUX86State {
 uint64_t opmask_regs[NB_OPMASK_REGS];
 YMMReg zmmh_regs[CPU_NB_REGS];
 ZMMReg hi16_zmm_regs[CPU_NB_REGS];
+#ifdef TARGET_X86_64
+uint8_t xtilecfg[64];
+uint8_t xtiledata[8192];
+#endif
 
 /* sysenter registers */
 uint32_t sysenter_cs;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 8562d3d138..ff064e3d8f 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -122,6 +122,7 @@ static uint32_t num_architectural_pmu_gp_counters;
 static uint32_t num_architectural_pmu_fixed_counters;
 
 static int has_xsave;
+static int has_xsave2;
 static int has_xcrs;
 static int has_pit_state2;
 static int has_sregs2;
@@ -1585,6 +1586,26 @@ static Error *invtsc_mig_blocker;
 
 #define KVM_MAX_CPUID_ENTRIES  100
 
+static void kvm_init_xsave(CPUX86State *env)
+{
+if (has_xsave2) {
+env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
+} else if (has_xsave) {
+env->xsave_buf_len = sizeof(struct kvm_xsave);
+} else {
+return;
+}
+
+env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
+memset(env->xsave_buf, 0, env->xsave_buf_len);
+ /*
+  * The allocated storage must be large enough for all of the
+  * possible XSAVE state components.
+  */
+assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
+   env->xsave_buf_len);
+}
+
 int kvm_arch_init_vcpu(CPUState *cs)
 {
 struct {
@@ -1614,6 +1635,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
 cpuid_i = 0;
 
+has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
+
 r = kvm_arch_set_tsc_khz(cs);
 if (r < 0) {
 return r;
@@ -2003,19 +2026,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 if (r) {
 goto fail;
 }
-
-if (has_xsave) {
-env->xsave_buf_len = sizeof(struct kvm_xsave);
-env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
-memset(env->xsave_buf, 0, env->xsave_buf_len);
-
-/*
- * The allocated storage must be large enough for all of the
- * possible XSAVE state components.
- */
-assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX)
-   <= env->xsave_buf_len);
-}
+kvm_init_xsave(env);
 
 max_nested_state_len = kvm_max_nested_state_length();
 if (max_nested_state_len > 0) {
@@ -3319,13 +3330,14 @@ static int kvm_get_xsave(X86CPU *cpu)
 {
 CPUX86State *env = >env;
 void *xsave = env->xsave_buf;
-int ret;
+int type, ret;
 
 if (!has_xsave) {
 return kvm_get_fpu(cpu);
 }
 
-ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
+type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
+ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
 if (ret < 0) {
 return ret;
 }
diff --git a/target/i386/xsave_helper.c b/target/i386/xsave_helper.c
index ac61a96344..b6a004505f 100644
--- a/target/i386/xsave_helper.c
+++ b/target/i386/xsave_helper.c
@@ -5,6 +5,7 @@
 #include "qemu/osdep.h"
 
 #include "cpu.h"
+#include 
 
 void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, uint32_t buflen)
 {
@@ -126,6 +127,22 @@ void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, 
uint32_t buflen)
 
 memcpy(pkru, >pkru, sizeof(env->pkru));
 }
+
+e = _ext_save_areas[XSTATE_XTILE_CFG_BIT];
+if (e->size && e->offset) {
+XSaveXTILECFG *tilecfg = buf + e->offset;
+
+memcpy(tilecfg, >xtilecfg, sizeof(env->xtilecfg));
+}
+
+if (buflen > sizeof(struct kvm_xsave)) {
+e = _ext_save_areas[XSTATE_XTILE_DATA_BIT];
+if (e->size && e->offset && buflen >= e->size + e->offset) {
+XSaveXTILEDATA *tiledata = buf + e->offset;
+
+memcpy(tiledata, >xtiledata, sizeof(env->xtiledata));
+}
+}
 #endif
 }
 
@@ -247,5 +264,21 @@ void x86_cpu_xrstor_all_areas(X86CPU *cpu, const void 
*buf, uint32_t buflen)
 pkru = buf + e->offset;
 memcpy(>pkru, pkru, sizeof(env->pkru));
 }
+
+e = _ext_save_areas[XSTATE_XTILE_CFG_BIT];
+if (e->size && e->offset) {
+const XSaveXTILECFG *tilecfg = buf + e->offset;
+
+memcpy(>xtilecfg, tilecfg, sizeof(env->xtilecfg));
+}
+
+if

[PATCH v2 2/8] x86: Add AMX XTILECFG and XTILEDATA components

2022-02-16 Thread Yang Zhong

From: Jing Liu 

The AMX TILECFG register and the TMMx tile data registers are
saved/restored via XSAVE, respectively in state component 17
(64 bytes) and state component 18 (8192 bytes).

Add AMX feature bits to x86_ext_save_areas array to set
up AMX components. Add structs that define the layout of
AMX XSAVE areas and use QEMU_BUILD_BUG_ON to validate the
structs sizes.

Signed-off-by: Jing Liu 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.h | 18 +-
 target/i386/cpu.c |  8 
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index de1dc124ab..06d2d6bccf 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -537,6 +537,8 @@ typedef enum X86Seg {
 #define XSTATE_ZMM_Hi256_BIT6
 #define XSTATE_Hi16_ZMM_BIT 7
 #define XSTATE_PKRU_BIT 9
+#define XSTATE_XTILE_CFG_BIT17
+#define XSTATE_XTILE_DATA_BIT   18
 
 #define XSTATE_FP_MASK  (1ULL << XSTATE_FP_BIT)
 #define XSTATE_SSE_MASK (1ULL << XSTATE_SSE_BIT)
@@ -845,6 +847,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EDX_TSX_LDTRK (1U << 16)
 /* AVX512_FP16 instruction */
 #define CPUID_7_0_EDX_AVX512_FP16   (1U << 23)
+/* AMX tile (two-dimensional register) */
+#define CPUID_7_0_EDX_AMX_TILE  (1U << 24)
 /* Speculation Control */
 #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26)
 /* Single Thread Indirect Branch Predictors */
@@ -1348,6 +1352,16 @@ typedef struct XSavePKRU {
 uint32_t padding;
 } XSavePKRU;
 
+/* Ext. save area 17: AMX XTILECFG state */
+typedef struct XSaveXTILECFG {
+uint8_t xtilecfg[64];
+} XSaveXTILECFG;
+
+/* Ext. save area 18: AMX XTILEDATA state */
+typedef struct XSaveXTILEDATA {
+uint8_t xtiledata[8][1024];
+} XSaveXTILEDATA;
+
 QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100);
 QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40);
 QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40);
@@ -1355,6 +1369,8 @@ QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40);
 QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200);
 QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400);
 QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8);
+QEMU_BUILD_BUG_ON(sizeof(XSaveXTILECFG) != 0x40);
+QEMU_BUILD_BUG_ON(sizeof(XSaveXTILEDATA) != 0x2000);
 
 typedef struct ExtSaveArea {
 uint32_t feature, bits;
@@ -1362,7 +1378,7 @@ typedef struct ExtSaveArea {
 uint32_t ecx;
 } ExtSaveArea;
 
-#define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1)
+#define XSAVE_STATE_AREA_COUNT (XSTATE_XTILE_DATA_BIT + 1)
 
 extern ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT];
 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 37f06b0b1a..ea7e8f9081 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1401,6 +1401,14 @@ ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = 
{
 [XSTATE_PKRU_BIT] =
   { .feature = FEAT_7_0_ECX, .bits = CPUID_7_0_ECX_PKU,
 .size = sizeof(XSavePKRU) },
+[XSTATE_XTILE_CFG_BIT] = {
+.feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE,
+.size = sizeof(XSaveXTILECFG),
+},
+[XSTATE_XTILE_DATA_BIT] = {
+.feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE,
+.size = sizeof(XSaveXTILEDATA)
+},
 };
 
 static uint32_t xsave_area_size(uint64_t mask)

[PATCH v2 5/8] x86: Add AMX CPUIDs enumeration

2022-02-16 Thread Yang Zhong

From: Jing Liu 

Add AMX primary feature bits XFD and AMX_TILE to
enumerate the CPU's AMX capability. Meanwhile, add
AMX TILE and TMUL CPUID leaf and subleaves which
exist when AMX TILE is present to provide the maximum
capability of TILE and TMUL.

Signed-off-by: Jing Liu 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.c | 55 ---
 target/i386/kvm/kvm.c |  4 +++-
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 5a7ee8c7e1..2465bed5df 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -576,6 +576,18 @@ static CPUCacheInfo legacy_l3_cache = {
 #define INTEL_PT_CYCLE_BITMAP0x1fff /* Support 0,2^(0~11) */
 #define INTEL_PT_PSB_BITMAP  (0x003f << 16) /* Support 
2K,4K,8K,16K,32K,64K */
 
+/* CPUID Leaf 0x1D constants: */
+#define INTEL_AMX_TILE_MAX_SUBLEAF 0x1
+#define INTEL_AMX_TOTAL_TILE_BYTES 0x2000
+#define INTEL_AMX_BYTES_PER_TILE   0x400
+#define INTEL_AMX_BYTES_PER_ROW0x40
+#define INTEL_AMX_TILE_MAX_NAMES   0x8
+#define INTEL_AMX_TILE_MAX_ROWS0x10
+
+/* CPUID Leaf 0x1E constants: */
+#define INTEL_AMX_TMUL_MAX_K   0x10
+#define INTEL_AMX_TMUL_MAX_N   0x40
+
 void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   uint32_t vendor2, uint32_t vendor3)
 {
@@ -845,8 +857,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 "avx512-vp2intersect", NULL, "md-clear", NULL,
 NULL, NULL, "serialize", NULL,
 "tsx-ldtrk", NULL, NULL /* pconfig */, NULL,
-NULL, NULL, NULL, "avx512-fp16",
-NULL, NULL, "spec-ctrl", "stibp",
+NULL, NULL, "amx-bf16", "avx512-fp16",
+"amx-tile", "amx-int8", "spec-ctrl", "stibp",
 NULL, "arch-capabilities", "core-capability", "ssbd",
 },
 .cpuid = {
@@ -911,7 +923,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 .type = CPUID_FEATURE_WORD,
 .feat_names = {
 "xsaveopt", "xsavec", "xgetbv1", "xsaves",
-NULL, NULL, NULL, NULL,
+"xfd", NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
@@ -5587,6 +5599,43 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 }
 break;
 }
+case 0x1D: {
+/* AMX TILE */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) {
+break;
+}
+
+if (count == 0) {
+/* Highest numbered palette subleaf */
+*eax = INTEL_AMX_TILE_MAX_SUBLEAF;
+} else if (count == 1) {
+*eax = INTEL_AMX_TOTAL_TILE_BYTES |
+   (INTEL_AMX_BYTES_PER_TILE << 16);
+*ebx = INTEL_AMX_BYTES_PER_ROW | (INTEL_AMX_TILE_MAX_NAMES << 16);
+*ecx = INTEL_AMX_TILE_MAX_ROWS;
+}
+break;
+}
+case 0x1E: {
+/* AMX TMUL */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) {
+break;
+}
+
+if (count == 0) {
+/* TMUL limits: maximum K in bits 7:0, maximum N from bit 8 up */
+*ebx = INTEL_AMX_TMUL_MAX_K | (INTEL_AMX_TMUL_MAX_N << 8);
+}
+break;
+}
 case 0x4000:
 /*
  * CPUID code in kvm_arch_init_vcpu() ignores stuff
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 3bdcd724c4..8562d3d138 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -1779,7 +1779,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
 c = _data.entries[cpuid_i++];
 }
 break;
-case 0x14: {
+case 0x14:
+case 0x1d:
+case 0x1e: {
 uint32_t times;
 
 c->function = i;

[PATCH v2 4/8] x86: Add XFD faulting bit for state components

2022-02-16 Thread Yang Zhong

From: Jing Liu 

Intel introduces the XFD faulting mechanism for extended
XSAVE features so that the features can be dynamically
enabled at runtime. If CPUID (EAX=0Dh, ECX=n, n>1).ECX[2]
is set to 1, it indicates support for XFD faulting of this
state component.

Signed-off-by: Jing Liu 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.h | 2 ++
 target/i386/cpu.c | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index d4ad0f56bd..f7fc2e97a6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -558,8 +558,10 @@ typedef enum X86Seg {
 #define ARCH_REQ_XCOMP_GUEST_PERM   0x1025
 
 #define ESA_FEATURE_ALIGN64_BIT 1
+#define ESA_FEATURE_XFD_BIT 2
 
 #define ESA_FEATURE_ALIGN64_MASK(1U << ESA_FEATURE_ALIGN64_BIT)
+#define ESA_FEATURE_XFD_MASK(1U << ESA_FEATURE_XFD_BIT)
 
 
 /* CPUID feature words */
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 377d993438..5a7ee8c7e1 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5497,7 +5497,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 const ExtSaveArea *esa = _ext_save_areas[count];
 *eax = esa->size;
 *ebx = esa->offset;
-*ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK;
+*ecx = (esa->ecx & ESA_FEATURE_ALIGN64_MASK) |
+   (esa->ecx & ESA_FEATURE_XFD_MASK);
 }
 }
 break;

[PATCH v2 1/8] x86: Fix the 64-byte boundary enumeration for extended state

2022-02-16 Thread Yang Zhong

From: Jing Liu 

The extended state subleaves (EAX=0Dh, ECX=n, n>1).ECX[1]
indicate whether the extended state component is located
on the next 64-byte boundary following the preceding state
component when the compacted format of an XSAVE area is
used.

Right now, they are all zero because no supported component
needed the bit to be set, but the upcoming AMX feature will
use it.  Fix the subleaves value according to KVM's supported
cpuid.

Signed-off-by: Jing Liu 
Signed-off-by: Yang Zhong 
---
 target/i386/cpu.h | 6 ++
 target/i386/cpu.c | 1 +
 target/i386/kvm/kvm-cpu.c | 1 +
 3 files changed, 8 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 9911d7c871..de1dc124ab 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -548,6 +548,11 @@ typedef enum X86Seg {
 #define XSTATE_Hi16_ZMM_MASK(1ULL << XSTATE_Hi16_ZMM_BIT)
 #define XSTATE_PKRU_MASK(1ULL << XSTATE_PKRU_BIT)
 
+#define ESA_FEATURE_ALIGN64_BIT 1
+
+#define ESA_FEATURE_ALIGN64_MASK(1U << ESA_FEATURE_ALIGN64_BIT)
+
+
 /* CPUID feature words */
 typedef enum FeatureWord {
 FEAT_1_EDX, /* CPUID[1].EDX */
@@ -1354,6 +1359,7 @@ QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8);
 typedef struct ExtSaveArea {
 uint32_t feature, bits;
 uint32_t offset, size;
+uint32_t ecx;
 } ExtSaveArea;
 
 #define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index aa9e636800..37f06b0b1a 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5487,6 +5487,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 const ExtSaveArea *esa = _ext_save_areas[count];
 *eax = esa->size;
 *ebx = esa->offset;
+*ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK;
 }
 }
 break;
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index d95028018e..ce27d3b1df 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -104,6 +104,7 @@ static void kvm_cpu_xsave_init(void)
 if (sz != 0) {
 assert(esa->size == sz);
 esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX);
+esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX);
 }
 }
 }

[PATCH v2 0/8] AMX support in Qemu

2022-02-16 Thread Yang Zhong

Intel introduces Advanced Matrix Extensions (AMX) [1] feature that
consists of configurable two-dimensional "TILE" registers and new
accelerator instructions that operate on them. TMUL (Tile matrix
MULtiply) is the first accelerator instruction set to use the new
registers.

Since AMX KVM patches have been merged into Linux release, this series
is based on latest Linux release(5.17-rc4).

According to the KVM design, the userspace VMM (e.g. Qemu) is expected
to request guest permission for the dynamically-enabled XSAVE features
only once when the first vCPU is created, and KVM checks guest permission
in KVM_SET_CPUID2.

Intel AMX is XSAVE supported and XSAVE enabled. Those extended features
have large state while the current kvm_xsave only allows 4KB. The AMX KVM has
extended struct kvm_xsave to meet this requirement and added one extra
KVM_GET_XSAVE2 ioctl to handle extended features. From our test, the AMX
live migration works well.

Notice: This version still includes some definitions in the linux-headers.
Once Qemu syncs those linux-headers, I will remove those definitions. So
please ignore those changes.

[1] Intel Architecture Instruction Set Extension Programming Reference
https://software.intel.com/content/dam/develop/external/us/en/documents/\
architecture-instruction-set-extensions-programming-reference.pdf

Thanks,
Yang


Change history
--
v1->v2:
   - Patch 1 moved "esa->ecx" into the "if{}"(Paolo).
   - Patch 3, the requirements from Paolo,
 - Moved "esa->ecx" into the "if{}".
 - Used the "mask" as parameter to replace xtiledata bits in
   kvm_request_xsave_components()
 - Used the new defined KVM_X86_XCOMP_GUEST_SUPP from KVM to get
   supported_xcr0 from kvm_arch_get_supported_cpuid().
 - Updated the kvm_request_xsave_components() for future usage.
   - Patch 5 added "case 0x1e:" in kvm_arch_init_vcpu()(Paolo).
   - Patch 6 replaced "if (e->size && e->offset)" with 
 "if (e->size && e->offset && buflen >= e->size + e->offset)"
 for xsave and xrstor(Paolo).
   - Patch 8, which is a newly added patch and is only for linux-headers.
 This patch can be directly dropped once Qemu syncs linux-headers.

rfc v1->v1:
   - Patch 1 changed commit message(Kevin and Paolo).
   - Patch 2 changed commit message(Kevin and Paolo).
   - Patch 3, below requirements from Paolo,
 - Called ARCH_REQ_XCOMP_GUEST_PERM from x86_cpu_enable_xsave_components.
   Used kvm_request_xsave_components() to replace x86_xsave_req_perm().
   Replaced syscall(ARCH_GET_XCOMP_GUEST_PERM) with 
kvm_arch_get_supported_cpuid()
   in kvm_request_xsave_components().
 - Changed kvm_cpu_xsave_init() to use host_cpuid() instead of
   kvm_arch_get_supported_cpuid().
 - Added the "function == 0xd" handle in kvm_arch_get_supported_cpuid().
   - Patch 4, used "uint32_t ecx" to replace "uint32_t need_align, support_xfd".
   - Patch 6, below changes,
 - Changed the commit message(Kevin) and Used the new function
 - kvm_init_xsave() to replace some pieces of code(Wei).
 - Moved KVM_CAP_XSAVE2 extension check to kvm_arch_init_vcpu() to
   make the request permission before KVM_CAP_XSAVE2 extension check(Paolo).
   - Removed RFC prefix.

Jing Liu (5):
  x86: Fix the 64-byte boundary enumeration for extended state
  x86: Add AMX XTILECFG and XTILEDATA components
  x86: Add XFD faulting bit for state components
  x86: Add AMX CPUIDs enumeration
  x86: add support for KVM_CAP_XSAVE2 and AMX state migration

Yang Zhong (2):
  x86: Grant AMX permission for guest
  linux-header: Sync the linux headers

Zeng Guang (1):
  x86: Support XFD and AMX xsave data migration

 linux-headers/asm-x86/kvm.h |  17 ++
 linux-headers/linux/kvm.h   |   4 ++
 target/i386/cpu.h   |  46 ++-
 target/i386/cpu.c   | 108 +++-
 target/i386/kvm/kvm-cpu.c   |  11 ++--
 target/i386/kvm/kvm.c   |  84 ++--
 target/i386/machine.c   |  42 ++
 target/i386/xsave_helper.c  |  33 +++
 8 files changed, 320 insertions(+), 25 deletions(-)

[PATCH v2 3/8] x86: Grant AMX permission for guest

2022-02-16 Thread Yang Zhong

Kernel allocates 4K xstate buffer by default. For XSAVE features
which require large state component (e.g. AMX), Linux kernel
dynamically expands the xstate buffer only after the process has
acquired the necessary permissions. Those are called dynamically-
enabled XSAVE features (or dynamic xfeatures).

There are separate permissions for native tasks and guests.

Qemu should request the guest permissions for dynamic xfeatures
which will be exposed to the guest. This only needs to be done
once before the first vcpu is created.

KVM implemented one new ARCH_GET_XCOMP_SUPP system attribute API to
get host side supported_xcr0 and Qemu can decide if it can request
dynamically enabled XSAVE features permission.
https://lore.kernel.org/all/20220126152210.3044876-1-pbonz...@redhat.com/

Suggested-by: Paolo Bonzini 
Signed-off-by: Yang Zhong 
Signed-off-by: Jing Liu 
---
 target/i386/cpu.h |  7 +++
 target/i386/cpu.c | 43 +++
 target/i386/kvm/kvm-cpu.c | 12 +--
 target/i386/kvm/kvm.c | 20 ++
 4 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 06d2d6bccf..d4ad0f56bd 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -549,6 +549,13 @@ typedef enum X86Seg {
 #define XSTATE_ZMM_Hi256_MASK   (1ULL << XSTATE_ZMM_Hi256_BIT)
 #define XSTATE_Hi16_ZMM_MASK(1ULL << XSTATE_Hi16_ZMM_BIT)
 #define XSTATE_PKRU_MASK(1ULL << XSTATE_PKRU_BIT)
+#define XSTATE_XTILE_CFG_MASK   (1ULL << XSTATE_XTILE_CFG_BIT)
+#define XSTATE_XTILE_DATA_MASK  (1ULL << XSTATE_XTILE_DATA_BIT)
+#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK \
+ | XSTATE_XTILE_DATA_MASK)
+
+#define ARCH_GET_XCOMP_GUEST_PERM   0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM   0x1025
 
 #define ESA_FEATURE_ALIGN64_BIT 1
 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index ea7e8f9081..377d993438 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -43,6 +43,8 @@
 #include "disas/capstone.h"
 #include "cpu-internal.h"
 
+#include 
+
 /* Helpers for building CPUID[2] descriptors: */
 
 struct CPUID2CacheDescriptorInfo {
@@ -6000,12 +6002,47 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
FeatureWord w)
 }
 }
 
+static void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
+{
+KVMState *s = kvm_state;
+uint64_t bitmask;
+long rc;
+
+if ((mask & XSTATE_XTILE_DATA_MASK) == XSTATE_XTILE_DATA_MASK) {
+bitmask = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
+if (!(bitmask & XSTATE_XTILE_DATA_MASK)) {
+warn_report("no amx support from supported_xcr0, "
+"bitmask:0x%lx", bitmask);
+return;
+}
+
+rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
+  XSTATE_XTILE_DATA_BIT);
+if (rc) {
+/*
+ * The older kernel version(<5.15) can't support
+ * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
+ */
+return;
+}
+
+rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, );
+if (rc) {
+warn_report("prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
+} else if (!(bitmask & XFEATURE_XTILE_MASK)) {
+warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
+"and bitmask=0x%lx", bitmask);
+}
+}
+}
+
 /* Calculate XSAVE components based on the configured CPU feature flags */
 static void x86_cpu_enable_xsave_components(X86CPU *cpu)
 {
 CPUX86State *env = >env;
 int i;
 uint64_t mask;
+static bool request_perm;
 
 if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) {
 env->features[FEAT_XSAVE_COMP_LO] = 0;
@@ -6021,6 +6058,12 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
 }
 }
 
+/* Only request permission for first vcpu */
+if (kvm_enabled() && !request_perm) {
+kvm_request_xsave_components(cpu, mask);
+request_perm = true;
+}
+
 env->features[FEAT_XSAVE_COMP_LO] = mask;
 env->features[FEAT_XSAVE_COMP_HI] = mask >> 32;
 }
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index ce27d3b1df..a35a1bf9fe 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu)
 static void kvm_cpu_xsave_init(void)
 {
 static bool first = true;
-KVMState *s = kvm_state;
+uint32_t eax, ebx, ecx, edx;
 int i;
 
 if (!first) {
@@ -100,11 +100,11 @@ static void kvm_cpu_xsave_init(void)
 ExtSaveArea *esa = _ext_save_areas[i];
 
 if (esa->size) {
-int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX);
-if (sz != 0) {
-assert(esa->size == sz);
-esa->offset =

Re: [PATCH 28/31] vdpa: Expose VHOST_F_LOG_ALL on SVQ

2022-02-16 Thread Jason Wang

On Wed, Feb 16, 2022 at 11:54 PM Eugenio Perez Martin
 wrote:
>
> On Tue, Feb 8, 2022 at 9:25 AM Jason Wang  wrote:
> >
> >
> > 在 2022/2/1 下午7:45, Eugenio Perez Martin 写道:
> > > On Sun, Jan 30, 2022 at 7:50 AM Jason Wang  wrote:
> > >>
> > >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > >>> SVQ is able to log the dirty bits by itself, so let's use it to not
> > >>> block migration.
> > >>>
> > >>> Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
> > >>> enabled. Even if the device supports it, the reports would be nonsense
> > >>> because SVQ memory is in the qemu region.
> > >>>
> > >>> The log region is still allocated. Future changes might skip that, but
> > >>> this series is already long enough.
> > >>>
> > >>> Signed-off-by: Eugenio Pérez 
> > >>> ---
> > >>>hw/virtio/vhost-vdpa.c | 20 
> > >>>1 file changed, 20 insertions(+)
> > >>>
> > >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > >>> index fb0a338baa..75090d65e8 100644
> > >>> --- a/hw/virtio/vhost-vdpa.c
> > >>> +++ b/hw/virtio/vhost-vdpa.c
> > >>> @@ -1022,6 +1022,9 @@ static int vhost_vdpa_get_features(struct 
> > >>> vhost_dev *dev, uint64_t *features)
> > >>>if (ret == 0 && v->shadow_vqs_enabled) {
> > >>>/* Filter only features that SVQ can offer to guest */
> > >>>vhost_svq_valid_guest_features(features);
> > >>> +
> > >>> +/* Add SVQ logging capabilities */
> > >>> +*features |= BIT_ULL(VHOST_F_LOG_ALL);
> > >>>}
> > >>>
> > >>>return ret;
> > >>> @@ -1039,8 +1042,25 @@ static int vhost_vdpa_set_features(struct 
> > >>> vhost_dev *dev,
> > >>>
> > >>>if (v->shadow_vqs_enabled) {
> > >>>uint64_t dev_features, svq_features, acked_features;
> > >>> +uint8_t status = 0;
> > >>>bool ok;
> > >>>
> > >>> +ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, );
> > >>> +if (unlikely(ret)) {
> > >>> +return ret;
> > >>> +}
> > >>> +
> > >>> +if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
> > >>> +/*
> > >>> + * vhost is trying to enable or disable _F_LOG, and the 
> > >>> device
> > >>> + * would report wrong dirty pages. SVQ handles it.
> > >>> + */
> > >>
> > >> I fail to understand this comment, I'd think there's no way to disable
> > >> dirty page tracking for SVQ.
> > >>
> > > vhost_log_global_{start,stop} are called at the beginning and end of
> > > migration. To inform the device that it should start logging, they set
> > > or clean VHOST_F_LOG_ALL at vhost_dev_set_log.
> >
> >
> > Yes, but for SVQ, we can't disable dirty page tracking, isn't it? The
> > only thing is to ignore or filter out the F_LOG_ALL and pretend to be
> > enabled and disabled.
> >
>
> Yes, that's what this patch does.
>
> >
> > >
> > > While SVQ does not use VHOST_F_LOG_ALL, it exports the feature bit so
> > > vhost does not block migration. Maybe we need to look for another way
> > > to do this?
> >
> >
> > I'm fine with filtering since it's much more simpler, but I fail to
> > understand why we need to check DRIVER_OK.
> >
>
> Ok maybe I can make that part more clear,
>
> Since both operations use vhost_vdpa_set_features we must just filter
> the one that actually sets or removes VHOST_F_LOG_ALL, without
> affecting other features.
>
> In practice, that means to not forward the set features after
> DRIVER_OK. The device is not expecting them anymore.

I wonder what happens if we don't do this.

So kernel had this check:

/*
 * It's not allowed to change the features after they have
 * been negotiated.
 */
if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
return -EBUSY;

So is it FEATURES_OK actually?

For this patch, I wonder if the thing we need to do is to see whether
it is an enable/disable of F_LOG_ALL and simply return.

Thanks

>
> Does that make more sense?
>
> Thanks!
>
> > Thanks
> >
> >
> > >
> > > Thanks!
> > >
> > >> Thanks
> > >>
> > >>
> > >>> +return 0;
> > >>> +}
> > >>> +
> > >>> +/* We must not ack _F_LOG if SVQ is enabled */
> > >>> +features &= ~BIT_ULL(VHOST_F_LOG_ALL);
> > >>> +
> > >>>ret = vhost_vdpa_get_dev_features(dev, _features);
> > >>>if (ret != 0) {
> > >>>error_report("Can't get vdpa device features, got (%d)", 
> > >>> ret);
> >
>

Re: [PATCH v4 2/2] target/riscv: Enable Zicbo[m,z,p] instructions

2022-02-16 Thread Christoph Müllner

On Thu, Feb 17, 2022 at 3:15 AM Weiwei Li  wrote:

>
> 在 2022/2/16 下午11:48, Christoph Muellner 写道:
> > diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> > index 39ffb883fc..04500fe352 100644
> > --- a/target/riscv/cpu.c
> > +++ b/target/riscv/cpu.c
> > @@ -764,6 +764,10 @@ static Property riscv_cpu_properties[] = {
> >   DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters, true),
> >   DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
> >   DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
> > +DEFINE_PROP_BOOL("zicbom", RISCVCPU, cfg.ext_icbom, true),
> > +DEFINE_PROP_BOOL("zicboz", RISCVCPU, cfg.ext_icboz, true),
> > +DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize,
> 64),
> > +DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize,
> 64),
> Why use two different cache block size here? Is there any new spec
> update for this?
>

No, we are talking about the same specification.

Section 2.7 states the following:
"""
The initial set of CMO extensions requires the following information to be
discovered by software:
* The size of the cache block for management and prefetch instructions
* The size of the cache block for zero instructions
* CBIE support at each privilege level
"""

So at least the spec authors did differentiate between the two block sizes
as well.


> >   DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
> >   DEFINE_PROP_BOOL("Zfhmin", RISCVCPU, cfg.ext_zfhmin, false),
> >   DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
> > +
> > +/* helper_zicbom_access
> > + *
> > + * Check access permissions (LOAD, STORE or FETCH as specified in
> section
> > + * 2.5.2 of the CMO specification) for Zicbom, raising either store
> > + * page-fault (non-virtualised) or store guest-page fault (virtualised).
> > + */
> > +static void helper_zicbom_access(CPURISCVState *env, target_ulong
> address,
> > + uintptr_t ra)
> > +{
> > +int ret;
> > +void* phost;
> > +int mmu_idx = cpu_mmu_index(env, false);
> > +
> > +/* Get the size of the cache block for management instructions. */
> > +RISCVCPU *cpu = env_archcpu(env);
> > +uint16_t cbomlen = cpu->cfg.cbom_blocksize;
> > +
> > +/* Mask off low-bits to align-down to the cache-block. */
> > +address &= ~(cbomlen - 1);
> > +
> > +/* A cache-block management instruction is permitted to access
> > + * the specified cache block whenever a load instruction, store
> > + * instruction, or instruction fetch is permitted to access the
> > + * corresponding physical addresses.
> > + */
> > +ret = probe_access_range_flags(env, address, cbomlen, MMU_DATA_LOAD,
> > +   mmu_idx, true, , ra);
> > +if (ret == TLB_INVALID_MASK)
> > +ret = probe_access_range_flags(env, address, cbomlen,
> MMU_INST_FETCH,
> > +   mmu_idx, true, , ra);
> > +if (ret == TLB_INVALID_MASK)
> > +probe_access_range_flags(env, address, cbomlen, MMU_DATA_STORE,
> > + mmu_idx, false, , ra);
> > +}
> > +
>
>
> I think it's a little different here. Probe_access_range_flags may
> trigger different exceptions for different access_type. For example:
>
> If  the page for the address  is executable and readable but not
> writable,  and the access cannot pass the pmp check for all access_type,
>
> it may trigger access fault for load/fetch access, and  trigger page
> fault for  store access.
>

Just to be clear:
The patch does not trigger any fault for LOAD or FETCH because nonfault is
set
to true (6th argument of probe_access_range_flags()).
Only the last call to probe_access_range_flags() raises an exception.

Section 2.5.2 states the following:
"""
If access to the cache block is not permitted, a cache-block management
instruction raises a store page fault or store guest-page fault exception
if address translation does not permit any
access or raises a store access fault exception otherwise.
"""

In your scenario we have (1...allowed; 0...not allowed):
* read: perm:1, pmp:0
* fetch: perm:1: pmp:0
* write: perm:0, pmp:0

Address translation would allow read and fetch access, but PMP blocks that.
So the "does not permit any"-part is wrong, therefore we should raise a
store page fault.

In fact, I can't predict what will happen, because the code in
target/riscv/cpu_helper.c does
not really prioritize page faults or PMP faults. It returns one of them,
once they are encountered.

In order to model this properly, we would have to refactor cpu_helper.c to
separate page permissions
from PMP. However, that seems a bit out of scope for a Zicbo* support
patchset.



>
> I think the final exception should be access fault instead of the page
> fault caused by probe_access_range_flags with MMU_DATA_STORE.
>
> Regards,
>
> Weiwei Li
>
>

[PATCH] tcg: Remove dh_alias indirection for dh_typecode

2022-02-16 Thread Richard Henderson

The dh_alias redirect is intended to handle TCG types as distinguished
from C types.  TCG does not distinguish signed int from unsigned int,
because they are the same size.  However, we need to retain this
distinction for dh_typecode, lest we fail to extend abi types properly
for the host call parameters.

This bug was detected when running the 'arm' emulator on an s390
system. The s390 uses TCG_TARGET_EXTEND_ARGS which triggers code
in tcg_gen_callN to extend 32 bit values to 64 bits; the incorrect
sign data in the typemask for each argument caused the values to be
extended as unsigned values.

This simple program exhibits the problem:

static volatile int num = -9;
static volatile int den = -5;

int
main(void)
{
int quo = num / den;
printf("num %d den %d quo %d\n", num, den, quo);
exit(0);
}

When run on the broken qemu, this results in:

num -9 den -5 quo 0

The correct result is:

num -9 den -5 quo 1

Reported-by: Keith Packard 
Signed-off-by: Richard Henderson 
---
 include/exec/helper-head.h   | 19 ++-
 target/hppa/helper.h |  2 ++
 target/i386/ops_sse_header.h |  3 +++
 target/m68k/helper.h |  1 +
 target/ppc/helper.h  |  3 +++
 5 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
index b974eb394a..734af067fe 100644
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@@ -53,13 +53,16 @@
 # ifdef TARGET_LONG_BITS
 #  if TARGET_LONG_BITS == 32
 #   define dh_alias_tl i32
+#   define dh_typecode_tl dh_typecode_i32
 #  else
 #   define dh_alias_tl i64
+#   define dh_typecode_tl dh_typecode_i64
 #  endif
 # endif
-# define dh_alias_env ptr
 # define dh_ctype_tl target_ulong
+# define dh_alias_env ptr
 # define dh_ctype_env CPUArchState *
+# define dh_typecode_env dh_typecode_ptr
 #endif
 
 /* We can't use glue() here because it falls foul of C preprocessor
@@ -92,18 +95,16 @@
 #define dh_typecode_i64 4
 #define dh_typecode_s64 5
 #define dh_typecode_ptr 6
-#define dh_typecode(t) glue(dh_typecode_, dh_alias(t))
+#define dh_typecode_int dh_typecode_s32
+#define dh_typecode_f16 dh_typecode_i32
+#define dh_typecode_f32 dh_typecode_i32
+#define dh_typecode_f64 dh_typecode_i64
+#define dh_typecode_cptr dh_typecode_ptr
+#define dh_typecode(t) dh_typecode_##t
 
 #define dh_callflag_i32  0
-#define dh_callflag_s32  0
-#define dh_callflag_int  0
 #define dh_callflag_i64  0
-#define dh_callflag_s64  0
-#define dh_callflag_f16  0
-#define dh_callflag_f32  0
-#define dh_callflag_f64  0
 #define dh_callflag_ptr  0
-#define dh_callflag_cptr dh_callflag_ptr
 #define dh_callflag_void 0
 #define dh_callflag_noreturn TCG_CALL_NO_RETURN
 #define dh_callflag(t) glue(dh_callflag_, dh_alias(t))
diff --git a/target/hppa/helper.h b/target/hppa/helper.h
index fe8a9ce493..c7e35ce8c7 100644
--- a/target/hppa/helper.h
+++ b/target/hppa/helper.h
@@ -1,7 +1,9 @@
 #if TARGET_REGISTER_BITS == 64
 # define dh_alias_tr i64
+# define dh_typecode_tr  dh_typecode_i64
 #else
 # define dh_alias_tr i32
+# define dh_typecode_tr  dh_typecode_i32
 #endif
 #define dh_ctype_tr  target_ureg
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index e68af5c403..cef28f2aae 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -30,6 +30,9 @@
 #define dh_ctype_Reg Reg *
 #define dh_ctype_ZMMReg ZMMReg *
 #define dh_ctype_MMXReg MMXReg *
+#define dh_typecode_Reg dh_typecode_ptr
+#define dh_typecode_ZMMReg dh_typecode_ptr
+#define dh_typecode_MMXReg dh_typecode_ptr
 
 DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg)
diff --git a/target/m68k/helper.h b/target/m68k/helper.h
index 9842eeaa95..0a6b4146f6 100644
--- a/target/m68k/helper.h
+++ b/target/m68k/helper.h
@@ -17,6 +17,7 @@ DEF_HELPER_4(cas2l_parallel, void, env, i32, i32, i32)
 
 #define dh_alias_fp ptr
 #define dh_ctype_fp FPReg *
+#define dh_typecode_fp dh_typecode_ptr
 
 DEF_HELPER_3(exts32, void, env, fp, s32)
 DEF_HELPER_3(extf32, void, env, fp, f32)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index ab008c9d4e..ae7d503fcf 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -127,9 +127,11 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 
 #define dh_alias_avr ptr
 #define dh_ctype_avr ppc_avr_t *
+#define dh_typecode_avr dh_typecode_ptr
 
 #define dh_alias_vsr ptr
 #define dh_ctype_vsr ppc_vsr_t *
+#define dh_typecode_vsr dh_typecode_ptr
 
 DEF_HELPER_3(vavgub, void, avr, avr, avr)
 DEF_HELPER_3(vavguh, void, avr, avr, avr)
@@ -708,6 +710,7 @@ DEF_HELPER_3(store_dbatu, void, env, i32, tl)
 
 #define dh_alias_fprp ptr
 #define dh_ctype_fprp ppc_fprp_t *
+#define dh_typecode_fprp dh_typecode_ptr
 
 DEF_HELPER_4(DADD, void, env, fprp, fprp, fprp)
 DEF_HELPER_4(DADDQ, void, env, fprp, fprp, fprp)
-- 
2.25.1

[PATCH v16 7/7] softmmu/dirtylimit: Implement dirty page rate limit

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Implement periodic dirty page rate calculation based on the
dirty ring, and throttle each virtual CPU until it reaches the
dirty page rate quota given by the user.

Introduce qmp commands "set-vcpu-dirty-limit",
"cancel-vcpu-dirty-limit", "query-vcpu-dirty-limit"
to enable, disable, query dirty page limit for virtual CPU.

Meanwhile, introduce corresponding hmp commands
"set_vcpu_dirty_limit", "cancel_vcpu_dirty_limit",
"info vcpu_dirty_limit" so the feature can be more usable.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Markus Armbruster 
Reviewed-by: Peter Xu 
---
 hmp-commands-info.hx  |  13 
 hmp-commands.hx   |  32 +
 include/monitor/hmp.h |   3 +
 qapi/migration.json   |  80 +
 softmmu/dirtylimit.c  | 195 ++
 5 files changed, 323 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index e90f20a..61b23d2 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -865,6 +865,19 @@ SRST
 Display the vcpu dirty rate information.
 ERST
 
+{
+.name   = "vcpu_dirty_limit",
+.args_type  = "",
+.params = "",
+.help   = "show dirty page limit information of all vCPU",
+.cmd= hmp_info_vcpu_dirty_limit,
+},
+
+SRST
+  ``info vcpu_dirty_limit``
+Display the vcpu dirty page limit information.
+ERST
+
 #if defined(TARGET_I386)
 {
 .name   = "sgx",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 70a9136..5bedee2 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1744,3 +1744,35 @@ ERST
   "\n\t\t\t -b to specify dirty bitmap as method of 
calculation)",
 .cmd= hmp_calc_dirty_rate,
 },
+
+SRST
+``set_vcpu_dirty_limit``
+  Set dirty page rate limit on virtual CPU, the information about all the
+  virtual CPU dirty limit status can be observed with ``info vcpu_dirty_limit``
+  command.
+ERST
+
+{
+.name   = "set_vcpu_dirty_limit",
+.args_type  = "dirty_rate:l,cpu_index:l?",
+.params = "dirty_rate [cpu_index]",
+.help   = "set dirty page rate limit, use cpu_index to set limit"
+  "\n\t\t\t\t\t on a specified virtual cpu",
+.cmd= hmp_set_vcpu_dirty_limit,
+},
+
+SRST
+``cancel_vcpu_dirty_limit``
+  Cancel dirty page rate limit on virtual CPU, the information about all the
+  virtual CPU dirty limit status can be observed with ``info vcpu_dirty_limit``
+  command.
+ERST
+
+{
+.name   = "cancel_vcpu_dirty_limit",
+.args_type  = "cpu_index:l?",
+.params = "[cpu_index]",
+.help   = "cancel dirty page rate limit, use cpu_index to cancel"
+  "\n\t\t\t\t\t limit on a specified virtual cpu",
+.cmd= hmp_cancel_vcpu_dirty_limit,
+},
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d0148..478820e 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -131,6 +131,9 @@ void hmp_replay_delete_break(Monitor *mon, const QDict 
*qdict);
 void hmp_replay_seek(Monitor *mon, const QDict *qdict);
 void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
 void hmp_human_readable_text_helper(Monitor *mon,
 HumanReadableText *(*qmp_handler)(Error 
**));
 
diff --git a/qapi/migration.json b/qapi/migration.json
index 5975a0e..2ccbb92 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1861,6 +1861,86 @@
 { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
 
 ##
+# @DirtyLimitInfo:
+#
+# Dirty page rate limit information of a virtual CPU.
+#
+# @cpu-index: index of a virtual CPU.
+#
+# @limit-rate: upper limit of dirty page rate (MB/s) for a virtual
+#  CPU, 0 means unlimited.
+#
+# @current-rate: current dirty page rate (MB/s) for a virtual CPU.
+#
+# Since: 7.0
+#
+##
+{ 'struct': 'DirtyLimitInfo',
+  'data': { 'cpu-index': 'int',
+'limit-rate': 'uint64',
+'current-rate': 'uint64' } }
+
+##
+# @set-vcpu-dirty-limit:
+#
+# Set the upper limit of dirty page rate for virtual CPUs.
+#
+# Requires KVM with accelerator property "dirty-ring-size" set.
+# A virtual CPU's dirty page rate is a measure of its memory load.
+# To observe dirty page rates, use @calc-dirty-rate.
+#
+# @cpu-index: index of a virtual CPU, default is all.
+#
+# @dirty-rate: upper limit of dirty page rate (MB/s) for virtual CPUs.
+#
+# Since: 7.0
+#
+# Example:
+#   {"execute": "set-vcpu-dirty-limit",
+#"arguments": { "dirty-rate": 200,
+#   "cpu-index": 1 } }
+#
+##
+{ 'command': 'set-vcpu-dirty-limit',
+  'data': { '*cpu-index': 'int',
+

[PATCH v16 4/7] softmmu/dirtylimit: Implement vCPU dirtyrate calculation periodically

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Introduce a third dirty tracking method, GLOBAL_DIRTY_LIMIT,
used to calculate the dirty page rate periodically for the
dirty page rate limit.

Add dirtylimit.c to implement periodic dirty page rate calculation,
which will be used for the dirty page rate limit.

Add dirtylimit.h to export util functions for dirty page rate
limit implementation.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
---
 include/exec/memory.h   |   5 +-
 include/sysemu/dirtylimit.h |  22 +
 softmmu/dirtylimit.c| 116 
 softmmu/meson.build |   1 +
 4 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 include/sysemu/dirtylimit.h
 create mode 100644 softmmu/dirtylimit.c

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 4d5997e..88ca510 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -69,7 +69,10 @@ static inline void fuzz_dma_read_cb(size_t addr,
 /* Dirty tracking enabled because measuring dirty rate */
 #define GLOBAL_DIRTY_DIRTY_RATE (1U << 1)
 
-#define GLOBAL_DIRTY_MASK  (0x3)
+/* Dirty tracking enabled because dirty limit */
+#define GLOBAL_DIRTY_LIMIT  (1U << 2)
+
+#define GLOBAL_DIRTY_MASK  (0x7)
 
 extern unsigned int global_dirty_tracking;
 
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
new file mode 100644
index 000..da459f0
--- /dev/null
+++ b/include/sysemu/dirtylimit.h
@@ -0,0 +1,22 @@
+/*
+ * Dirty page rate limit common functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ *  Hyman Huang(黄勇) 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_DIRTYRLIMIT_H
+#define QEMU_DIRTYRLIMIT_H
+
+#define DIRTYLIMIT_CALC_TIME_MS 1000/* 1000ms */
+
+int64_t vcpu_dirty_rate_get(int cpu_index);
+void vcpu_dirty_rate_stat_start(void);
+void vcpu_dirty_rate_stat_stop(void);
+void vcpu_dirty_rate_stat_initialize(void);
+void vcpu_dirty_rate_stat_finalize(void);
+#endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
new file mode 100644
index 000..6102e8c
--- /dev/null
+++ b/softmmu/dirtylimit.c
@@ -0,0 +1,116 @@
+/*
+ * Dirty page rate limit implementation code
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ *  Hyman Huang(黄勇) 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "qapi/qapi-commands-migration.h"
+#include "sysemu/dirtyrate.h"
+#include "sysemu/dirtylimit.h"
+#include "exec/memory.h"
+#include "hw/boards.h"
+
+struct {
+VcpuStat stat;
+bool running;
+QemuThread thread;
+} *vcpu_dirty_rate_stat;
+
+static void vcpu_dirty_rate_stat_collect(void)
+{
+VcpuStat stat;
+int i = 0;
+
+/* calculate vcpu dirtyrate */
+vcpu_calculate_dirtyrate(DIRTYLIMIT_CALC_TIME_MS,
+ ,
+ GLOBAL_DIRTY_LIMIT,
+ false);
+
+for (i = 0; i < stat.nvcpu; i++) {
+vcpu_dirty_rate_stat->stat.rates[i].id = i;
+vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
+stat.rates[i].dirty_rate;
+}
+
+free(stat.rates);
+}
+
+static void *vcpu_dirty_rate_stat_thread(void *opaque)
+{
+rcu_register_thread();
+
+/* start log sync */
+global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);
+
+while (qatomic_read(_dirty_rate_stat->running)) {
+vcpu_dirty_rate_stat_collect();
+}
+
+/* stop log sync */
+global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);
+
+rcu_unregister_thread();
+return NULL;
+}
+
+int64_t vcpu_dirty_rate_get(int cpu_index)
+{
+DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
+return qatomic_read([cpu_index].dirty_rate);
+}
+
+void vcpu_dirty_rate_stat_start(void)
+{
+if (qatomic_read(_dirty_rate_stat->running)) {
+return;
+}
+
+qatomic_set(_dirty_rate_stat->running, 1);
+qemu_thread_create(_dirty_rate_stat->thread,
+   "dirtyrate-stat",
+   vcpu_dirty_rate_stat_thread,
+   NULL,
+   QEMU_THREAD_JOINABLE);
+}
+
+void vcpu_dirty_rate_stat_stop(void)
+{
+qatomic_set(_dirty_rate_stat->running, 0);
+qemu_mutex_unlock_iothread();
+qemu_thread_join(_dirty_rate_stat->thread);
+qemu_mutex_lock_iothread();
+}
+
+void vcpu_dirty_rate_stat_initialize(void)
+{
+MachineState *ms = MACHINE(qdev_get_machine());
+int max_cpus = ms->smp.max_cpus;
+
+vcpu_dirty_rate_stat =
+g_malloc0(sizeof(*vcpu_dirty_rate_stat));
+
+vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
+vcpu_dirty_rate_stat->stat.rates =
+g_malloc0(sizeof(DirtyRateVcpu) * max_cpus);
+
+vcpu_dirty_rate_stat->running = false;
+}
+
+void

[PATCH v16 3/7] migration/dirtyrate: Refactor dirty page rate calculation

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

abstract out dirty log change logic into function
global_dirty_log_change.

abstract out dirty page rate calculation logic via
dirty-ring into function vcpu_calculate_dirtyrate.

abstract out mathematical dirty page rate calculation
into do_calculate_dirtyrate, decouple it from DirtyStat.

rename set_sample_page_period to dirty_stat_wait, which
is well-understood and will be reused in dirtylimit.

handle cpu hotplug/unplug scenario during measurement of
dirty page rate.

export util functions outside migration.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
---
 include/sysemu/dirtyrate.h |  28 ++
 migration/dirtyrate.c  | 227 -
 migration/dirtyrate.h  |   7 +-
 3 files changed, 174 insertions(+), 88 deletions(-)
 create mode 100644 include/sysemu/dirtyrate.h

diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
new file mode 100644
index 000..4d3b9a4
--- /dev/null
+++ b/include/sysemu/dirtyrate.h
@@ -0,0 +1,28 @@
+/*
+ * dirty page rate helper functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ *  Hyman Huang(黄勇) 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_DIRTYRATE_H
+#define QEMU_DIRTYRATE_H
+
+typedef struct VcpuStat {
+int nvcpu; /* number of vcpu */
+DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot);
+
+void global_dirty_log_change(unsigned int flag,
+ bool start);
+#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index d65e744..79348de 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
 static DirtyRateMeasureMode dirtyrate_mode =
 DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
 
-static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
+static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
 {
 int64_t current_time;
 
@@ -60,6 +60,132 @@ static int64_t set_sample_page_period(int64_t msec, int64_t 
initial_time)
 return msec;
 }
 
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+if (start) {
+dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+} else {
+dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+}
+}
+
+static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
+  int64_t calc_time_ms)
+{
+uint64_t memory_size_MB;
+uint64_t increased_dirty_pages =
+dirty_pages.end_pages - dirty_pages.start_pages;
+
+memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+
+return memory_size_MB * 1000 / calc_time_ms;
+}
+
+void global_dirty_log_change(unsigned int flag, bool start)
+{
+qemu_mutex_lock_iothread();
+if (start) {
+memory_global_dirty_log_start(flag);
+} else {
+memory_global_dirty_log_stop(flag);
+}
+qemu_mutex_unlock_iothread();
+}
+
+/*
+ * global_dirty_log_sync
+ * 1. sync dirty log from kvm
+ * 2. stop dirty tracking if needed.
+ */
+static void global_dirty_log_sync(unsigned int flag, bool one_shot)
+{
+qemu_mutex_lock_iothread();
+memory_global_dirty_log_sync();
+if (one_shot) {
+memory_global_dirty_log_stop(flag);
+}
+qemu_mutex_unlock_iothread();
+}
+
+static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
+{
+CPUState *cpu;
+DirtyPageRecord *records;
+int nvcpu = 0;
+
+CPU_FOREACH(cpu) {
+nvcpu++;
+}
+
+stat->nvcpu = nvcpu;
+stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+
+records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
+
+return records;
+}
+
+static void vcpu_dirty_stat_collect(VcpuStat *stat,
+DirtyPageRecord *records,
+bool start)
+{
+CPUState *cpu;
+
+CPU_FOREACH(cpu) {
+record_dirtypages(records, cpu, start);
+}
+}
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+DirtyPageRecord *records;
+int64_t init_time_ms;
+int64_t duration;
+int64_t dirtyrate;
+int i = 0;
+unsigned int gen_id;
+
+retry:
+init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+cpu_list_lock();
+gen_id = cpu_list_generation_id_get();
+records = vcpu_dirty_stat_alloc(stat);
+vcpu_dirty_stat_collect(stat, records, true);
+

[PATCH v16 5/7] accel/kvm/kvm-all: Introduce kvm_dirty_ring_size function

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Introduce the kvm_dirty_ring_size utility function to help calculate
the dirty ring full time.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
---
 accel/kvm/kvm-all.c| 5 +
 accel/stubs/kvm-stub.c | 5 +
 include/sysemu/kvm.h   | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 7b06b8a..8821d80 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2312,6 +2312,11 @@ bool kvm_dirty_ring_enabled(void)
 return kvm_state->kvm_dirty_ring_size ? true : false;
 }
 
+uint32_t kvm_dirty_ring_size(void)
+{
+return kvm_state->kvm_dirty_ring_size;
+}
+
 static int kvm_init(MachineState *ms)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5319573..1128cb2 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -152,4 +152,9 @@ bool kvm_dirty_ring_enabled(void)
 {
 return false;
 }
+
+uint32_t kvm_dirty_ring_size(void)
+{
+return 0;
+}
 #endif
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 6eb39a0..bc3f0b5 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -563,4 +563,6 @@ bool kvm_cpu_check_are_resettable(void);
 bool kvm_arch_cpu_check_are_resettable(void);
 
 bool kvm_dirty_ring_enabled(void);
+
+uint32_t kvm_dirty_ring_size(void);
 #endif
-- 
1.8.3.1

[PATCH v16 6/7] softmmu/dirtylimit: Implement virtual CPU throttle

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Set up a negative feedback system for when a vCPU thread
handles a KVM_EXIT_DIRTY_RING_FULL exit, by introducing the
throttle_us_per_full field in struct CPUState. Sleep for
throttle_us_per_full microseconds to throttle the vCPU
if dirtylimit is in service.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
---
 accel/kvm/kvm-all.c |  19 ++-
 include/hw/core/cpu.h   |   6 +
 include/sysemu/dirtylimit.h |  15 +++
 softmmu/dirtylimit.c| 291 
 softmmu/trace-events|   7 ++
 5 files changed, 337 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 8821d80..98e43e6 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
 #include "qemu/guest-random.h"
 #include "sysemu/hw_accel.h"
 #include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
 
 #include "hw/boards.h"
 
@@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
 cpu->kvm_state = s;
 cpu->vcpu_dirty = true;
 cpu->dirty_pages = 0;
+cpu->throttle_us_per_full = 0;
 
 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 if (mmap_size < 0) {
@@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
  */
 sleep(1);
 
+/* keep sleeping so that dirtylimit not be interfered by reaper */
+if (dirtylimit_in_service()) {
+continue;
+}
+
 trace_kvm_dirty_ring_reaper("wakeup");
 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
 
@@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)
  */
 trace_kvm_dirty_ring_full(cpu->cpu_index);
 qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state, NULL);
+/* We throttle vCPU by making it sleep once it exit from kernel
+ * due to dirty ring full. In the dirtylimit scenario, reaping
+ * all vCPUs after a single vCPU dirty ring get full result in
+ * the miss of sleep, so just reap the ring-fulled vCPU.
+ */
+if (dirtylimit_in_service()) {
+kvm_dirty_ring_reap(kvm_state, cpu);
+} else {
+kvm_dirty_ring_reap(kvm_state, NULL);
+}
 qemu_mutex_unlock_iothread();
+dirtylimit_vcpu_execute(cpu);
 ret = 0;
 break;
 case KVM_EXIT_SYSTEM_EVENT:
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 76ab3b8..dbeb31a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@ struct CPUState {
  */
 bool throttle_thread_scheduled;
 
+/*
+ * Sleep throttle_us_per_full microseconds once dirty ring is full
+ * if dirty page rate limit is enabled.
+ */
+int64_t throttle_us_per_full;
+
 bool ignore_memory_transaction_failures;
 
 /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f0..8d2c1f3 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
 void vcpu_dirty_rate_stat_stop(void);
 void vcpu_dirty_rate_stat_initialize(void);
 void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_process(void);
+void dirtylimit_change(bool start);
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable);
+void dirtylimit_set_all(uint64_t quota,
+bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
 #endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 6102e8c..76d0b44 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -18,6 +18,26 @@
 #include "sysemu/dirtylimit.h"
 #include "exec/memory.h"
 #include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stop working if dirty page rate error
+ * value less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
+/*
+ * Plus or minus vcpu sleep time linearly if dirty
+ * page rate error value percentage over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, plus or minus a fixed vcpu sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
 
 struct {
 VcpuStat stat;
@@ -25,6 +45,30 @@ struct {
 QemuThread thread;
 } *vcpu_dirty_rate_stat;
 
+typedef struct VcpuDirtyLimitState {
+int cpu_index;
+bool enabled;
+/*
+ * Quota dirty page rate, unit is MB/s
+ * zero if not enabled.
+ */
+

[PATCH v16 2/7] cpus: Introduce cpu_list_generation_id

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Introduce cpu_list_generation_id to track cpu list generation so
that cpu hotplug/unplug can be detected during measurement of
dirty page rate.

cpu_list_generation_id could be used to detect changes of cpu
list, which is prepared for dirty page rate measurement.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
---
 cpus-common.c | 8 
 include/exec/cpu-common.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/cpus-common.c b/cpus-common.c
index 6e73d3e..31c6415 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -73,6 +73,12 @@ static int cpu_get_free_index(void)
 }
 
 CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
+static unsigned int cpu_list_generation_id;
+
+unsigned int cpu_list_generation_id_get(void)
+{
+return cpu_list_generation_id;
+}
 
 void cpu_list_add(CPUState *cpu)
 {
@@ -84,6 +90,7 @@ void cpu_list_add(CPUState *cpu)
 assert(!cpu_index_auto_assigned);
 }
 QTAILQ_INSERT_TAIL_RCU(, cpu, node);
+cpu_list_generation_id++;
 }
 
 void cpu_list_remove(CPUState *cpu)
@@ -96,6 +103,7 @@ void cpu_list_remove(CPUState *cpu)
 
 QTAILQ_REMOVE_RCU(, cpu, node);
 cpu->cpu_index = UNASSIGNED_CPU_INDEX;
+cpu_list_generation_id++;
 }
 
 CPUState *qemu_get_cpu(int index)
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index de5f444..eb33642 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -20,6 +20,7 @@ extern intptr_t qemu_host_page_mask;
 void qemu_init_cpu_list(void);
 void cpu_list_lock(void);
 void cpu_list_unlock(void);
+unsigned int cpu_list_generation_id_get(void);
 
 void tcg_flush_softmmu_tlb(CPUState *cs);
 
-- 
1.8.3.1

[PATCH v16 1/7] accel/kvm/kvm-all: Refactor per-vcpu dirty ring reaping

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

Add an optional 'CPUState' argument to kvm_dirty_ring_reap so
that it can also cover the single-vCPU dirty ring reaping scenario.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
---
 accel/kvm/kvm-all.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0e66ebb..7b06b8a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -756,17 +756,20 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, 
CPUState *cpu)
 }
 
 /* Must be with slots_lock held */
-static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
 {
 int ret;
-CPUState *cpu;
 uint64_t total = 0;
 int64_t stamp;
 
 stamp = get_clock();
 
-CPU_FOREACH(cpu) {
-total += kvm_dirty_ring_reap_one(s, cpu);
+if (cpu) {
+total = kvm_dirty_ring_reap_one(s, cpu);
+} else {
+CPU_FOREACH(cpu) {
+total += kvm_dirty_ring_reap_one(s, cpu);
+}
 }
 
 if (total) {
@@ -787,7 +790,7 @@ static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
  * Currently for simplicity, we must hold BQL before calling this.  We can
  * consider to drop the BQL if we're clear with all the race conditions.
  */
-static uint64_t kvm_dirty_ring_reap(KVMState *s)
+static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
 {
 uint64_t total;
 
@@ -807,7 +810,7 @@ static uint64_t kvm_dirty_ring_reap(KVMState *s)
  * reset below.
  */
 kvm_slots_lock();
-total = kvm_dirty_ring_reap_locked(s);
+total = kvm_dirty_ring_reap_locked(s, cpu);
 kvm_slots_unlock();
 
 return total;
@@ -854,7 +857,7 @@ static void kvm_dirty_ring_flush(void)
  * vcpus out in a synchronous way.
  */
 kvm_cpu_synchronize_kick_all();
-kvm_dirty_ring_reap(kvm_state);
+kvm_dirty_ring_reap(kvm_state, NULL);
 trace_kvm_dirty_ring_flush(1);
 }
 
@@ -1398,7 +1401,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
  * Not easy.  Let's cross the fingers until it's fixed.
  */
 if (kvm_state->kvm_dirty_ring_size) {
-kvm_dirty_ring_reap_locked(kvm_state);
+kvm_dirty_ring_reap_locked(kvm_state, NULL);
 } else {
 kvm_slot_get_dirty_log(kvm_state, mem);
 }
@@ -1470,7 +1473,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
 
 qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(s);
+kvm_dirty_ring_reap(s, NULL);
 qemu_mutex_unlock_iothread();
 
 r->reaper_iteration++;
@@ -2956,7 +2959,7 @@ int kvm_cpu_exec(CPUState *cpu)
  */
 trace_kvm_dirty_ring_full(cpu->cpu_index);
 qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state);
+kvm_dirty_ring_reap(kvm_state, NULL);
 qemu_mutex_unlock_iothread();
 ret = 0;
 break;
-- 
1.8.3.1

[PATCH v16 0/7] support dirty restraint on vCPU

2022-02-16 Thread huangy81

From: Hyman Huang(黄勇) 

v16
- rebase on master
- drop the unused typedef syntax in [PATCH v15 6/7] 
- add the Reviewed-by and Acked-by tags by the way 

v15
- rebase on master
- drop the 'init_time_ms' parameter in function vcpu_calculate_dirtyrate 
- drop the 'setup' field in dirtylimit_state and call dirtylimit_process
  directly, which makes code cleaner.
- code clean in dirtylimit_adjust_throttle
- fix missing dirtylimit_state_unlock() in dirtylimit_process and
  dirtylimit_query_all
- add some comment

Please review. Thanks,

Regards
Yong 

v14
- v13 sent by accident, resend patchset. 

v13
- rebase on master
- passing NULL to kvm_dirty_ring_reap in commit
  "refactor per-vcpu dirty ring reaping" to keep the logic unchanged.
  In other word, we still try the best to reap as much PFNs as possible
  if dirtylimit not in service.
- move the cpu list gen id changes into a separate patch.   
- release the lock before sleep during dirty page rate calculation.
- move the dirty ring size fetch logic into a separate patch.
- drop the DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK MACRO .
- substitute bh with function pointer when implement dirtylimit.
- merge the dirtylimit_start/stop into dirtylimit_change.
- fix "cpu-index" parameter type with "int" to keep consistency.
- fix some syntax error in documents.

Please review. Thanks,

Yong

v12
- rebase on master
- add a new commit to refactor per-vcpu dirty ring reaping, which can resolve
  the "vcpu miss the chances to sleep" problem
- remove the dirtylimit_thread and implement throttle in bottom half instead.
- let the dirty ring reaper thread keep sleeping when dirtylimit is in service 
- introduce cpu_list_generation_id to identify cpu_list changing. 
- keep taking the cpu_list_lock during dirty_stat_wait to prevent vcpu 
plug/unplug
  when calculating the dirty page rate
- move the dirtylimit global initializations out of dirtylimit_set_vcpu and do
  some code clean
- add DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK in case of oscillation when 
throttling 
- remove the unmatched count field in dirtylimit_state
- add stub to fix build on non-x86
- refactor the documents

Thanks Peter and Markus for reviewing the previous versions, please review.

Thanks,
Yong

v11
- rebase on master
- add a commit " refactor dirty page rate calculation"  so that dirty page rate 
limit
  can reuse the calculation logic. 
- handle the cpu hotplug/unplug case in the dirty page rate calculation logic.
- modify the qmp commands according to Markus's advice.
- introduce a standalone file dirtylimit.c to implement dirty page rate limit
- check if dirty limit in service by dirtylimit_state pointer instead of global 
variable
- introduce dirtylimit_mutex to protect dirtylimit_state
- do some code clean and docs

See the commits for more detail. Thanks very much to Markus and Peter for the
code review and for their experienced and insightful advice; most modifications
are based on it.

v10:
- rebase on master
- make the following modifications on patch [1/3]:
  1. Make "dirtylimit-calc" thread joinable and join it after quitting.

  2. Add finalize function to free dirtylimit_calc_state

  3. Do some code clean work

- make the following modifications on patch [2/3]:
  1. Remove the original implementation of throttle according to
 Peter's advice.
 
  2. Introduce a negative feedback system and implement the throttle
 on all vcpu in one thread named "dirtylimit". 

  3. Simplify the algo when calculation the throttle_us_per_full:
 increase/decrease linearly when there exists a wide difference
 between quota and current dirty page rate, increase/decrease
 a fixed time slice when the difference is narrow. This makes
 throttle responds faster and reach the quota smoothly.

  4. Introduce a unfit_cnt in algo to make sure throttle really
 takes effect.

  5. Set the max sleep time 99 times more than "ring_full_time_us".

  6. Make "dirtylimit" thread joinable and join it after quitting.

- make the following modifications on patch [3/3]:

Re: [PATCH v3 7/7] hw/mips/gt64xxx_pci: Resolve gt64120_register()

2022-02-16 Thread BALATON Zoltan


On Wed, 16 Feb 2022, Bernhard Beschow wrote:

Now that gt64120_register() lost its pic parameter, there is an
opportunity to remove it. gt64120_register() is old style by wrapping
qdev API, and the new style is to use qdev directly. So take the
opportunity and modernize the code.

Suggested-by: BALATON Zoltan 
Signed-off-by: Bernhard Beschow 
---
hw/mips/gt64xxx_pci.c  | 21 -
hw/mips/malta.c| 13 -
include/hw/mips/mips.h |  3 ---
3 files changed, 12 insertions(+), 25 deletions(-)


Very good but maybe it could be simplified even further, see below.


diff --git a/hw/mips/gt64xxx_pci.c b/hw/mips/gt64xxx_pci.c
index eb205d6d70..e0ff1b5566 100644
--- a/hw/mips/gt64xxx_pci.c
+++ b/hw/mips/gt64xxx_pci.c
@@ -26,7 +26,6 @@
#include "qapi/error.h"
#include "qemu/units.h"
#include "qemu/log.h"
-#include "hw/mips/mips.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_host.h"
#include "migration/vmstate.h"
@@ -1151,30 +1150,18 @@ static void gt64120_reset(DeviceState *dev)
static void gt64120_realize(DeviceState *dev, Error **errp)
{
GT64120State *s = GT64120_PCI_HOST_BRIDGE(dev);
+PCIHostState *phb = PCI_HOST_BRIDGE(dev);

memory_region_init_io(&s->ISD_mem, OBJECT(dev), &isd_mem_ops, s,
  "gt64120-isd", 0x1000);
-}
-
-PCIBus *gt64120_register(void)
-{
-GT64120State *d;
-PCIHostState *phb;
-DeviceState *dev;
-
-dev = qdev_new(TYPE_GT64120_PCI_HOST_BRIDGE);
-d = GT64120_PCI_HOST_BRIDGE(dev);
-phb = PCI_HOST_BRIDGE(dev);
-memory_region_init(&d->pci0_mem, OBJECT(dev), "pci0-mem", 4 * GiB);
-address_space_init(&d->pci0_mem_as, &d->pci0_mem, "pci0-mem");
+memory_region_init(&s->pci0_mem, OBJECT(dev), "pci0-mem", 4 * GiB);
+address_space_init(&s->pci0_mem_as, &s->pci0_mem, "pci0-mem");
phb->bus = pci_root_bus_new(dev, "pci",
-&d->pci0_mem,
+&s->pci0_mem,
get_system_io(),
PCI_DEVFN(18, 0), TYPE_PCI_BUS);
-sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);

pci_create_simple(phb->bus, PCI_DEVFN(0, 0), "gt64120_pci");
-return phb->bus;
}

static void gt64120_pci_realize(PCIDevice *d, Error **errp)
diff --git a/hw/mips/malta.c b/hw/mips/malta.c
index 13254dbc89..16fdaed3db 100644
--- a/hw/mips/malta.c
+++ b/hw/mips/malta.c
@@ -38,6 +38,7 @@
#include "hw/mips/mips.h"
#include "hw/mips/cpudevs.h"
#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
#include "qemu/log.h"
#include "hw/mips/bios.h"
#include "hw/ide.h"
@@ -1230,7 +1231,7 @@ void mips_malta_init(MachineState *machine)
const size_t smbus_eeprom_size = 8 * 256;
uint8_t *smbus_eeprom_buf = g_malloc0(smbus_eeprom_size);
uint64_t kernel_entry, bootloader_run_addr;
-PCIBus *pci_bus;
+PCIHostState *phb;
ISABus *isa_bus;
qemu_irq cbus_irq, i8259_irq;
I2CBus *smbus;
@@ -1390,7 +1391,9 @@ void mips_malta_init(MachineState *machine)
stl_p(memory_region_get_ram_ptr(bios_copy) + 0x10, 0x0420);

/* Northbridge */
-pci_bus = gt64120_register();
+dev = qdev_new("gt64120");
+phb = PCI_HOST_BRIDGE(dev);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);


Since no need for setting properties you could do qdev_new and realize in 
one step with sysbus_create_simple and then get the bus from it so no need 
for going through phb either. Something like:


dev = sysbus_create_simple("gt64120", -1, 0);
pci_bus = PCI_BUS(qdev_get_child_bus(dev, "pci"));

should work with less changes.

Regards,
BALATON Zoltan


/*
 * The whole address space decoded by the GT-64120A doesn't generate
 * exception when accessing invalid memory. Create an empty slot to
@@ -1399,7 +1402,7 @@ void mips_malta_init(MachineState *machine)
empty_slot_init("GT64120", 0, 0x2000);

/* Southbridge */
-dev = piix4_create(pci_bus, &isa_bus, &smbus);
+dev = piix4_create(phb->bus, &isa_bus, &smbus);

/* Interrupt controller */
qdev_connect_gpio_out_named(dev, "intr", 0, i8259_irq);
@@ -1414,10 +1417,10 @@ void mips_malta_init(MachineState *machine)
isa_create_simple(isa_bus, TYPE_FDC37M81X_SUPERIO);

/* Network card */
-network_init(pci_bus);
+network_init(phb->bus);

/* Optional PCI video card */
-pci_vga_init(pci_bus);
+pci_vga_init(phb->bus);
}

static void mips_malta_instance_init(Object *obj)
diff --git a/include/hw/mips/mips.h b/include/hw/mips/mips.h
index ff88942e63..101799f7d3 100644
--- a/include/hw/mips/mips.h
+++ b/include/hw/mips/mips.h
@@ -9,9 +9,6 @@

#include "exec/memory.h"

-/* gt64xxx.c */
-PCIBus *gt64120_register(void);
-
/* bonito.c */
PCIBus *bonito_init(qemu_irq *pic);

Re: [PATCH v4 2/2] target/riscv: Enable Zicbo[m,z,p] instructions

2022-02-16 Thread Weiwei Li




在 2022/2/16 下午11:48, Christoph Muellner 写道:

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 39ffb883fc..04500fe352 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -764,6 +764,10 @@ static Property riscv_cpu_properties[] = {
  DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters, true),
  DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
  DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
+DEFINE_PROP_BOOL("zicbom", RISCVCPU, cfg.ext_icbom, true),
+DEFINE_PROP_BOOL("zicboz", RISCVCPU, cfg.ext_icboz, true),
+DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize, 64),
+DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize, 64),
Why use two different cache block sizes here? Is there any new spec 
update for this?

  DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
  DEFINE_PROP_BOOL("Zfhmin", RISCVCPU, cfg.ext_zfhmin, false),
  DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
+
+/* helper_zicbom_access
+ *
+ * Check access permissions (LOAD, STORE or FETCH as specified in section
+ * 2.5.2 of the CMO specification) for Zicbom, raising either store
+ * page-fault (non-virtualised) or store guest-page fault (virtualised).
+ */
+static void helper_zicbom_access(CPURISCVState *env, target_ulong address,
+ uintptr_t ra)
+{
+int ret;
+void* phost;
+int mmu_idx = cpu_mmu_index(env, false);
+
+/* Get the size of the cache block for management instructions. */
+RISCVCPU *cpu = env_archcpu(env);
+uint16_t cbomlen = cpu->cfg.cbom_blocksize;
+
+/* Mask off low-bits to align-down to the cache-block. */
+address &= ~(cbomlen - 1);
+
+/* A cache-block management instruction is permitted to access
+ * the specified cache block whenever a load instruction, store
+ * instruction, or instruction fetch is permitted to access the
+ * corresponding physical addresses.
+ */
+ret = probe_access_range_flags(env, address, cbomlen, MMU_DATA_LOAD,
+   mmu_idx, true, &phost, ra);
+if (ret == TLB_INVALID_MASK)
+ret = probe_access_range_flags(env, address, cbomlen, MMU_INST_FETCH,
+   mmu_idx, true, &phost, ra);
+if (ret == TLB_INVALID_MASK)
+probe_access_range_flags(env, address, cbomlen, MMU_DATA_STORE,
+ mmu_idx, false, &phost, ra);
+}
+



I think it's a little different here. Probe_access_range_flags may 
trigger different exceptions for different access_type. For example:


If  the page for the address  is executable and readable but not 
writable,  and the access cannot pass the pmp check for all access_type,


it may trigger access fault for load/fetch access, and  trigger page 
fault for  store access.


I think the final exception should be access fault instead of the page 
fault caused by probe_access_range_flags with MMU_DATA_STORE.


Regards,

Weiwei Li

Re: [PATCH] hw/arm/virt: Fix CPU's default NUMA node ID

2022-02-16 Thread Gavin Shan


On 1/26/22 5:14 PM, Igor Mammedov wrote:

On Wed, 26 Jan 2022 13:24:10 +0800
Gavin Shan  wrote:


The default CPU-to-NUMA association is given by mc->get_default_cpu_node_id()
when it isn't provided explicitly. However, the CPU topology isn't fully
considered in the default association and it causes CPU topology broken
warnings on booting Linux guest.

For example, the following warning messages are observed when the Linux guest
is booted with the following command lines.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \
   -accel kvm -machine virt,gic-version=host   \
   -cpu host   \
   -smp 6,sockets=2,cores=3,threads=1  \
   -m 1024M,slots=16,maxmem=64G\
   -object memory-backend-ram,id=mem0,size=128M\
   -object memory-backend-ram,id=mem1,size=128M\
   -object memory-backend-ram,id=mem2,size=128M\
   -object memory-backend-ram,id=mem3,size=128M\
   -object memory-backend-ram,id=mem4,size=128M\
   -object memory-backend-ram,id=mem5,size=384M\
   -numa node,nodeid=0,memdev=mem0 \
   -numa node,nodeid=1,memdev=mem1 \
   -numa node,nodeid=2,memdev=mem2 \
   -numa node,nodeid=3,memdev=mem3 \
   -numa node,nodeid=4,memdev=mem4 \
   -numa node,nodeid=5,memdev=mem5
  :
   alternatives: patching kernel code
   BUG: arch topology borken
   the CLS domain not a subset of the MC domain
   
   BUG: arch topology borken
   the DIE domain not a subset of the NODE domain

With current implementation of mc->get_default_cpu_node_id(), CPU#0 to CPU#5
are associated with NODE#0 to NODE#5 separately. That's incorrect because
CPU#0/1/2 should be associated with same NUMA node because they're seated
in same socket.

This fixes the issue by considering the socket when default CPU-to-NUMA
is given. With this applied, no more CPU topology broken warnings are seen
from the Linux guest. The 6 CPUs are associated with NODE#0/1, but there are
no CPUs associated with NODE#2/3/4/5.



From migration point of view it looks fine to me, and doesn't need a compat knob

since NUMA data (on virt-arm) only used to construct ACPI tables (and we don't
version those unless something is broken by it).



Signed-off-by: Gavin Shan 
---
  hw/arm/virt.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 141350bf21..b4a95522d3 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2499,7 +2499,7 @@ virt_cpu_index_to_props(MachineState *ms, unsigned 
cpu_index)
  
  static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx)

  {
-return idx % ms->numa_state->num_nodes;
+return idx / (ms->smp.dies * ms->smp.clusters * ms->smp.cores * 
ms->smp.threads);


I'd like for ARM folks to confirm whether above is correct
(i.e. socket is NUMA node boundary and also if above topo vars
could have odd values. Don't look at horribly complicated x86
as example, but it showed that vendors could stash pretty much
anything there, so we should consider it here as well and maybe
forbid that in smp virt-arm parser)



After doing some investigation, I don't think the socket is NUMA node boundary.
Unfortunately, I didn't find it's documented like this in any documents after
checking device-tree specification, Linux CPU topology and NUMA binding 
documents.

However, there are two options here according to Linux (guest) kernel code:
(A) socket is NUMA node boundary  (B) CPU die is NUMA node boundary. They are
equivalent as CPU die isn't supported on arm/virt machine. Besides, the topology
of one-to-one association between socket and NUMA node sounds natural and 
simplified.
So I think (A) is the best way to go.

Another thing I want to explain here is how the changes affect the memory
allocation in Linux guest. Taking the command lines included in the commit
log as an example, the first two NUMA nodes are bound to CPUs while the other
4 NUMA nodes are regarded as remote NUMA nodes to CPUs. The remote NUMA node
won't accommodate the memory allocation until the memory in the near (local)
NUMA node becomes exhausted. However, it's uncertain how the memory is hosted
if memory binding isn't applied.

Besides, I think the code should be improved like below to avoid overflow on
ms->numa_state->num_nodes.

 static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx)
 {
-return idx % ms->numa_state->num_nodes;
+int node_idx;
+
+node_idx = idx / (ms->smp.dies * ms->smp.clusters * ms->smp.cores * 
ms->smp.threads);
+return node_idx % ms->numa_state->num_nodes;
 }



  }
  
  static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms)




Thanks,
Gavin

Re: [PATCH v2 3/3] target/ppc/kvm: Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 04:39:03PM +1000, Nicholas Piggin wrote:
> Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support for KVM
> guests. Keep the fallback heuristic for KVM hosts that pre-date this
> CAP.
> 
> This is only proposed; the KVM CAP has not yet been allocated. I will
> ask to merge the new KVM cap when there are no objections on the QEMU
> side.
> 
> not-yet-Signed-off-by: Nicholas Piggin 

LGTM, once the kernel side work is done.

> ---
> Since v1: 
> - Remove incorrect test for unsupported cap query. Add comment instead.
> 
>  linux-headers/linux/kvm.h |  1 +
>  target/ppc/kvm.c  | 18 ++
>  2 files changed, 19 insertions(+)
> 
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index 02c5e7b7bb..d91f578200 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -1130,6 +1130,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_BINARY_STATS_FD 203
>  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>  #define KVM_CAP_ARM_MTE 205
> +#define KVM_CAP_PPC_AIL_MODE_3 210

You may be aware of this already, but once the cap is allocated on the
kernel side, you should do a full update of the kernel headers as a
separate patch.  A direct hack to the headers copy is fine for a
preliminary posting, though.

>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 1338c41f8f..7f71f78253 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>  static int cap_large_decr;
>  static int cap_fwnmi;
>  static int cap_rpt_invalidate;
> +static int cap_ail_mode_3;
>  
>  static uint32_t debug_inst_opcode;
>  
> @@ -154,6 +155,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  }
>  
>  cap_rpt_invalidate = kvm_vm_check_extension(s, 
> KVM_CAP_PPC_RPT_INVALIDATE);
> +cap_ail_mode_3 = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODE_3);
>  kvm_ppc_register_host_cpu_type();
>  
>  return 0;
> @@ -2567,6 +2569,17 @@ bool kvmppc_supports_ail_3(void)
>  {
>  PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
>  
> +if (cap_ail_mode_3) {
> +return 1;
> +}
> +
> +/*
> + * cap-ail-mode-3 is disabled, but it may be because the KVM host 
> pre-dates
> + * the cap. Special-case the test because the performance cost for
> + * disabling the feature unconditionally is prohibitive until updated
> + * KVM is widely in use.
> + */
> +
>  /*
>   * KVM PR only supports AIL-0
>   */
> @@ -2589,6 +2602,11 @@ bool kvmppc_supports_ail_3(void)
>  return 0;
>  }
>  
> +/*
> + * Beyond ISA v3.1 (POWER10), this could return 0, because all KVM
> + * implementations for such hosts would support the cap.
> + */
> +
>  return 1;
>  }
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [PATCH 26/27] target/ppc: cpu_init: Move check_pow and QOM macros to a header

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 10:06:26AM -0300, Fabiano Rosas wrote:
> David Gibson  writes:
> 
> > On Tue, Feb 15, 2022 at 06:41:47PM -0300, Fabiano Rosas wrote:
> >> These will need to be accessed from other files once we move the CPUs
> >> code to separate files.
> >> 
> >> Signed-off-by: Fabiano Rosas 
> >> ---
> >>  target/ppc/cpu.h  | 57 +++
> >>  target/ppc/cpu_init.c | 55 -
> >>  2 files changed, 57 insertions(+), 55 deletions(-)
> >> 
> >> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> >> index 6a06a7f533..ba0739c43b 100644
> >> --- a/target/ppc/cpu.h
> >> +++ b/target/ppc/cpu.h
> >> @@ -2733,4 +2733,61 @@ void dump_mmu(CPUPPCState *env);
> >>  void ppc_maybe_bswap_register(CPUPPCState *env, uint8_t *mem_buf, int 
> >> len);
> >>  void ppc_store_vscr(CPUPPCState *env, uint32_t vscr);
> >>  uint32_t ppc_get_vscr(CPUPPCState *env);
> >> +
> >> +/*/
> >> +/* Power management enable checks 
> >>*/
> >> +static inline int check_pow_none(CPUPPCState *env)
> >> +{
> >> +return 0;
> >> +}
> >> +
> >> +static inline int check_pow_nocheck(CPUPPCState *env)
> >> +{
> >> +return 1;
> >> +}
> >> +
> >> +static inline int check_pow_hid0(CPUPPCState *env)
> >
> > I'm a little nervous about moving this to a more exposed location.  By
> > definition the HID register is implementation dependent, and we can
> > see immediately below that not all things use the same interpretation
> > of it in practice.  So at the very least it seems like it has a bad
> > name to be exposed more widely.  It also seems like it might better
> > belong next to the code for the cpus that actually use this version.
> 
> Good point. Since these are quite simple it might be best to duplicate
> them when doing the split between the families. I'm doing the same for
> vscr_init.

Right, that sounds like a good idea to me.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [PATCH v2 23/27] target/ppc: Rename spr_tcg.h to spr_common.h

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 01:24:22PM -0300, Fabiano Rosas wrote:
> Initial intent for the spr_tcg header was to expose the spr_read|write
> callbacks that are only used by TCG code. However, although these
> routines are TCG-specific, the KVM code needs access to env->sprs
> which creation is currently coupled to the callback registration.
> 
> We are probably not going to decouple SPR creation and TCG callback
> registration any time soon, so let's rename the header to spr_common
> to accommodate the register_*_sprs functions that will be moved out of
> cpu_init.c in the following patches.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: David Gibson 

> ---
>  target/ppc/cpu_init.c  | 2 +-
>  target/ppc/{spr_tcg.h => spr_common.h} | 4 ++--
>  target/ppc/translate.c | 2 +-
>  3 files changed, 4 insertions(+), 4 deletions(-)
>  rename target/ppc/{spr_tcg.h => spr_common.h} (99%)
> 
> diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
> index adb23019ef..17f12aceb6 100644
> --- a/target/ppc/cpu_init.c
> +++ b/target/ppc/cpu_init.c
> @@ -44,7 +44,7 @@
>  
>  #include "helper_regs.h"
>  #include "internal.h"
> -#include "spr_tcg.h"
> +#include "spr_common.h"
>  #include "power8-pmu.h"
>  
>  /* #define PPC_DEBUG_SPR */
> diff --git a/target/ppc/spr_tcg.h b/target/ppc/spr_common.h
> similarity index 99%
> rename from target/ppc/spr_tcg.h
> rename to target/ppc/spr_common.h
> index df2abacc64..5aec76ade4 100644
> --- a/target/ppc/spr_tcg.h
> +++ b/target/ppc/spr_common.h
> @@ -16,8 +16,8 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library; if not, see 
> <http://www.gnu.org/licenses/>.
>   */
> -#ifndef SPR_TCG_H
> -#define SPR_TCG_H
> +#ifndef SPR_COMMON_H
> +#define SPR_COMMON_H
>  
>  #define SPR_NOACCESS (&spr_noaccess)
>  
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 2eaffd432a..ecc5a104e0 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -35,7 +35,7 @@
>  #include "exec/translator.h"
>  #include "exec/log.h"
>  #include "qemu/atomic128.h"
> -#include "spr_tcg.h"
> +#include "spr_common.h"
>  
>  #include "qemu/qemu-print.h"
>  #include "qapi/error.h"

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [PATCH v2 2/3] spapr: Add SPAPR_CAP_AIL_MODE_3 for AIL mode 3 support for H_SET_MODE hcall

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 04:39:02PM +1000, Nicholas Piggin wrote:
> The behaviour of the Address Translation Mode on Interrupt resource is
> not consistently supported by all CPU versions or all KVM versions:
> KVM-HV does not support mode 2, and does not support mode 3 on POWER7 or
> early POWER9 processors. KVM PR only supports mode 0. TCG supports all
> modes (0, 2, 3). This leads to inconsistencies in guest behaviour and
> could cause problems migrating guests.
> 
> This was not noticeable for Linux guests for a long time because the
> kernel only uses modes 0 and 3, and it used to consider AIL-3 to be
> advisory in that it would always keep the AIL-0 vectors around. Recent
> Linux guests depend on the AIL mode working as specified in order to
> support the SCV facility interrupt. If AIL-3 can not be provided, then
> Linux must be given an error so it can disable the SCV facility, rather
> than silently failing.
> 
> Add the ail-mode-3 capability to specify that AIL-3 is supported. AIL-0
> is implied as the baseline, and AIL-2 is no longer supported by spapr.
> AIL-2 is not known to be used by any software, but support in TCG could
> be restored with an ail-mode-2 capability quite easily if a regression
> is reported.
> 
> Modify the H_SET_MODE Address Translation Mode on Interrupt resource
> handler to check capabilities and correctly return error if not
> supported.
> 
> A heuristic is added for KVM to determine AIL-3 support before the
> introduction of a new KVM CAP, because blanket disabling AIL-3 has too
> much performance cost.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: David Gibson 

[snip]
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..1338c41f8f 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -2563,6 +2563,35 @@ int kvmppc_has_cap_rpt_invalidate(void)
>  return cap_rpt_invalidate;
>  }
>  
> +bool kvmppc_supports_ail_3(void)
> +{
> +PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
> +
> +/*
> + * KVM PR only supports AIL-0
> + */
> +if (kvmppc_is_pr(kvm_state)) {
> +return 0;
> +}
> +
> +/*
> + * KVM HV hosts support AIL-3 on POWER8 and above, except for radix
> + * mode on some early POWER9s.
> + */
> +if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> +return 0;
> +}
> +
> +/* These tests match the CPU_FTR_P9_RADIX_PREFETCH_BUG flag in Linux */
> +if (((pcc->pvr & 0xff00) == CPU_POWERPC_POWER9_DD1) ||
> +((pcc->pvr & 0xff00) == CPU_POWERPC_POWER9_DD20) ||
> +((pcc->pvr & 0xff00) == CPU_POWERPC_POWER9_DD21)) {
> +return 0;
> +}

Deducing what KVM supports rather than getting it to tell us
explicitly with a cap is usually frowned upon.  However, given the
earlier discussion, I'm satisfied that this is the least bad available
option, at least for now.


> +
> +return 1;
> +}
> +
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
>  {
>  uint32_t host_pvr = mfpvr();
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..7bba26d1da 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -73,6 +73,7 @@ int kvmppc_set_cap_nested_kvm_hv(int enable);
>  int kvmppc_get_cap_large_decr(void);
>  int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
>  int kvmppc_has_cap_rpt_invalidate(void);
> +bool kvmppc_supports_ail_3(void);
>  int kvmppc_enable_hwrng(void);
>  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> @@ -393,6 +394,11 @@ static inline int kvmppc_has_cap_rpt_invalidate(void)
>  return false;
>  }
>  
> +static inline bool kvmppc_supports_ail_3(void)
> +{
> +return false;
> +}
> +
>  static inline int kvmppc_enable_hwrng(void)
>  {
>  return -1;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [PATCH v2 26/27] target/ppc: cpu_init: Move check_pow and QOM macros to a header

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 01:24:25PM -0300, Fabiano Rosas wrote:
> These will need to be accessed from other files once we move the CPUs
> code to separate files.
> 
> The check_pow_hid0 and check_pow_hid0_74xx are too specific to be
> moved to a header so I'll deal with them later when splitting this
> code between the multiple CPU families.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: David Gibson 

> ---
>  target/ppc/cpu.h  | 39 +++
>  target/ppc/cpu_init.c | 37 -
>  2 files changed, 39 insertions(+), 37 deletions(-)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 6a06a7f533..1d33e8afea 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -2733,4 +2733,43 @@ void dump_mmu(CPUPPCState *env);
>  void ppc_maybe_bswap_register(CPUPPCState *env, uint8_t *mem_buf, int len);
>  void ppc_store_vscr(CPUPPCState *env, uint32_t vscr);
>  uint32_t ppc_get_vscr(CPUPPCState *env);
> +
> +/*/
> +/* Power management enable checks
> */
> +static inline int check_pow_none(CPUPPCState *env)
> +{
> +return 0;
> +}
> +
> +static inline int check_pow_nocheck(CPUPPCState *env)
> +{
> +return 1;
> +}
> +
> +/*/
> +/* PowerPC implementations definitions   
> */
> +
> +#define POWERPC_FAMILY(_name)   \
> +static void \
> +glue(glue(ppc_, _name), _cpu_family_class_init)(ObjectClass *, void *); \
> +\
> +static const TypeInfo   \
> +glue(glue(ppc_, _name), _cpu_family_type_info) = {  \
> +.name = stringify(_name) "-family-" TYPE_POWERPC_CPU,   \
> +.parent = TYPE_POWERPC_CPU, \
> +.abstract = true,   \
> +.class_init = glue(glue(ppc_, _name), _cpu_family_class_init),  \
> +};  \
> +\
> +static void glue(glue(ppc_, _name), _cpu_family_register_types)(void)   \
> +{   \
> +type_register_static(   \
> +&glue(glue(ppc_, _name), _cpu_family_type_info));   \
> +}   \
> +\
> +type_init(glue(glue(ppc_, _name), _cpu_family_register_types))  \
> +\
> +static void glue(glue(ppc_, _name), _cpu_family_class_init)
> +
> +
>  #endif /* PPC_CPU_H */
> diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
> index 4c6b462cad..5bbbcceb55 100644
> --- a/target/ppc/cpu_init.c
> +++ b/target/ppc/cpu_init.c
> @@ -2484,18 +2484,6 @@ static void init_excp_POWER10(CPUPPCState *env)
>  
>  #endif
>  
> -/*/
> -/* Power management enable checks
> */
> -static int check_pow_none(CPUPPCState *env)
> -{
> -return 0;
> -}
> -
> -static int check_pow_nocheck(CPUPPCState *env)
> -{
> -return 1;
> -}
> -
>  static int check_pow_hid0(CPUPPCState *env)
>  {
>  if (env->spr[SPR_HID0] & 0x00E0) {
> @@ -2514,31 +2502,6 @@ static int check_pow_hid0_74xx(CPUPPCState *env)
>  return 0;
>  }
>  
> -/*/
> -/* PowerPC implementations definitions   
> */
> -
> -#define POWERPC_FAMILY(_name)   \
> -static void \
> -glue(glue(ppc_, _name), _cpu_family_class_init)(ObjectClass *, void *); \
> -\
> -static const TypeInfo   \
> -glue(glue(ppc_, _name), _cpu_family_type_info) = {  \
> -.name = stringify(_name) "-family-" TYPE_POWERPC_CPU,   \
> -.parent = TYPE_POWERPC_CPU, \
> -.abstract = true,   \
> -.class_init = glue(glue(ppc_, _name), _cpu_family_class_init),  \
> -};

Re: [PATCH 22/27] target/ppc: cpu_init: Rename register_ne_601_sprs

2022-02-16 Thread David Gibson

On Wed, Feb 16, 2022 at 10:19:40AM -0300, Fabiano Rosas wrote:
> David Gibson  writes:
> 
> > On Tue, Feb 15, 2022 at 06:41:43PM -0300, Fabiano Rosas wrote:
> >> The important part of this function is that it applies to non-embedded
> >> CPUs, not that it also applies to the 601. We removed support for the
> >> 601 anyway, so rename this function.
> >> 
> >> Signed-off-by: Fabiano Rosas 
> >
> > Reviewed-by: David Gibson 
> >
> > Although, I wonder if "books_common" or something might be a better
> > name, though.  Admittedly, I don't think the "BookS" terminology
> > existed at the time of most of these earlier CPUs.  However, these
> > days the majority of 7xx chips are probably in embedded applications,
> > even if they weren't designed for an embedded chip line.
> 
> The 'ne' in the original name was probably meant to signify
> not-BookE. So non_booke perhaps would work? The thing with calling it
> books_common is that we're using BookS only for the 970 and upwards and
> this function applies to 6xx, 7xx, 74xx as well.

So, an informal server / embedded split is older than the "Book S"
vs. "Book E" terminology, so it's a question of whether we want to
apply the newer terminology to the older systems.  As you say, that's
arguably problematic for BookS, but it's equally troublesome for "not
BookE": 40x and possibly 44x as well also predate the "Book E"
terminology (and certainly don't meet even the earliest version of the
Book E spec).  However they are from the "embedded" branch of cpu
models, and do not have the registers that the ne_601 function
creates.

Naming things is hard :/.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

signature.asc
Description: PGP signature

qemu crash 100% CPU with Ubuntu10.04 guest (solved)

2022-02-16 Thread Ben Smith

Hi All,

I'm cross-posting this from Reddit qemu_kvm, in case it helps in some
way. I know my setup is ancient and unique; let me know if you would
like more info.

Symptoms:
1. Ubuntu10.04 32-bit guest locks up randomly between 0 and 30 days.
2. The console shows a CPU trace dump, nothing else logged on the guest or host.
3. Host system (Ubuntu20.04) 100% CPU for qemu process.

Solution:
When using virt-install, always use the "--os-variant" parameter!
e.g. --os-variant ubuntu10.04

From the man page "--os-variant... Optimize the guest configuration
for a specific operating system".
In this case, "optimize" apparently means "stop the crashing".

I was deliberately avoiding the option because the VM was already
performing much better than expected and I didn't want to complicate
the configuration.

This was very, very painful to troubleshoot; involving spinning up 60
VMs simultaneously, waiting for a failure, changing one parameter,
repeat. :(

Re: Adding a handshake to qemu-guest-agent

2022-02-16 Thread Michael Roth

On Wed, Feb 16, 2022 at 10:12:36AM +0100, Markus Armbruster wrote:
> Michael Roth  writes:
> 
> > On Mon, Feb 14, 2022 at 03:14:37PM +0100, Markus Armbruster wrote:
> >> Cc: the qemu-ga maintainer
> >> 
> >> John Snow  writes:
> >> 
> >> > [Moving our discussion upstream, because it stopped being brief and 
> >> > simple.]
> >
> > Hi John, Markus,
> >
> >> 
> >> Motivation: qemu-ga doesn't do capability negotiation as specified in
> >> docs/interop/qmp-spec.txt.
> >> 
> >> Reminder: qmp-spec.txt specifies the server shall send a greeting
> >> containing the capabilities on offer.  The client shall send a
> >> qmp_capabilities command before any other command.
> >> 
> >> We can't just fix qemu-ga to comply, because it would break existing
> >> clients.
> >> 
> >> We could document its behavior in qmp-spec.txt.  Easy enough, but also
> >> kind of sad.
> >
> > I'm not sure we could've ever done it QMP-style with the initial
> > greeting/negotiation mode. It's been a while, but I recall virtio-serial
> > chardev in guest not having a very straight-forward way of flushing out
> > data from the vring after a new client connects on the host side, so
> > new clients had a chance of reading left-over garbage from previous
> > client sessions. Or maybe it was open/close/open on the guest/chardev
> > side that didn't cause the flush... anyway:
> >
> > This is why guest-sync was there, so you could verify the stream was
> > in sync with a given "session ID" before continuing. But that doesn't
> > help much if the stream is in some garbage state that parser can't
> > recover from...
> >
> > This is why guest-sync-delimited was introduced; it inserts a 0xFF
> > sentinel value (invalid for any normal QMP stream) prior to the response that
> > a client can scan for to flush the stream. Similarly, clients are
> > supposed to precede guest-sync/guest-sync-delimited with a 0xFF byte so QGA
> > doesn't get stuck trying to parse a partial read from an earlier client that is
> > 'eating' a new request from a new client connection. I don't think these are really
> > issues with vsock (or the other transports QGA accepts), but AFAIK
> > Windows is still mostly reliant on virtio-serial, so these are probably
> > still needed.
> 
> I believe you're right about the reason being virtio-serial.  I
> documented it that way in commit 72e9e569d0 "docs/interop/qmp-spec: How
> to force known good parser state".
> 
> 2.6 Forcing the JSON parser into known-good state
> -
> 
> Incomplete or invalid input can leave the server's JSON parser in a
> state where it can't parse additional commands.  To get it back into
> known-good state, the client should provoke a lexical error.
> 
> The cleanest way to do that is sending an ASCII control character
> other than '\t' (horizontal tab), '\r' (carriage return), or '\n' (new
> line).
> 
> Sadly, older versions of QEMU can fail to flag this as an error.  If a
> client needs to deal with them, it should send a 0xFF byte.
> 
> 2.7 QGA Synchronization
> ---
> 
> When a client connects to QGA over a transport lacking proper
> connection semantics such as virtio-serial, QGA may have read partial
> input from a previous client.  The client needs to force QGA's parser
> into known-good state using the previous section's technique.
> Moreover, the client may receive output a previous client didn't read.
> To help with skipping that output, QGA provides the
> 'guest-sync-delimited' command.  Refer to its documentation for
> details.
> 
> 0xFF is invalid UTF-8, which is kind of icky.  We should've used a
> proper control character like EOT (end of transmission) from the start.
> Water under the bridge.
> 
> guest-sync has another design flaw: an unread command reply consisting
> of just an integer can be confused with guest-sync's reply.  Unlikely as
> long as guest-sync's @id argument is chosen at random, as its
> documentation demands.
> 
> guest-sync could be deprecated, I guess.  

Yes, should probably be deprecated in favor of guest-sync-delimited. I
left it for clients that really don't want to dig into the transport
layer to search for 0xFF, but still want at least some ability to
re-sync.

> 
> The @id argument of guest-sync and guest-sync-delimited feels kind of
> redundant with the command object's @id member.  Except QGA didn't
> conform to the QMP spec until commit 4eaca8de26 "qmp: common 'id'
> handling & make QGA conform to QMP spec" (v4.0.0).  More water under the
> bridge.
> 
> Note that there's no need for all this when the transport provides
> proper connection semantics.  Clients relying on connection semantics
> work fine even when they don't bother with this syncing stuff.  Do such
> clients exist?  We probably don't know.  May or may not matter.

True, I think only virtio-serial and maybe isa-serial require the sync.
I was hoping virtio-vsock might quickly become the

Re: [PATCH v3 2/7] malta: Move PCI interrupt handling from gt64xxx_pci to piix4

2022-02-16 Thread Philippe Mathieu-Daudé via


On 16/2/22 23:45, Bernhard Beschow wrote:

Handling PCI interrupts in piix4 increases cohesion and reduces differences
between piix4 and piix3.

Signed-off-by: Bernhard Beschow 
---
  hw/isa/piix4.c | 55 ++
  hw/mips/gt64xxx_pci.c  | 60 --
  hw/mips/malta.c|  6 +
  include/hw/mips/mips.h |  2 +-
  4 files changed, 62 insertions(+), 61 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH v3 3/7] hw/isa/piix4: Resolve redundant i8259[] attribute

2022-02-16 Thread Philippe Mathieu-Daudé via


On 16/2/22 23:45, Bernhard Beschow wrote:

This is a follow-up on patch "malta: Move PCI interrupt handling from
gt64xxx_pci to piix4" where i8259[] was moved from MaltaState to
PIIX4State to make the code movement more obvious. However, i8259[]
seems redundant to *isa, so remove it.

Signed-off-by: Bernhard Beschow 
---
  hw/isa/piix4.c | 7 +--
  1 file changed, 1 insertion(+), 6 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH v3 7/7] hw/mips/gt64xxx_pci: Resolve gt64120_register()

2022-02-16 Thread Philippe Mathieu-Daudé via


On 16/2/22 23:45, Bernhard Beschow wrote:

Now that gt64120_register() lost its pic parameter, there is an
opportunity to remove it. gt64120_register() is old style by wrapping
qdev API, and the new style is to use qdev directly. So take the
opportunity and modernize the code.

Suggested-by: BALATON Zoltan 
Signed-off-by: Bernhard Beschow 
---
  hw/mips/gt64xxx_pci.c  | 21 -
  hw/mips/malta.c| 13 -
  include/hw/mips/mips.h |  3 ---
  3 files changed, 12 insertions(+), 25 deletions(-)



  static void gt64120_pci_realize(PCIDevice *d, Error **errp)
diff --git a/hw/mips/malta.c b/hw/mips/malta.c
index 13254dbc89..16fdaed3db 100644
--- a/hw/mips/malta.c
+++ b/hw/mips/malta.c
@@ -38,6 +38,7 @@
  #include "hw/mips/mips.h"
  #include "hw/mips/cpudevs.h"
  #include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
  #include "qemu/log.h"
  #include "hw/mips/bios.h"
  #include "hw/ide.h"
@@ -1230,7 +1231,7 @@ void mips_malta_init(MachineState *machine)
  const size_t smbus_eeprom_size = 8 * 256;
  uint8_t *smbus_eeprom_buf = g_malloc0(smbus_eeprom_size);
  uint64_t kernel_entry, bootloader_run_addr;
-PCIBus *pci_bus;
+PCIHostState *phb;
  ISABus *isa_bus;
  qemu_irq cbus_irq, i8259_irq;
  I2CBus *smbus;
@@ -1390,7 +1391,9 @@ void mips_malta_init(MachineState *machine)
  stl_p(memory_region_get_ram_ptr(bios_copy) + 0x10, 0x0420);
  
  /* Northbridge */

-pci_bus = gt64120_register();
+dev = qdev_new("gt64120");
+phb = PCI_HOST_BRIDGE(dev);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);


Nice cleanup!

I might move the phb assignment after the realize() for clarity
(usually we only set qdev properties between qdev_new and the
realize).

Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH v3 6/7] hw/isa/piix4: Replace some magic IRQ constants

2022-02-16 Thread Philippe Mathieu-Daudé via


On 16/2/22 23:45, Bernhard Beschow wrote:

This is a follow-up on patch "malta: Move PCI interrupt handling from
gt64xxx_pci to piix4". gt64xxx_pci used magic constants, and probably
didn't want to use piix4-specific constants. Now that the interrupt
handling resides in piix4, its constants can be used.

Signed-off-by: Bernhard Beschow 
---
  hw/isa/piix4.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

[PATCH v3 5/7] hw/isa/piix4: Resolve global instance variable

2022-02-16 Thread Bernhard Beschow

Now that piix4_set_irq's opaque parameter references own PIIX4State,
piix4_dev becomes redundant.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/isa/piix4.c| 10 +++---
 include/hw/southbridge/piix.h |  2 --
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
index caa2002e2c..2e9b5ccada 100644
--- a/hw/isa/piix4.c
+++ b/hw/isa/piix4.c
@@ -39,8 +39,6 @@
 #include "sysemu/runstate.h"
 #include "qom/object.h"
 
-PCIDevice *piix4_dev;
-
 struct PIIX4State {
 PCIDevice dev;
 qemu_irq cpu_intr;
@@ -58,16 +56,16 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
level)
 {
 int i, pic_irq, pic_level;
 PIIX4State *s = opaque;
-PCIBus *bus = pci_get_bus(piix4_dev);
+PCIBus *bus = pci_get_bus(&s->dev);
 
 /* now we change the pic irq level according to the piix irq mappings */
 /* XXX: optimize */
-pic_irq = piix4_dev->config[PIIX_PIRQCA + irq_num];
+pic_irq = s->dev.config[PIIX_PIRQCA + irq_num];
 if (pic_irq < 16) {
 /* The pic level is the logical OR of all the PCI irqs mapped to it. */
 pic_level = 0;
 for (i = 0; i < 4; i++) {
-if (pic_irq == piix4_dev->config[PIIX_PIRQCA + i]) {
+if (pic_irq == s->dev.config[PIIX_PIRQCA + i]) {
 pic_level |= pci_bus_get_irq_level(bus, i);
 }
 }
@@ -219,8 +217,6 @@ static void piix4_realize(PCIDevice *dev, Error **errp)
 return;
 }
 isa_init_irq(ISA_DEVICE(&s->rtc), &s->rtc.irq, RTC_ISA_IRQ);
-
-piix4_dev = dev;
 }
 
 static void piix4_init(Object *obj)
diff --git a/include/hw/southbridge/piix.h b/include/hw/southbridge/piix.h
index 6387f2b612..f63f83e5c6 100644
--- a/include/hw/southbridge/piix.h
+++ b/include/hw/southbridge/piix.h
@@ -70,8 +70,6 @@ typedef struct PIIXState PIIX3State;
 DECLARE_INSTANCE_CHECKER(PIIX3State, PIIX3_PCI_DEVICE,
  TYPE_PIIX3_PCI_DEVICE)
 
-extern PCIDevice *piix4_dev;
-
 PIIX3State *piix3_create(PCIBus *pci_bus, ISABus **isa_bus);
 
 DeviceState *piix4_create(PCIBus *pci_bus, ISABus **isa_bus, I2CBus **smbus);
-- 
2.35.1

Re: [PATCH v3] Hexagon (target/hexagon) properly handle NaN in dfmin/dfmax/sfmin/sfmax

2022-02-16 Thread Richard Henderson


On 2/16/22 15:39, Taylor Simpson wrote:

The float??_minnum implementation differs from Hexagon for SNaN,
it returns NaN, but Hexagon returns the other input.  So, we use
float??_minimum_number.  For double precision, we check for QNaN and
raise the invalid flag.


I'm surprised that the behaviour for double differs from single, but the docs are light on 
the subject.  Anyway,


Reviewed-by: Richard Henderson 


r~

[PATCH v3 7/7] hw/mips/gt64xxx_pci: Resolve gt64120_register()

2022-02-16 Thread Bernhard Beschow

Now that gt64120_register() lost its pic parameter, there is an
opportunity to remove it. gt64120_register() is old style by wrapping
qdev API, and the new style is to use qdev directly. So take the
opportunity and modernize the code.

Suggested-by: BALATON Zoltan 
Signed-off-by: Bernhard Beschow 
---
 hw/mips/gt64xxx_pci.c  | 21 -
 hw/mips/malta.c| 13 -
 include/hw/mips/mips.h |  3 ---
 3 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/hw/mips/gt64xxx_pci.c b/hw/mips/gt64xxx_pci.c
index eb205d6d70..e0ff1b5566 100644
--- a/hw/mips/gt64xxx_pci.c
+++ b/hw/mips/gt64xxx_pci.c
@@ -26,7 +26,6 @@
 #include "qapi/error.h"
 #include "qemu/units.h"
 #include "qemu/log.h"
-#include "hw/mips/mips.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_host.h"
 #include "migration/vmstate.h"
@@ -1151,30 +1150,18 @@ static void gt64120_reset(DeviceState *dev)
 static void gt64120_realize(DeviceState *dev, Error **errp)
 {
 GT64120State *s = GT64120_PCI_HOST_BRIDGE(dev);
+PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 
 memory_region_init_io(&s->ISD_mem, OBJECT(dev), &isd_mem_ops, s,
   "gt64120-isd", 0x1000);
-}
-
-PCIBus *gt64120_register(void)
-{
-GT64120State *d;
-PCIHostState *phb;
-DeviceState *dev;
-
-dev = qdev_new(TYPE_GT64120_PCI_HOST_BRIDGE);
-d = GT64120_PCI_HOST_BRIDGE(dev);
-phb = PCI_HOST_BRIDGE(dev);
-memory_region_init(&d->pci0_mem, OBJECT(dev), "pci0-mem", 4 * GiB);
-address_space_init(&d->pci0_mem_as, &d->pci0_mem, "pci0-mem");
+memory_region_init(&s->pci0_mem, OBJECT(dev), "pci0-mem", 4 * GiB);
+address_space_init(&s->pci0_mem_as, &s->pci0_mem, "pci0-mem");
 phb->bus = pci_root_bus_new(dev, "pci",
-&d->pci0_mem,
+&s->pci0_mem,
 get_system_io(),
 PCI_DEVFN(18, 0), TYPE_PCI_BUS);
-sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), _fatal);
 
 pci_create_simple(phb->bus, PCI_DEVFN(0, 0), "gt64120_pci");
-return phb->bus;
 }
 
 static void gt64120_pci_realize(PCIDevice *d, Error **errp)
diff --git a/hw/mips/malta.c b/hw/mips/malta.c
index 13254dbc89..16fdaed3db 100644
--- a/hw/mips/malta.c
+++ b/hw/mips/malta.c
@@ -38,6 +38,7 @@
 #include "hw/mips/mips.h"
 #include "hw/mips/cpudevs.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
 #include "qemu/log.h"
 #include "hw/mips/bios.h"
 #include "hw/ide.h"
@@ -1230,7 +1231,7 @@ void mips_malta_init(MachineState *machine)
 const size_t smbus_eeprom_size = 8 * 256;
 uint8_t *smbus_eeprom_buf = g_malloc0(smbus_eeprom_size);
 uint64_t kernel_entry, bootloader_run_addr;
-PCIBus *pci_bus;
+PCIHostState *phb;
 ISABus *isa_bus;
 qemu_irq cbus_irq, i8259_irq;
 I2CBus *smbus;
@@ -1390,7 +1391,9 @@ void mips_malta_init(MachineState *machine)
 stl_p(memory_region_get_ram_ptr(bios_copy) + 0x10, 0x0420);
 
 /* Northbridge */
-pci_bus = gt64120_register();
+dev = qdev_new("gt64120");
+phb = PCI_HOST_BRIDGE(dev);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
 /*
  * The whole address space decoded by the GT-64120A doesn't generate
  * exception when accessing invalid memory. Create an empty slot to
@@ -1399,7 +1402,7 @@ void mips_malta_init(MachineState *machine)
 empty_slot_init("GT64120", 0, 0x2000);
 
 /* Southbridge */
-dev = piix4_create(pci_bus, &isa_bus, &smbus);
+dev = piix4_create(phb->bus, &isa_bus, &smbus);
 
 /* Interrupt controller */
 qdev_connect_gpio_out_named(dev, "intr", 0, i8259_irq);
@@ -1414,10 +1417,10 @@ void mips_malta_init(MachineState *machine)
 isa_create_simple(isa_bus, TYPE_FDC37M81X_SUPERIO);
 
 /* Network card */
-network_init(pci_bus);
+network_init(phb->bus);
 
 /* Optional PCI video card */
-pci_vga_init(pci_bus);
+pci_vga_init(phb->bus);
 }
 
 static void mips_malta_instance_init(Object *obj)
diff --git a/include/hw/mips/mips.h b/include/hw/mips/mips.h
index ff88942e63..101799f7d3 100644
--- a/include/hw/mips/mips.h
+++ b/include/hw/mips/mips.h
@@ -9,9 +9,6 @@
 
 #include "exec/memory.h"
 
-/* gt64xxx.c */
-PCIBus *gt64120_register(void);
-
 /* bonito.c */
 PCIBus *bonito_init(qemu_irq *pic);
 
-- 
2.35.1

[PATCH v3 6/7] hw/isa/piix4: Replace some magic IRQ constants

2022-02-16 Thread Bernhard Beschow

This is a follow-up on patch "malta: Move PCI interrupt handling from
gt64xxx_pci to piix4". gt64xxx_pci used magic constants, and probably
didn't want to use piix4-specific constants. Now that the interrupt
handling resides in piix4, its constants can be used.

Signed-off-by: Bernhard Beschow 
---
 hw/isa/piix4.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
index 2e9b5ccada..f876c71750 100644
--- a/hw/isa/piix4.c
+++ b/hw/isa/piix4.c
@@ -61,10 +61,10 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
level)
 /* now we change the pic irq level according to the piix irq mappings */
 /* XXX: optimize */
 pic_irq = s->dev.config[PIIX_PIRQCA + irq_num];
-if (pic_irq < 16) {
+if (pic_irq < ISA_NUM_IRQS) {
 /* The pic level is the logical OR of all the PCI irqs mapped to it. */
 pic_level = 0;
-for (i = 0; i < 4; i++) {
+for (i = 0; i < PIIX_NUM_PIRQS; i++) {
 if (pic_irq == s->dev.config[PIIX_PIRQCA + i]) {
 pic_level |= pci_bus_get_irq_level(bus, i);
 }
@@ -315,7 +315,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
**isa_bus, I2CBus **smbus)
NULL, 0, NULL);
 }
 
-pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, 4);
+pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, PIIX_NUM_PIRQS);
 
 return dev;
 }
-- 
2.35.1

[PATCH v3 3/7] hw/isa/piix4: Resolve redundant i8259[] attribute

2022-02-16 Thread Bernhard Beschow

This is a follow-up on patch "malta: Move PCI interrupt handling from
gt64xxx_pci to piix4" where i8259[] was moved from MaltaState to
PIIX4State to make the code movement more obvious. However, i8259[]
seems redundant to *isa, so remove it.

Signed-off-by: Bernhard Beschow 
---
 hw/isa/piix4.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
index 196b56e69c..179968b18e 100644
--- a/hw/isa/piix4.c
+++ b/hw/isa/piix4.c
@@ -45,7 +45,6 @@ struct PIIX4State {
 PCIDevice dev;
 qemu_irq cpu_intr;
 qemu_irq *isa;
-qemu_irq i8259[ISA_NUM_IRQS];
 
 RTCState rtc;
 /* Reset Control Register */
@@ -320,11 +319,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
**isa_bus, I2CBus **smbus)
NULL, 0, NULL);
 }
 
-pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->i8259, 4);
-
-for (int i = 0; i < ISA_NUM_IRQS; i++) {
-s->i8259[i] = qdev_get_gpio_in_named(dev, "isa", i);
-}
+pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->isa, 4);
 
 return dev;
 }
-- 
2.35.1

[PATCH v3 2/7] malta: Move PCI interrupt handling from gt64xxx_pci to piix4

2022-02-16 Thread Bernhard Beschow

Handling PCI interrupts in piix4 increases cohesion and reduces differences
between piix4 and piix3.

Signed-off-by: Bernhard Beschow 
---
 hw/isa/piix4.c | 55 ++
 hw/mips/gt64xxx_pci.c  | 60 --
 hw/mips/malta.c|  6 +
 include/hw/mips/mips.h |  2 +-
 4 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
index 0fe7b69bc4..196b56e69c 100644
--- a/hw/isa/piix4.c
+++ b/hw/isa/piix4.c
@@ -45,6 +45,7 @@ struct PIIX4State {
 PCIDevice dev;
 qemu_irq cpu_intr;
 qemu_irq *isa;
+qemu_irq i8259[ISA_NUM_IRQS];
 
 RTCState rtc;
 /* Reset Control Register */
@@ -54,6 +55,27 @@ struct PIIX4State {
 
 OBJECT_DECLARE_SIMPLE_TYPE(PIIX4State, PIIX4_PCI_DEVICE)
 
+static void piix4_set_irq(void *opaque, int irq_num, int level)
+{
+int i, pic_irq, pic_level;
+qemu_irq *pic = opaque;
+PCIBus *bus = pci_get_bus(piix4_dev);
+
+/* now we change the pic irq level according to the piix irq mappings */
+/* XXX: optimize */
+pic_irq = piix4_dev->config[PIIX_PIRQCA + irq_num];
+if (pic_irq < 16) {
+/* The pic level is the logical OR of all the PCI irqs mapped to it. */
+pic_level = 0;
+for (i = 0; i < 4; i++) {
+if (pic_irq == piix4_dev->config[PIIX_PIRQCA + i]) {
+pic_level |= pci_bus_get_irq_level(bus, i);
+}
+}
+qemu_set_irq(pic[pic_irq], pic_level);
+}
+}
+
 static void piix4_isa_reset(DeviceState *dev)
 {
 PIIX4State *d = PIIX4_PCI_DEVICE(dev);
@@ -248,8 +270,34 @@ static void piix4_register_types(void)
 
 type_init(piix4_register_types)
 
+static int pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
+{
+int slot;
+
+slot = PCI_SLOT(pci_dev->devfn);
+
+switch (slot) {
+/* PIIX4 USB */
+case 10:
+return 3;
+/* AMD 79C973 Ethernet */
+case 11:
+return 1;
+/* Crystal 4281 Sound */
+case 12:
+return 2;
+/* PCI slot 1 to 4 */
+case 18 ... 21:
+return ((slot - 18) + irq_num) & 0x03;
+/* Unknown device, don't do any translation */
+default:
+return irq_num;
+}
+}
+
 DeviceState *piix4_create(PCIBus *pci_bus, ISABus **isa_bus, I2CBus **smbus)
 {
+PIIX4State *s;
 PCIDevice *pci;
 DeviceState *dev;
 int devfn = PCI_DEVFN(10, 0);
@@ -257,6 +305,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
**isa_bus, I2CBus **smbus)
 pci = pci_create_simple_multifunction(pci_bus, devfn,  true,
   TYPE_PIIX4_PCI_DEVICE);
 dev = DEVICE(pci);
+s = PIIX4_PCI_DEVICE(pci);
 if (isa_bus) {
 *isa_bus = ISA_BUS(qdev_get_child_bus(dev, "isa.0"));
 }
@@ -271,5 +320,11 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
**isa_bus, I2CBus **smbus)
NULL, 0, NULL);
 }
 
+pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->i8259, 4);
+
+for (int i = 0; i < ISA_NUM_IRQS; i++) {
+s->i8259[i] = qdev_get_gpio_in_named(dev, "isa", i);
+}
+
 return dev;
 }
diff --git a/hw/mips/gt64xxx_pci.c b/hw/mips/gt64xxx_pci.c
index 4cbd0911f5..eb205d6d70 100644
--- a/hw/mips/gt64xxx_pci.c
+++ b/hw/mips/gt64xxx_pci.c
@@ -29,7 +29,6 @@
 #include "hw/mips/mips.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_host.h"
-#include "hw/southbridge/piix.h"
 #include "migration/vmstate.h"
 #include "hw/intc/i8259.h"
 #include "hw/irq.h"
@@ -981,53 +980,6 @@ static const MemoryRegionOps isd_mem_ops = {
 },
 };
 
-static int gt64120_pci_map_irq(PCIDevice *pci_dev, int irq_num)
-{
-int slot;
-
-slot = PCI_SLOT(pci_dev->devfn);
-
-switch (slot) {
-/* PIIX4 USB */
-case 10:
-return 3;
-/* AMD 79C973 Ethernet */
-case 11:
-return 1;
-/* Crystal 4281 Sound */
-case 12:
-return 2;
-/* PCI slot 1 to 4 */
-case 18 ... 21:
-return ((slot - 18) + irq_num) & 0x03;
-/* Unknown device, don't do any translation */
-default:
-return irq_num;
-}
-}
-
-static void gt64120_pci_set_irq(void *opaque, int irq_num, int level)
-{
-int i, pic_irq, pic_level;
-qemu_irq *pic = opaque;
-PCIBus *bus = pci_get_bus(piix4_dev);
-
-/* now we change the pic irq level according to the piix irq mappings */
-/* XXX: optimize */
-pic_irq = piix4_dev->config[PIIX_PIRQCA + irq_num];
-if (pic_irq < 16) {
-/* The pic level is the logical OR of all the PCI irqs mapped to it. */
-pic_level = 0;
-for (i = 0; i < 4; i++) {
-if (pic_irq == piix4_dev->config[PIIX_PIRQCA + i]) {
-pic_level |= pci_bus_get_irq_level(bus, i);
-}
-}
-qemu_set_irq(pic[pic_irq], pic_level);
-}
-}
-
-
 static void gt64120_reset(DeviceState *dev)
 {
 GT64120State *s = GT64120_PCI_HOST_BRIDGE(dev);
@@ -1204,7 +1156,7 @@ static void

[PATCH v3 4/7] hw/isa/piix4: Pass PIIX4State as opaque parameter for piix4_set_irq()

2022-02-16 Thread Bernhard Beschow

Passing PIIX4State rather than just the qemu_irq allows for resolving
the global piix4_dev variable.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/isa/piix4.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/isa/piix4.c b/hw/isa/piix4.c
index 179968b18e..caa2002e2c 100644
--- a/hw/isa/piix4.c
+++ b/hw/isa/piix4.c
@@ -57,7 +57,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(PIIX4State, PIIX4_PCI_DEVICE)
 static void piix4_set_irq(void *opaque, int irq_num, int level)
 {
 int i, pic_irq, pic_level;
-qemu_irq *pic = opaque;
+PIIX4State *s = opaque;
 PCIBus *bus = pci_get_bus(piix4_dev);
 
 /* now we change the pic irq level according to the piix irq mappings */
@@ -71,7 +71,7 @@ static void piix4_set_irq(void *opaque, int irq_num, int 
level)
 pic_level |= pci_bus_get_irq_level(bus, i);
 }
 }
-qemu_set_irq(pic[pic_irq], pic_level);
+qemu_set_irq(s->isa[pic_irq], pic_level);
 }
 }
 
@@ -319,7 +319,7 @@ DeviceState *piix4_create(PCIBus *pci_bus, ISABus 
**isa_bus, I2CBus **smbus)
NULL, 0, NULL);
 }
 
-pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s->isa, 4);
+pci_bus_irqs(pci_bus, piix4_set_irq, pci_slot_get_pirq, s, 4);
 
 return dev;
 }
-- 
2.35.1

[PATCH v3 0/7] malta: Fix PCI IRQ levels to be preserved during migration, cleanup

2022-02-16 Thread Bernhard Beschow

Tested with [1]:

  qemu-system-mipsel -M malta -kernel vmlinux-3.2.0-4-4kc-malta -hda \
  debian_wheezy_mipsel_standard.qcow2 -append "root=/dev/sda1 console=tty0"

It was possible to log in as root and `poweroff` the machine.

Moreover, I ran:

  :$ make check
  Ok: 569
  Expected Fail:  0
  Fail:   0
  Unexpected Pass:0
  Skipped:178
  Timeout:0

[1] https://people.debian.org/~aurel32/qemu/mips/


v3:
  The migration bug now gets fixed in gt64xxx_pci before any cleanup. As
suggested by PMM the patch is based on commit e735b55a8c11.
  The code movement patch now moves the already fixed code. I might be a bit
too conservative here by removing Philippe's Reviewed-By tag.
  As suggested by BALATON Zoltan, the redundant i8259[] attribute is now
resolved immediately after the code movement. As a side effect, it also
removes moved code which doesn't adhere to the coding style (local loop
variable).
  To address BALATON Zoltan's comment and to reduce the number of required
Reviewed-By's, only piix4_set_irq() is modified to expect own DeviceState
parameter. Up to v2, all remaining set_irq() functions were changed this
way.
  The patch resolving piix4's singleton variable got split into two patches:
One which resolves the singleton variable and one which replaces magic
constants. The split patches should be more comprehensible.
  Suggested by BALATON Zoltan, I took a chance to resolve gt64120_register(),
a method akin to the legacy init functions we're trying to get rid of.

v2:
  isa/piix4: Fix PCI IRQ levels to be preserved in VMState
  isa/piix4: Resolve redundant i8259[] attribute

Bernhard Beschow (7):
  hw/mips/gt64xxx_pci: Fix PCI IRQ levels to be preserved during
migration
  malta: Move PCI interrupt handling from gt64xxx_pci to piix4
  hw/isa/piix4: Resolve redundant i8259[] attribute
  hw/isa/piix4: Pass PIIX4State as opaque parameter for piix4_set_irq()
  hw/isa/piix4: Resolve global instance variable
  hw/isa/piix4: Replace some magic IRQ constants
  hw/mips/gt64xxx_pci: Resolve gt64120_register()

 hw/isa/piix4.c| 54 +--
 hw/mips/gt64xxx_pci.c | 80 +++
 hw/mips/malta.c   | 17 
 include/hw/mips/mips.h|  3 --
 include/hw/southbridge/piix.h |  2 -
 5 files changed, 65 insertions(+), 91 deletions(-)

-- 
2.35.1

[PATCH v3 1/7] hw/mips/gt64xxx_pci: Fix PCI IRQ levels to be preserved during migration

2022-02-16 Thread Bernhard Beschow

Based on commit e735b55a8c11dd455e31ccd4420e6c9485191d0c:

  piix_pci: eliminate PIIX3State::pci_irq_levels

  PIIX3State::pci_irq_levels are redundant which is already tracked by
  PCIBus layer. So eliminate them.

The IRQ levels in the PCIBus layer are already preserved during
migration. By reusing them rather than having a redundant implementation,
the bug is avoided in the first place.

Suggested-by: Peter Maydell 
Signed-off-by: Bernhard Beschow 
---
 hw/mips/gt64xxx_pci.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hw/mips/gt64xxx_pci.c b/hw/mips/gt64xxx_pci.c
index c7480bd019..4cbd0911f5 100644
--- a/hw/mips/gt64xxx_pci.c
+++ b/hw/mips/gt64xxx_pci.c
@@ -1006,14 +1006,11 @@ static int gt64120_pci_map_irq(PCIDevice *pci_dev, int 
irq_num)
 }
 }
 
-static int pci_irq_levels[4];
-
 static void gt64120_pci_set_irq(void *opaque, int irq_num, int level)
 {
 int i, pic_irq, pic_level;
 qemu_irq *pic = opaque;
-
-pci_irq_levels[irq_num] = level;
+PCIBus *bus = pci_get_bus(piix4_dev);
 
 /* now we change the pic irq level according to the piix irq mappings */
 /* XXX: optimize */
@@ -1023,7 +1020,7 @@ static void gt64120_pci_set_irq(void *opaque, int 
irq_num, int level)
 pic_level = 0;
 for (i = 0; i < 4; i++) {
 if (pic_irq == piix4_dev->config[PIIX_PIRQCA + i]) {
-pic_level |= pci_irq_levels[i];
+pic_level |= pci_bus_get_irq_level(bus, i);
 }
 }
 qemu_set_irq(pic[pic_irq], pic_level);
-- 
2.35.1

Re: Portable inline asm to get address of TLS variable

2022-02-16 Thread Paolo Bonzini


On 2/16/22 18:46, Stefan Hajnoczi wrote:

However, I wonder if the compiler might reuse a register that already
contains the address. Then we'd have the coroutine problem again when
qemu_coroutine_yield() is called between the earlier address calculation
and the asm volatile statement.


Yes, the compiler should be able to reuse the register.  volatile only 
says that the contents of the "asm" cannot be subject to e.g. loop 
optimizations:


for (i = 0; i < 10; i++) {
asm("# assembly": "=r"(k) : "0"(10));
j += k;
}

will likely execute the asm once, while "asm volatile" (or an asm 
without inputs, which is always volatile) will execute it ten times.


However, the input of the assembly can be evaluated only once either 
way.  For example, in the case above you might have "movl $10, %edx" 
outside the loop even with asm volatile.


One way to fix it for modules could be to define a (global, non-TLS) 
variable in QEMU with the %fs-based offset of the relevant thread-local 
variable, and initialize it before modules are loaded.


Paolo

Re: [PATCH] ppc/spapr: Advertise StoreEOI for POWER10 compat guests

2022-02-16 Thread Daniel Henrique Barboza





On 2/14/22 11:11, Cédric Le Goater wrote:

When an interrupt has been handled, the OS notifies the interrupt
controller with a EOI sequence. On a POWER9 and POWER10 systems using


nit:  s/a EOI sequence/an EOI sequence



the XIVE interrupt controller, this can be done with a load or a store
operation on the ESB interrupt management page of the interrupt. The
StoreEOI operation has less latency and improves interrupt handling
performance but it was deactivated during the POWER9 DD2.0 timeframe
because of ordering issues. POWER9 systems use the LoadEOI instead.
POWER10 compat guests should have fixed the issue with
Load-after-Store ordering and StoreEOI can be activated for them
again.

To maintain performance, this ordering is only enforced for the
XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
disable temporarily an interrupt source. If StoreEOI is active, a
source could be left enabled if the load and store operations come
out of order.

Add a check in our XIVE emulation model for Load-after-Store when
StoreEOI is active. It should catch unreliable sequences. Other load
operations should be fine without it.

Signed-off-by: Cédric Le Goater 
---
  include/hw/ppc/spapr_xive.h |  1 +
  include/hw/ppc/xive.h   |  8 
  hw/intc/spapr_xive.c| 15 +++
  hw/intc/spapr_xive_kvm.c| 15 +++
  hw/intc/xive.c  |  6 ++
  hw/ppc/spapr_hcall.c|  7 +++
  6 files changed, 52 insertions(+)

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index b282960ad90d..9c247d8bf57d 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -73,6 +73,7 @@ void spapr_xive_map_mmio(SpaprXive *xive);
  
  int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,

   uint32_t *out_server, uint8_t *out_prio);
+void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
  
  /*

   * KVM XIVE device helpers
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 126e4e2c3a17..133f308c2792 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -285,6 +285,14 @@ uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
  #define XIVE_ESB_SET_PQ_10  0xe00 /* Load */
  #define XIVE_ESB_SET_PQ_11  0xf00 /* Load */
  
+/*

+ * Load-after-store ordering
+ *
+ * Adding this offset to the load address will enforce
+ * load-after-store ordering. This is required to use with StoreEOI.
+ */
+#define XIVE_ESB_LD_ST_MO   0x40 /* Load-after-store ordering */
+
  uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
  uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
  
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c

index dc641cc604bf..0b8a246ad594 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -25,6 +25,7 @@
  #include "hw/ppc/xive_regs.h"
  #include "hw/qdev-properties.h"
  #include "trace.h"
+#include "cpu-models.h"
  
  /*

   * XIVE Virtualization Controller BAR and Thread Managment BAR that we
@@ -1854,3 +1855,17 @@ void spapr_xive_hcall_init(SpaprMachineState *spapr)
  spapr_register_hypercall(H_INT_SYNC, h_int_sync);
  spapr_register_hypercall(H_INT_RESET, h_int_reset);
  }
+
+/*
+ * Advertise StoreEOI for a P10 compat guest. OS is required to
+ * enforce load-after-store ordering.
+ */
+void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
+{
+if (enable) {
+xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
+} else {
+xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
+}
+
+}
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 61fe7bd2d322..bd023407bd7f 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -296,6 +296,21 @@ static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, 
uint32_t offset,
  
  static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)

  {
+/*
+ * The XIVE_ESB_SET_PQ_10 load operation is used to disable
+ * temporarily an interrupt source. If StoreEOI is active, a
+ * source could be left enabled if the load and store operations
+ * come out of order.
+ *
+ * As we don't know the characteristics of the host source
+ * interrupts (StoreEOI or not), enforce the load-after-store
+ * ordering always. The performance penalty will be very small for
+ * QEMU.
+ */
+if (offset == XIVE_ESB_SET_PQ_10) {
+offset |= XIVE_ESB_LD_ST_MO;
+}
+
  return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
  }
  
diff --git a/hw/intc/xive.c b/hw/intc/xive.c

index b8e4c7294d59..d62881873b1b 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1024,6 +1024,12 @@ static uint64_t xive_source_esb_read(void *opaque, 
hwaddr addr, unsigned size)
  case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
  case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
  case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
+if

Re: [PULL v2 07/35] target/riscv: access cfg structure through DisasContext

2022-02-16 Thread Alistair Francis

On Wed, Feb 16, 2022 at 8:24 PM Philipp Tomsich
 wrote:
>
> Alistair,
>
> This PULL seems not to include the fixup (which you had intended to
> squash into it) for the regression introduced (i.e. the condition
> being inverted):
>   
> https://patchwork.kernel.org/project/qemu-devel/patch/20220203153946.2676353-1-philipp.toms...@vrull.eu/

Well

It does not include it and I'm not really sure why it doesn't. The V1
PR didn't either.

I thought I had applied it, but I guess not. I have actually applied
it to riscv-to-apply.next now

Alistair


> Without that change this will introduce a regression in Zb[abcs]
> (i.e., it will be enabled when it shouldn't be, and will be disabled
> when it should be on).
>
> Please ignore, if I missed a later stand-alone patch (I just looked at
> the series in Patchworks).
>
> Thanks,
> Philipp.
>
>
> On Wed, 16 Feb 2022 at 07:29, Alistair Francis
>  wrote:
> >
> > From: Philipp Tomsich 
> >
> > The Zb[abcs] support code still uses the RISCV_CPU macros to access
> > the configuration information (i.e., check whether an extension is
> > available/enabled).  Now that we provide this information directly
> > from DisasContext, we can access this directly via the cfg_ptr field.
> >
> > Signed-off-by: Philipp Tomsich 
> > Reviewed-by: Alistair Francis 
> > Suggested-by: Richard Henderson 
> > Reviewed-by: Richard Henderson 
> > Message-Id: <20220202005249.3566542-5-philipp.toms...@vrull.eu>
> > Signed-off-by: Alistair Francis 
> > ---
> >  target/riscv/insn_trans/trans_rvb.c.inc | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/target/riscv/insn_trans/trans_rvb.c.inc 
> > b/target/riscv/insn_trans/trans_rvb.c.inc
> > index 810431a1d6..f9bd3b7ec4 100644
> > --- a/target/riscv/insn_trans/trans_rvb.c.inc
> > +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> > @@ -19,25 +19,25 @@
> >   */
> >
> >  #define REQUIRE_ZBA(ctx) do {\
> > -if (!RISCV_CPU(ctx->cs)->cfg.ext_zba) {  \
> > +if (ctx->cfg_ptr->ext_zba) { \
> >  return false;\
> >  }\
> >  } while (0)
> >
> >  #define REQUIRE_ZBB(ctx) do {\
> > -if (!RISCV_CPU(ctx->cs)->cfg.ext_zbb) {  \
> > +if (ctx->cfg_ptr->ext_zbb) { \
> >  return false;\
> >  }\
> >  } while (0)
> >
> >  #define REQUIRE_ZBC(ctx) do {\
> > -if (!RISCV_CPU(ctx->cs)->cfg.ext_zbc) {  \
> > +if (ctx->cfg_ptr->ext_zbc) { \
> >  return false;\
> >  }\
> >  } while (0)
> >
> >  #define REQUIRE_ZBS(ctx) do {\
> > -if (!RISCV_CPU(ctx->cs)->cfg.ext_zbs) {  \
> > +if (ctx->cfg_ptr->ext_zbs) { \
> >  return false;\
> >  }\
> >  } while (0)
> > --
> > 2.34.1
> >

Re: [PATCH] tcg: Add 'signed' bit to typecodes

2022-02-16 Thread Keith Packard via

Richard Henderson  writes:

> The signed information is still there, merged with the typecode:
>
> #define dh_typecode_void 0
> #define dh_typecode_noreturn 0
> #define dh_typecode_i32 2
> #define dh_typecode_s32 3
> #define dh_typecode_i64 4
> #define dh_typecode_s64 5
> #define dh_typecode_ptr 6
>
> Note that is_signed is bit 0.
>
> But I can see that dh_alias_s32 hides it -- definitely a bug there.

Awesome. I suspected that was the underlying cause -- missing some of
what dh_alias does to the values.

As I said in the commit message, I looked at just filling out the
dh_typecode_ set to avoid using dh_alias entirely, but that actually
turned out to be a more complicated patch as you need to handle 'tl'
types, which are machine-specific; something dh_alias handles.

Either way works; I took the path which involved creating less new code
(as dh_is_signed was already written) to try and make it a bit easier to
evaluate the result.

Thanks for taking a look at the patch; I had a fine evening chasing this
down starting with a bug report written in an embedded python dialect :-)

-- 
-keith

signature.asc
Description: PGP signature

Re: [PATCH] tests/tcg/s390x: Build tests with debian11

2022-02-16 Thread Philippe Mathieu-Daudé via


On 16/2/22 12:51, David Hildenbrand wrote:

We need a newer compiler to build upcoming tests that test for z15
features with -march=z15. So let's do it similar to arm64 and powerpc,
using an environment based on debian11 to build tests only.

Cc: Thomas Huth 
Cc: Cornelia Huck 
Cc: Richard Henderson 
Cc: "Alex Bennée" 
Cc: "Philippe Mathieu-Daudé" 
Cc: Wainer dos Santos Moschetta 
Cc: Beraldo Leal 
Cc: David Miller 
Signed-off-by: David Hildenbrand 
---
  .gitlab-ci.d/container-cross.yml|  7 +++
  tests/docker/Makefile.include   |  3 ++-
  .../dockerfiles/debian-s390x-test-cross.docker  | 13 +
  tests/tcg/configure.sh  |  2 +-
  4 files changed, 23 insertions(+), 2 deletions(-)
  create mode 100644 tests/docker/dockerfiles/debian-s390x-test-cross.docker


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 0/6] hw/nvme: enhanced protection information (64-bit guard)

2022-02-16 Thread Keith Busch

On Mon, Feb 14, 2022 at 01:30:23PM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This adds support for one possible new protection information format
> introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
> and 48-bit reference tag. This version does not support storage tags.
> 
> Like the CRC16 support already present, this uses a software
> implementation of CRC64 (so it is naturally pretty slow). But it's good
> enough for verification purposes.
> 
> This goes hand-in-hand with the support that Keith submitted for the
> Linux kernel[1].
> 
> [1]: 
> https://lore.kernel.org/linux-nvme/20220201190128.3075065-1-kbu...@kernel.org/

Other than comment on 6/6, series looks good to me.

Reviewed-by: Keith Busch

Re: [PATCH 6/6] hw/nvme: 64-bit pi support

2022-02-16 Thread Keith Busch

On Mon, Feb 14, 2022 at 01:30:29PM +0100, Klaus Jensen wrote:
> @@ -384,6 +389,12 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, 
> Error **errp)
>  return -1;
>  }
>  
> +if (ns->params.pif != NVME_PI_GUARD_16 &&
> +ns->params.pif != NVME_PI_GUARD_64) {
> +error_setg(errp, "invalid 'pif'");
> +return -1;
> +}

In addition, the requested metadata size ('params.ms') should be checked
against the requested PI option. The function currently just checks
against 8 bytes, but the 64b guard requires at least 16 bytes.

Otherwise, looks great.

Re: [PATCH v2] tests/qemu-iotests: Rework the checks and spots using GNU sed

2022-02-16 Thread Eric Blake

On Wed, Feb 16, 2022 at 01:54:54PM +0100, Thomas Huth wrote:
> Instead of failing the iotests if GNU sed is not available (or skipping
> them completely in the check-block.sh script), it would be better to
> simply skip the bash-based tests that rely on GNU sed, so that the other
> tests could still be run. Thus we now explicitly use "gsed" (either as
> direct program or as a wrapper around "sed" if it's the GNU version)
> in the spots that rely on the GNU sed behavior. Statements that use the
> "-r" parameter of sed have been switched to use "-E" instead, since this
> switch is supported by all sed versions on our supported build hosts
> (most also support "-r", but macOS' sed only supports "-E"). With all
> these changes in place, we then can also remove the sed checks from the
> check-block.sh script, so that "make check-block" can now be run on
> systems without GNU sed, too.
> 
> Signed-off-by: Thomas Huth 
> ---
>  I've checked that this still works fine with "make vm-build-freebsd",
>  "make vm-build-netbsd" and "make vm-build-openbsd" and the Cirrus-CI
>  macOS tasks.
> 
>  tests/check-block.sh | 12 --
>  tests/qemu-iotests/271   |  2 +-
>  tests/qemu-iotests/common.filter | 65 
>  tests/qemu-iotests/common.rc | 45 +++---
>  4 files changed, 57 insertions(+), 67 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [PATCH] tcg: Add 'signed' bit to typecodes

2022-02-16 Thread Richard Henderson


On 2/16/22 17:39, Keith Packard wrote:

Commit 7319d83a (tcg: Combine dh_is_64bit and dh_is_signed to
dh_typecode) converted the tcg type system to a 3-bit field from two
separate 1-bit fields. This subtly lost the 'signed' information from
the types as it uses the dh_alias macro to reduce the types down to
basic machine types. However, the dh_alias macro also discards sign
information, aliasing 's32' to 'i32'.


The signed information is still there, merged with the typecode:

#define dh_typecode_void 0
#define dh_typecode_noreturn 0
#define dh_typecode_i32 2
#define dh_typecode_s32 3
#define dh_typecode_i64 4
#define dh_typecode_s64 5
#define dh_typecode_ptr 6

Note that is_signed is bit 0.

But I can see that dh_alias_s32 hides it -- definitely a bug there.



r~

Re: [PULL 00/30] Misc mostly build system patches for 2022-02-15

2022-02-16 Thread Paolo Bonzini


On 2/16/22 15:41, Peter Maydell wrote:

On Wed, 16 Feb 2022 at 14:03, Paolo Bonzini  wrote:


On 2/16/22 10:56, Peter Maydell wrote:

Hi; this fails to build on OpenBSD (on the tests/vm/ setup).

Meson thinks it's found OpenGL:
  OpenGL support (epoxy)   : YES 1.5.4

but either it's wrong or else it's not putting the right
include directory onto the path, because the compiler
fails to find the headers:

In file included from ../src/hw/arm/virt.c:42:
In file included from
/home/qemu/qemu-test.sr5128/src/include/hw/vfio/vfio-calxeda-xgmac.h:17:
In file included from
/home/qemu/qemu-test.sr5128/src/include/hw/vfio/vfio-platform.h:20:
In file included from
/home/qemu/qemu-test.sr5128/src/include/hw/vfio/vfio-common.h:27:
/home/qemu/qemu-test.sr5128/src/include/ui/console.h:11:11: fatal
error: 'epoxy/gl.h' file not found
# include <epoxy/gl.h>
^~~~
1 error generated.


Yeah, there's a lot of uses of ui/console.h and they all need
epoxy/gl.h.  That's in need of some cleanup.


Why can't meson just do the same thing configure was doing,
ie add the include path to the cflags and the library path
to the linker flags?


Yes, it can do that as well.  I found it now:

@@ -43,7 +43,6 @@ vnc_ss.add(zlib, png, jpeg, gnutls)
 vnc_ss.add(when: sasl, if_true: files('vnc-auth-sasl.c'))
 softmmu_ss.add_all(when: vnc, if_true: vnc_ss)
 softmmu_ss.add(when: vnc, if_false: files('vnc-stubs.c'))
-specific_ss.add(when: ['CONFIG_SOFTMMU'], if_true: opengl)

 ui_modules = {}


Paolo

Re: [Virtio-fs] [PULL 00/12] virtiofs queue

2022-02-16 Thread Vivek Goyal

On Wed, Feb 16, 2022 at 07:40:14PM +, Dr. David Alan Gilbert wrote:
> * Dr. David Alan Gilbert (git) (dgilb...@redhat.com) wrote:
> > From: "Dr. David Alan Gilbert" 
> > 
> > The following changes since commit c13b8e9973635f34f3ce4356af27a311c993729c:
> > 
> >   Merge remote-tracking branch 
> > 'remotes/alistair/tags/pull-riscv-to-apply-20220216' into staging 
> > (2022-02-16 09:57:11 +)
> > 
> > are available in the Git repository at:
> > 
> >   https://gitlab.com/dagrh/qemu.git tags/pull-virtiofs-20220216
> > 
> > for you to fetch changes up to 47cc3ef597b2ee926c13c9433f4f73645429e128:
> > 
> >   virtiofsd: Add basic support for FUSE_SYNCFS request (2022-02-16 17:29:32 
> > +)
> 
> NAK
> this doesn't build on older Linuxes.
> 
> Rework version in the works.

Hi David,

I think it is patch 8 which is using gettid(). I have updated that
patch and now I am using syscall(SYS_gettid) instead. Here is the
updated patch. I hope this solves the build issue on older Linux.


Subject: virtiofsd: Add helpers to work with /proc/self/task/tid/attr/fscreate

Soon we will be able to create and also set security context on the file
atomically using /proc/self/task/tid/attr/fscreate knob. If this knob
is available on the system, first set the knob with the desired context
and then create the file. It will be created with the context set in
fscreate. This basically works for SELinux, and it's per-thread.

This patch just introduces the helper functions. Subsequent patches will
make use of these helpers.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Vivek Goyal 
---
 tools/virtiofsd/passthrough_ll.c |   92 +++
 1 file changed, 92 insertions(+)

Index: rhvgoyal-qemu/tools/virtiofsd/passthrough_ll.c
===
--- rhvgoyal-qemu.orig/tools/virtiofsd/passthrough_ll.c 2022-02-16 
15:53:13.657015138 -0500
+++ rhvgoyal-qemu/tools/virtiofsd/passthrough_ll.c  2022-02-16 
15:55:14.911234993 -0500
@@ -173,10 +173,14 @@ struct lo_data {
 
 /* An O_PATH file descriptor to /proc/self/fd/ */
 int proc_self_fd;
+/* An O_PATH file descriptor to /proc/self/task/ */
+int proc_self_task;
 int user_killpriv_v2, killpriv_v2;
 /* If set, virtiofsd is responsible for setting umask during creation */
 bool change_umask;
 int user_posix_acl, posix_acl;
+/* Keeps track if /proc//attr/fscreate should be used or not */
+bool use_fscreate;
 };
 
 static const struct fuse_opt lo_opts[] = {
@@ -257,6 +261,72 @@ static struct lo_data *lo_data(fuse_req_
 }
 
 /*
+ * Tries to figure out if /proc//attr/fscreate is usable or not. With
+ * selinux=0, read from fscreate returns -EINVAL.
+ *
+ * TODO: Link with libselinux and use is_selinux_enabled() instead down
+ * the line. It probably will be more reliable indicator.
+ */
+static bool is_fscreate_usable(struct lo_data *lo)
+{
+char procname[64];
+int fscreate_fd;
+size_t bytes_read;
+
+sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
+fscreate_fd = openat(lo->proc_self_task, procname, O_RDWR);
+if (fscreate_fd == -1) {
+return false;
+}
+
+bytes_read = read(fscreate_fd, procname, 64);
+close(fscreate_fd);
+if (bytes_read == -1) {
+return false;
+}
+return true;
+}
+
+/* Helpers to set/reset fscreate */
+__attribute__((unused))
+static int open_set_proc_fscreate(struct lo_data *lo, const void *ctx,
+  size_t ctxlen,int *fd)
+{
+char procname[64];
+int fscreate_fd, err = 0;
+size_t written;
+
+sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
+fscreate_fd = openat(lo->proc_self_task, procname, O_WRONLY);
+err = fscreate_fd == -1 ? errno : 0;
+if (err) {
+return err;
+}
+
+written = write(fscreate_fd, ctx, ctxlen);
+err = written == -1 ? errno : 0;
+if (err) {
+goto out;
+}
+
+*fd = fscreate_fd;
+return 0;
+out:
+close(fscreate_fd);
+return err;
+}
+
+__attribute__((unused))
+static void close_reset_proc_fscreate(int fd)
+{
+if ((write(fd, NULL, 0)) == -1) {
+fuse_log(FUSE_LOG_WARNING, "Failed to reset fscreate. err=%d\n", 
errno);
+}
+close(fd);
+return;
+}
+
+/*
  * Load capng's state from our saved state if the current thread
  * hadn't previously been loaded.
  * returns 0 on success
@@ -3522,6 +3592,15 @@ static void setup_namespaces(struct lo_d
 exit(1);
 }
 
+/* Get the /proc/self/task descriptor */
+lo->proc_self_task = open("/proc/self/task/", O_PATH);
+if (lo->proc_self_task == -1) {
+fuse_log(FUSE_LOG_ERR, "open(/proc/self/task, O_PATH): %m\n");
+exit(1);
+}
+
+lo->use_fscreate = is_fscreate_usab

Re: Portable inline asm to get address of TLS variable

2022-02-16 Thread Florian Weimer

* Stefan Hajnoczi:

> I'm basically asking whether the &tls_var input operand is treated as
> volatile and part of the inline assembly or whether it's just regular
> C code that the compiler may optimize with the surrounding function?

&tls_var is evaluated outside of the inline assembly; any compiler
barrier will come after that.  It's subject to CSE (or whatever it's
called).  Three asm statements in a row

  asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var));
  asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var));
  asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var));

result in

movqtls_var@gottpoff(%rip), %rax
addq%fs:0, %rax
movq%rax, %rdx
movq%rax, %rdx

which is probably not what you want.

Thanks,
Florian

[PATCH v5 2/3] s390x/cpumodel: Bump up QEMU model to a stripped-down IBM z15 GA1

2022-02-16 Thread David Miller

TCG implements everything we need to run basic z15 OS+software

Signed-off-by: David Miller 
---
 hw/s390x/s390-virtio-ccw.c  | 3 +++
 target/s390x/cpu_models.c   | 6 +++---
 target/s390x/gen-features.c | 7 +--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 84e3e63c43..90480e7cf9 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -802,7 +802,10 @@ DEFINE_CCW_MACHINE(7_0, "7.0", true);
 
 static void ccw_machine_6_2_instance_options(MachineState *machine)
 {
+static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V6_2 };
+
 ccw_machine_7_0_instance_options(machine);
+s390_set_qemu_cpu_model(0x3906, 14, 2, qemu_cpu_feat);
 }
 
 static void ccw_machine_6_2_class_options(MachineClass *mc)
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 11e06cc51f..89f83e81d5 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -85,9 +85,9 @@ static S390CPUDef s390_cpu_defs[] = {
 CPUDEF_INIT(0x3932, 16, 1, 47, 0x0800U, "gen16b", "IBM 3932 GA1"),
 };
 
-#define QEMU_MAX_CPU_TYPE 0x3906
-#define QEMU_MAX_CPU_GEN 14
-#define QEMU_MAX_CPU_EC_GA 2
+#define QEMU_MAX_CPU_TYPE 0x8561
+#define QEMU_MAX_CPU_GEN 15
+#define QEMU_MAX_CPU_EC_GA 1
 static const S390FeatInit qemu_max_cpu_feat_init = { S390_FEAT_LIST_QEMU_MAX };
 static S390FeatBitmap qemu_max_cpu_feat;
 
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index a3f30f69d9..22846121c4 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -731,16 +731,18 @@ static uint16_t qemu_V6_0[] = {
 S390_FEAT_ESOP,
 };
 
-static uint16_t qemu_LATEST[] = {
+static uint16_t qemu_V6_2[] = {
 S390_FEAT_INSTRUCTION_EXEC_PROT,
 S390_FEAT_MISC_INSTRUCTION_EXT2,
 S390_FEAT_MSA_EXT_8,
 S390_FEAT_VECTOR_ENH,
 };
 
+static uint16_t qemu_LATEST[] = {
+S390_FEAT_MISC_INSTRUCTION_EXT3,
+};
 /* add all new definitions before this point */
 static uint16_t qemu_MAX[] = {
-S390_FEAT_MISC_INSTRUCTION_EXT3,
 /* generates a dependency warning, leave it out for now */
 S390_FEAT_MSA_EXT_5,
 };
@@ -863,6 +865,7 @@ static FeatGroupDefSpec QemuFeatDef[] = {
 QEMU_FEAT_INITIALIZER(V4_0),
 QEMU_FEAT_INITIALIZER(V4_1),
 QEMU_FEAT_INITIALIZER(V6_0),
+QEMU_FEAT_INITIALIZER(V6_2),
 QEMU_FEAT_INITIALIZER(LATEST),
 QEMU_FEAT_INITIALIZER(MAX),
 };
-- 
2.32.0

[PATCH v5 1/3] s390x/tcg: Implement Miscellaneous-Instruction-Extensions Facility 3 for the s390x

2022-02-16 Thread David Miller

resolves: https://gitlab.com/qemu-project/qemu/-/issues/737
implements:
AND WITH COMPLEMENT   (NCRK, NCGRK)
NAND  (NNRK, NNGRK)
NOT EXCLUSIVE OR  (NXRK, NXGRK)
NOR   (NORK, NOGRK)
OR WITH COMPLEMENT(OCRK, OCGRK)
SELECT(SELR, SELGR)
SELECT HIGH   (SELFHR)
MOVE RIGHT TO LEFT(MVCRL)
POPULATION COUNT  (POPCNT)

Signed-off-by: David Miller 
---
 target/s390x/gen-features.c|  1 +
 target/s390x/helper.h  |  1 +
 target/s390x/tcg/insn-data.def | 30 +--
 target/s390x/tcg/mem_helper.c  | 20 +
 target/s390x/tcg/translate.c   | 53 --
 5 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 7cb1a6ec10..a3f30f69d9 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -740,6 +740,7 @@ static uint16_t qemu_LATEST[] = {
 
 /* add all new definitions before this point */
 static uint16_t qemu_MAX[] = {
+S390_FEAT_MISC_INSTRUCTION_EXT3,
 /* generates a dependency warning, leave it out for now */
 S390_FEAT_MSA_EXT_5,
 };
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 271b081e8c..69f69cf718 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -4,6 +4,7 @@ DEF_HELPER_FLAGS_4(nc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(oc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(xc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(mvc, TCG_CALL_NO_WG, void, env, i32, i64, i64)
+DEF_HELPER_FLAGS_4(mvcrl, TCG_CALL_NO_WG, void, env, i64, i64, i64)
 DEF_HELPER_FLAGS_4(mvcin, TCG_CALL_NO_WG, void, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(clc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_3(mvcl, i32, env, i32, i32)
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 1c3e115712..efb1d5bc19 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -105,6 +105,9 @@
 D(0xa507, NILL,RI_a,  Z,   r1_o, i2_16u, r1, 0, andi, 0, 0x1000)
 D(0x9400, NI,  SI,Z,   la1, i2_8u, new, 0, ni, nz64, MO_UB)
 D(0xeb54, NIY, SIY,   LD,  la1, i2_8u, new, 0, ni, nz64, MO_UB)
+/* AND WITH COMPLEMENT */
+C(0xb9f5, NCRK,RRF_a, MIE3, r2, r3, new, r1_32, andc, nz32)
+C(0xb9e5, NCGRK,   RRF_a, MIE3, r2, r3, r1, 0, andc, nz64)
 
 /* BRANCH AND LINK */
 C(0x0500, BALR,RR_a,  Z,   0, r2_nz, r1, 0, bal, 0)
@@ -640,6 +643,8 @@
 C(0xeb8e, MVCLU,   RSY_a, E2,  0, a2, 0, 0, mvclu, 0)
 /* MOVE NUMERICS */
 C(0xd100, MVN, SS_a,  Z,   la1, a2, 0, 0, mvn, 0)
+/* MOVE RIGHT TO LEFT */
+C(0xe50a, MVCRL,   SSE,  MIE3, la1, a2, 0, 0, mvcrl, 0)
 /* MOVE PAGE */
 C(0xb254, MVPG,RRE,   Z,   0, 0, 0, 0, mvpg, 0)
 /* MOVE STRING */
@@ -707,6 +712,16 @@
 F(0xed0f, MSEB,RXF,   Z,   e1, m2_32u, new, e1, mseb, 0, IF_BFP)
 F(0xed1f, MSDB,RXF,   Z,   f1, m2_64, new, f1, msdb, 0, IF_BFP)
 
+/* NAND */
+C(0xb974, NNRK,RRF_a, MIE3, r2, r3, new, r1_32, nand, nz32)
+C(0xb964, NNGRK,   RRF_a, MIE3, r2, r3, r1, 0, nand, nz64)
+/* NOR */
+C(0xb976, NORK,RRF_a, MIE3, r2, r3, new, r1_32, nor, nz32)
+C(0xb966, NOGRK,   RRF_a, MIE3, r2, r3, r1, 0, nor, nz64)
+/* NOT EXCLUSIVE OR */
+C(0xb977, NXRK,RRF_a, MIE3, r2, r3, new, r1_32, nxor, nz32)
+C(0xb967, NXGRK,   RRF_a, MIE3, r2, r3, r1, 0, nxor, nz64)
+
 /* OR */
 C(0x1600, OR,  RR_a,  Z,   r1, r2, new, r1_32, or, nz32)
 C(0xb9f6, ORK, RRF_a, DO,  r2, r3, new, r1_32, or, nz32)
@@ -725,6 +740,9 @@
 D(0xa50b, OILL,RI_a,  Z,   r1_o, i2_16u, r1, 0, ori, 0, 0x1000)
 D(0x9600, OI,  SI,Z,   la1, i2_8u, new, 0, oi, nz64, MO_UB)
 D(0xeb56, OIY, SIY,   LD,  la1, i2_8u, new, 0, oi, nz64, MO_UB)
+/* OR WITH COMPLEMENT */
+C(0xb975, OCRK,RRF_a, MIE3, r2, r3, new, r1_32, orc, nz32)
+C(0xb965, OCGRK,   RRF_a, MIE3, r2, r3, r1, 0, orc, nz64)
 
 /* PACK */
 /* Really format SS_b, but we pack both lengths into one argument
@@ -735,6 +753,9 @@
 /* PACK UNICODE */
 C(0xe100, PKU, SS_f,  E2,  la1, a2, 0, 0, pku, 0)
 
+/* POPULATION COUNT */
+C(0xb9e1, POPCNT,  RRF_c, PC,  0, r2_o, r1, 0, popcnt, nz64)
+
 /* PREFETCH */
 /* Implemented as nops of course.  */
 C(0xe336, PFD, RXY_b, GIE, 0, 0, 0, 0, 0, 0)
@@ -743,9 +764,6 @@
 /* Implemented as nop of course.  */
 C(0xb2e8, PPA, RRF_c, PPA, 0, 0, 0, 0, 0, 0)
 
-/* POPULATION COUNT */
-C(0xb9e1, POPCNT,  RRE,   PC,  0, r2_o, r1, 0, popcnt, nz64)
-
 /* ROTATE LEFT SINGLE LOGICAL */
 C(0xeb1d, RLL, RSY_a, Z,   r3_o, sh, new, r1_32, rll32, 0)
 C(0xeb1c, RLLG,RSY_a, Z,   r3_o, sh, r1, 0, rll64, 0)
@@ -765,6 +783,12 @@
 /* SEARCH STRING UNICODE */
 C(0xb9be, SRSTU,   RRE,   ETF3, 0, 0, 0, 0, srstu, 0)
 
+/* SELECT */
+C(0xb9f0, SELR,RRF_a, MIE3, r2, r3, new, r1_32, loc, 0)
+C(0xb9e3, SELGR,   RRF_a,

[PATCH v5 3/3] tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions Facility 3

2022-02-16 Thread David Miller

tests/tcg/s390x/mie3-compl.c: [N]*K instructions
tests/tcg/s390x/mie3-mvcrl.c: MVCRL instruction
tests/tcg/s390x/mie3-sel.c:  SELECT instruction

Signed-off-by: David Miller 
---
 tests/tcg/s390x/Makefile.target |  5 ++-
 tests/tcg/s390x/mie3-compl.c| 55 +
 tests/tcg/s390x/mie3-mvcrl.c| 31 +++
 tests/tcg/s390x/mie3-sel.c  | 42 +
 4 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/s390x/mie3-compl.c
 create mode 100644 tests/tcg/s390x/mie3-mvcrl.c
 create mode 100644 tests/tcg/s390x/mie3-sel.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 1a7238b4eb..54e67446aa 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -1,12 +1,15 @@
 S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
 VPATH+=$(S390X_SRC)
-CFLAGS+=-march=zEC12 -m64
+CFLAGS+=-march=z15 -m64
 TESTS+=hello-s390x
 TESTS+=csst
 TESTS+=ipm
 TESTS+=exrl-trt
 TESTS+=exrl-trtr
 TESTS+=pack
+TESTS+=mie3-compl
+TESTS+=mie3-mvcrl
+TESTS+=mie3-sel
 TESTS+=mvo
 TESTS+=mvc
 TESTS+=shift
diff --git a/tests/tcg/s390x/mie3-compl.c b/tests/tcg/s390x/mie3-compl.c
new file mode 100644
index 00..98281ee683
--- /dev/null
+++ b/tests/tcg/s390x/mie3-compl.c
@@ -0,0 +1,55 @@
+#include 
+
+
+#define F_EPI "stg %%r0, %[res] " : [res] "+m" (res) : : "r0", "r2", "r3"
+
+#define F_PROasm ( \
+"llihf %%r0,801\n" \
+"lg %%r2, %[a]\n"  \
+"lg %%r3, %[b] "   \
+: : [a] "m" (a),   \
+[b] "m" (b)\
+: "r2", "r3")
+
+#define FbinOp(S, ASM) uint64_t S(uint64_t a, uint64_t b) \
+{ uint64_t res = 0; F_PRO; ASM; return res; }
+
+/* AND WITH COMPLEMENT */
+FbinOp(_ncrk,  asm("ncrk  %%r0, %%r3, %%r2\n" F_EPI))
+FbinOp(_ncgrk, asm("ncgrk %%r0, %%r3, %%r2\n" F_EPI))
+
+/* NAND */
+FbinOp(_nnrk,  asm("nnrk  %%r0, %%r3, %%r2\n" F_EPI))
+FbinOp(_nngrk, asm("nngrk %%r0, %%r3, %%r2\n" F_EPI))
+
+/* NOT XOR */
+FbinOp(_nxrk,  asm("nxrk  %%r0, %%r3, %%r2\n" F_EPI))
+FbinOp(_nxgrk, asm("nxgrk %%r0, %%r3, %%r2\n" F_EPI))
+
+/* NOR */
+FbinOp(_nork,  asm("nork  %%r0, %%r3, %%r2\n" F_EPI))
+FbinOp(_nogrk, asm("nogrk %%r0, %%r3, %%r2\n" F_EPI))
+
+/* OR WITH COMPLEMENT */
+FbinOp(_ocrk,  asm("ocrk  %%r0, %%r3, %%r2\n" F_EPI))
+FbinOp(_ocgrk, asm("ocgrk %%r0, %%r3, %%r2\n" F_EPI))
+
+
+int main(int argc, char *argv[])
+{
+if (_ncrk(0xFF88, 0xAA11)  != 0x03210011ull ||
+_nnrk(0xFF88, 0xAA11)  != 0x032155FFull ||
+_nork(0xFF88, 0xAA11)  != 0x03210066ull ||
+_nxrk(0xFF88, 0xAA11)  != 0x0321AA66ull ||
+_ocrk(0xFF88, 0xAA11)  != 0x0321AA77ull ||
+_ncgrk(0xFF88, 0xAA11) != 0x0011ull ||
+_nngrk(0xFF88, 0xAA11) != 0x55FFull ||
+_nogrk(0xFF88, 0xAA11) != 0x0066ull ||
+_nxgrk(0xFF88, 0xAA11) != 0xAA66ull ||
+_ocgrk(0xFF88, 0xAA11) != 0xAA77ull)
+{
+return 1;
+}
+
+return 0;
+}
diff --git a/tests/tcg/s390x/mie3-mvcrl.c b/tests/tcg/s390x/mie3-mvcrl.c
new file mode 100644
index 00..81cf3ad702
--- /dev/null
+++ b/tests/tcg/s390x/mie3-mvcrl.c
@@ -0,0 +1,31 @@
+#include 
+#include 
+
+
+static inline void mvcrl_8(const char *dst, const char *src)
+{
+asm volatile (
+"llill %%r0, 8\n"
+"mvcrl 0(%[dst]), 0(%[src])\n"
+: : [dst] "d" (dst), [src] "d" (src)
+: "memory");
+}
+
+
+int main(int argc, char *argv[])
+{
+const char *alpha = "abcdefghijklmnop";
+
+/* array missing 'i' */
+char tstr[17] = "abcdefghjklmnop\0" ;
+
+/* mvcrl reference use: 'open a hole in an array' */
+mvcrl_8(tstr + 9, tstr + 8);
+
+/* place missing 'i' */
+tstr[8] = 'i';
+
+return strncmp(alpha, tstr, 16ul);
+}
+
+
diff --git a/tests/tcg/s390x/mie3-sel.c b/tests/tcg/s390x/mie3-sel.c
new file mode 100644
index 00..d6b7b0933b
--- /dev/null
+++ b/tests/tcg/s390x/mie3-sel.c
@@ -0,0 +1,42 @@
+#include 
+
+
+#define F_EPI "stg %%r0, %[res] " : [res] "+m" (res) : : "r0", "r2", "r3"
+
+#define F_PROasm ( \
+"lg %%r2, %[a]\n"  \
+"lg %%r3, %[b]\n"  \
+"lg %%r0, %[c]\n"  \
+"ltgr %%r0, %%r0"  \
+: : [a] "m" (a),   \
+[b] "m" (b),   \
+[c] "m" (c)\
+: "r0", "r2", "r3", "r4")
+
+
+
+#define Fi3(S, ASM) uint64_t S(uint64_t a, uint64_t b, uint64_t c) \
+{ uint64_t res = 0; F_PRO ; ASM ; return res; }
+
+
+Fi3 (_selre, asm("selre%%r0, %%r3, %%r2\n" F_EPI))
+Fi3 (_selgrz,asm("selgrz   %%r0, %%r3, %%r2\n" F_EPI))
+Fi3 (_selfhrnz,  asm("selfhrnz %%r0, %%r3, %%r2\n" F_EPI))
+
+
+int main(int argc, char *argv[])
+{
+uint64_t a = ~0, b = ~0, c = ~0;
+a =_selre(0x06660066ull, 0x06660006ull, a);
+b =   _selgrz(0xF00D0005ull, 0xF00D0055ull, b);
+c = _selfhrnz(0x00440044ull, 0x00040004ull, c);
+
+if ((0x0066ull != a) ||
+(0xF00D0005ull != b) ||
+

Re: Portable inline asm to get address of TLS variable

2022-02-16 Thread Florian Weimer

* Stefan Hajnoczi:

> On Wed, 16 Feb 2022 at 18:14, Florian Weimer  wrote:
>>
>> * Stefan Hajnoczi:
>>
>> > I've been trying to make the inline asm that gets the address of a TLS
>> > variable for QEMU coroutines pass QEMU's GitLab CI.
>> > https://gitlab.com/stefanha/qemu/-/blob/coroutine-tls-fix/include/qemu/coroutine-tls.h#L89
>> >
>> > The code isn't -fPIC-friendly (R_X86_64_TPOFF32 relocations aren't
>> > allowed in -fPIC shared libraries) so builds fail with ./configure
>> > --enable-modules. While I was tackling this I stumbled on this:
>> >
>> >   void *dst_ptr;
>> >   asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var))
>> >
>> > What's nice about it:
>> > - It's portable, there are no arch-specific assembly instructions.
>> > - It works for both -fPIC and non-PIC.
>> >
>> > However, I wonder if the compiler might reuse a register that already
>> > contains the address. Then we'd have the coroutine problem again when
>> > qemu_coroutine_yield() is called between the earlier address calculation
>> > and the asm volatile statement.
>> >
>> > Thoughts?
>>
>> Sorry, I don't see why this isn't equivalent to a plain &tls_var.
>> What exactly are you trying to achieve?
>
> &tls_var, except forcing the compiler to calculate the address from scratch.

I think you can compute

  (void *) &tls_var - __builtin_thread_pointer ();

to get the offset.  On many targets, GCC folds away the thread pointer
load, but that doesn't change the outcome.  Then it boils down to
getting access to the thread pointer, and you can get that behind a
compiler barrier (in a separate function).

But going against ABI and toolchain in this way is really no long-term
solution.  You need to switch to stackless co-routines, or we need to
provide proper ABI-level support for this.  Today it's the thread
pointer, tomorrow it's the shadow stack pointer, and the day after that,
it's the SafeStack pointer.  And further down the road, it's some thread
state for garbage collection support.  Or something like that.

Thanks,
Florian

[PATCH v5 0/3] s390x: Add partial z15 support and tests

2022-02-16 Thread David Miller

Add partial support for s390x z15 ga1 and specific tests for mie3 

v4 -> v5:
* Readd missing tests/tcg/s390x/mie3-*.c to patch

v3 -> v4:
* Change popcnt encoding RRE -> RRF_c
* Remove redundant code op_sel -> op_loc
* Cleanup for checkpatch.pl
* Readded mie3-* to Makefile.target

v2 -> v3:
* Moved tests to separate patch.
* Combined patches into series.


David Miller (3):
  s390x/tcg: Implement Miscellaneous-Instruction-Extensions Facility 3
for the s390x
  s390x/cpumodel: Bump up QEMU model to a stripped-down IBM z15 GA1
  tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions
Facility 3

 hw/s390x/s390-virtio-ccw.c  |  3 ++
 target/s390x/cpu_models.c   |  6 ++--
 target/s390x/gen-features.c |  6 +++-
 target/s390x/helper.h   |  1 +
 target/s390x/tcg/insn-data.def  | 30 --
 target/s390x/tcg/mem_helper.c   | 20 
 target/s390x/tcg/translate.c| 53 +--
 tests/tcg/s390x/Makefile.target |  5 ++-
 tests/tcg/s390x/mie3-compl.c| 55 +
 tests/tcg/s390x/mie3-mvcrl.c| 31 +++
 tests/tcg/s390x/mie3-sel.c  | 42 +
 11 files changed, 242 insertions(+), 10 deletions(-)
 create mode 100644 tests/tcg/s390x/mie3-compl.c
 create mode 100644 tests/tcg/s390x/mie3-mvcrl.c
 create mode 100644 tests/tcg/s390x/mie3-sel.c

-- 
2.32.0

Re: [PATCH v4 3/3] tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions Facility 3

2022-02-16 Thread David Hildenbrand

On 16.02.22 21:18, David Miller wrote:
> That is strange: if I unstage them and show the status, they are set to be committed:
> 
> null@rygar:~/projects/qemu/build$ git reset --soft HEAD~1
> null@rygar:~/projects/qemu/build$ git status
> On branch t2
> Changes to be committed:
>   (use "git restore --staged ..." to unstage)
> modified:   ../tests/tcg/s390x/Makefile.target
> new file:   ../tests/tcg/s390x/mie3-compl.c
> new file:   ../tests/tcg/s390x/mie3-mvcrl.c
> new file:   ../tests/tcg/s390x/mie3-sel.c

Maybe you did a "git add" but not a "git commit --amend" to squash them
into the previous commit?

-- 
Thanks,

David / dhildenb

Re: Portable inline asm to get address of TLS variable

2022-02-16 Thread Stefan Hajnoczi

On Wed, 16 Feb 2022 at 20:28, Stefan Hajnoczi  wrote:
>
> On Wed, 16 Feb 2022 at 18:14, Florian Weimer  wrote:
> >
> > * Stefan Hajnoczi:
> >
> > > I've been trying to make the inline asm that gets the address of a TLS
> > > variable for QEMU coroutines pass QEMU's GitLab CI.
> > > https://gitlab.com/stefanha/qemu/-/blob/coroutine-tls-fix/include/qemu/coroutine-tls.h#L89
> > >
> > > The code isn't -fPIC-friendly (R_X86_64_TPOFF32 relocations aren't
> > > allowed in -fPIC shared libraries) so builds fail with ./configure
> > > --enable-modules. While I was tackling this I stumbled on this:
> > >
> > >   void *dst_ptr;
> > >   asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var))
> > >
> > > What's nice about it:
> > > - It's portable, there are no arch-specific assembly instructions.
> > > - It works for both -fPIC and non-PIC.
> > >
> > > However, I wonder if the compiler might reuse a register that already
> > > contains the address. Then we'd have the coroutine problem again when
> > > qemu_coroutine_yield() is called between the earlier address calculation
> > > and the asm volatile statement.
> > >
> > > Thoughts?
> >
> > Sorry, I don't see why this isn't equivalent to a plain &tls_var.
> > What exactly are you trying to achieve?
>
> &tls_var, except forcing the compiler to calculate the address from scratch.
>
> The goal is to avoid stale TLS variable addresses when a coroutine
> yields in one thread and is resumed in another thread.

I'm basically asking whether the &tls_var input operand is treated as
volatile and part of the inline assembly or whether it's just regular
C code that the compiler may optimize with the surrounding function?

Stefan

[PATCH v4 0/3] s390x: Add partial z15 support and tests

2022-02-16 Thread David Miller

Add partial support for s390x z15 ga1 and specific tests for mie3 

v3 -> v4:
* Change popcnt encoding RRE -> RRF_c
* Remove redundant code op_sel -> op_loc
* Cleanup for checkpatch.pl
* Readded mie3-* to Makefile.target

v2 -> v3:
* Moved tests to separate patch.
* Combined patches into series.


David Miller (3):
  s390x/tcg: Implement Miscellaneous-Instruction-Extensions Facility 3
for the s390x
  s390x/cpumodel: Bump up QEMU model to a stripped-down IBM z15 GA1
  tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions
Facility 3

 hw/s390x/s390-virtio-ccw.c  |  3 ++
 target/s390x/cpu_models.c   |  6 ++--
 target/s390x/gen-features.c |  6 +++-
 target/s390x/helper.h   |  1 +
 target/s390x/tcg/insn-data.def  | 30 +--
 target/s390x/tcg/mem_helper.c   | 20 +
 target/s390x/tcg/translate.c| 53 +++--
 tests/tcg/s390x/Makefile.target |  5 +++-
 8 files changed, 114 insertions(+), 10 deletions(-)

-- 
2.32.0

Re: Portable inline asm to get address of TLS variable

2022-02-16 Thread Stefan Hajnoczi

On Wed, 16 Feb 2022 at 18:14, Florian Weimer  wrote:
>
> * Stefan Hajnoczi:
>
> > I've been trying to make the inline asm that gets the address of a TLS
> > variable for QEMU coroutines pass QEMU's GitLab CI.
> > https://gitlab.com/stefanha/qemu/-/blob/coroutine-tls-fix/include/qemu/coroutine-tls.h#L89
> >
> > The code isn't -fPIC-friendly (R_X86_64_TPOFF32 relocations aren't
> > allowed in -fPIC shared libraries) so builds fail with ./configure
> > --enable-modules. While I was tackling this I stumbled on this:
> >
> >   void *dst_ptr;
> >   asm volatile("" : "=r"(dst_ptr) : "0"(&tls_var))
> >
> > What's nice about it:
> > - It's portable, there are no arch-specific assembly instructions.
> > - It works for both -fPIC and non-PIC.
> >
> > However, I wonder if the compiler might reuse a register that already
> > contains the address. Then we'd have the coroutine problem again when
> > qemu_coroutine_yield() is called between the earlier address calculation
> > and the asm volatile statement.
> >
> > Thoughts?
>
> Sorry, I don't see why this isn't equivalent to a plain &tls_var.
> What exactly are you trying to achieve?

&tls_var, except forcing the compiler to calculate the address from scratch.

The goal is to avoid stale TLS variable addresses when a coroutine
yields in one thread and is resumed in another thread.

Stefan

[PATCH v4 3/3] tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions Facility 3

2022-02-16 Thread David Miller

tests/tcg/s390x/mie3-compl.c: [N]*K instructions
tests/tcg/s390x/mie3-mvcrl.c: MVCRL instruction
tests/tcg/s390x/mie3-sel.c:  SELECT instruction

Signed-off-by: David Miller 
---
 tests/tcg/s390x/Makefile.target | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 1a7238b4eb..54e67446aa 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -1,12 +1,15 @@
 S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
 VPATH+=$(S390X_SRC)
-CFLAGS+=-march=zEC12 -m64
+CFLAGS+=-march=z15 -m64
 TESTS+=hello-s390x
 TESTS+=csst
 TESTS+=ipm
 TESTS+=exrl-trt
 TESTS+=exrl-trtr
 TESTS+=pack
+TESTS+=mie3-compl
+TESTS+=mie3-mvcrl
+TESTS+=mie3-sel
 TESTS+=mvo
 TESTS+=mvc
 TESTS+=shift
-- 
2.32.0

Re: [PATCH v4 3/3] tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions Facility 3

2022-02-16 Thread David Hildenbrand

On 16.02.22 21:03, David Miller wrote:
> tests/tcg/s390x/mie3-compl.c: [N]*K instructions
> tests/tcg/s390x/mie3-mvcrl.c: MVCRL instruction
> tests/tcg/s390x/mie3-sel.c:  SELECT instruction
> 
> Signed-off-by: David Miller 
> ---
>  tests/tcg/s390x/Makefile.target | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
> index 1a7238b4eb..54e67446aa 100644
> --- a/tests/tcg/s390x/Makefile.target
> +++ b/tests/tcg/s390x/Makefile.target
> @@ -1,12 +1,15 @@
>  S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
>  VPATH+=$(S390X_SRC)
> -CFLAGS+=-march=zEC12 -m64
> +CFLAGS+=-march=z15 -m64
>  TESTS+=hello-s390x
>  TESTS+=csst
>  TESTS+=ipm
>  TESTS+=exrl-trt
>  TESTS+=exrl-trtr
>  TESTS+=pack
> +TESTS+=mie3-compl
> +TESTS+=mie3-mvcrl
> +TESTS+=mie3-sel
>  TESTS+=mvo
>  TESTS+=mvc
>  TESTS+=shift

404, tests not found :)

-- 
Thanks,

David / dhildenb

[PATCH v4 14/18] iotests.py: add qemu_io_pipe_and_status()

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

Add a helper that returns both status and output, to be used in the
following commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 4 
 1 file changed, 4 insertions(+)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 6ba65eb1ff..23bc6f686f 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -278,6 +278,10 @@ def qemu_io(*args):
 '''Run qemu-io and return the stdout data'''
 return qemu_tool_pipe_and_status('qemu-io', qemu_io_wrap_args(args))[0]
 
+def qemu_io_pipe_and_status(*args):
+args = qemu_io_args + list(args)
+return qemu_tool_pipe_and_status('qemu-io', args)
+
 def qemu_io_log(*args):
 result = qemu_io(*args)
 log(result, filters=[filter_testfiles, filter_qemu_io])
-- 
2.31.1

Re: [PATCH v4 3/3] tests/tcg/s390x: Tests for Miscellaneous-Instruction-Extensions Facility 3

2022-02-16 Thread David Miller

That is strange: if I unstage them and show the status, they are set to be committed:

null@rygar:~/projects/qemu/build$ git reset --soft HEAD~1
null@rygar:~/projects/qemu/build$ git status
On branch t2
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
modified:   ../tests/tcg/s390x/Makefile.target
new file:   ../tests/tcg/s390x/mie3-compl.c
new file:   ../tests/tcg/s390x/mie3-mvcrl.c
new file:   ../tests/tcg/s390x/mie3-sel.c




On Wed, Feb 16, 2022 at 3:13 PM David Hildenbrand  wrote:
>
> On 16.02.22 21:03, David Miller wrote:
> > tests/tcg/s390x/mie3-compl.c: [N]*K instructions
> > tests/tcg/s390x/mie3-mvcrl.c: MVCRL instruction
> > tests/tcg/s390x/mie3-sel.c:  SELECT instruction
> >
> > Signed-off-by: David Miller 
> > ---
> >  tests/tcg/s390x/Makefile.target | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> >
> > diff --git a/tests/tcg/s390x/Makefile.target 
> > b/tests/tcg/s390x/Makefile.target
> > index 1a7238b4eb..54e67446aa 100644
> > --- a/tests/tcg/s390x/Makefile.target
> > +++ b/tests/tcg/s390x/Makefile.target
> > @@ -1,12 +1,15 @@
> >  S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
> >  VPATH+=$(S390X_SRC)
> > -CFLAGS+=-march=zEC12 -m64
> > +CFLAGS+=-march=z15 -m64
> >  TESTS+=hello-s390x
> >  TESTS+=csst
> >  TESTS+=ipm
> >  TESTS+=exrl-trt
> >  TESTS+=exrl-trtr
> >  TESTS+=pack
> > +TESTS+=mie3-compl
> > +TESTS+=mie3-mvcrl
> > +TESTS+=mie3-sel
> >  TESTS+=mvo
> >  TESTS+=mvc
> >  TESTS+=shift
>
> 404, tests not found :)
>
> --
> Thanks,
>
> David / dhildenb
>

[PATCH v4 17/18] qapi: backup: add immutable-source parameter

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

We are on the way to implement internal-backup with fleecing scheme,
which includes backup job copying from fleecing block driver node
(which is target of copy-before-write filter) to final target of
backup. This job doesn't need own filter, as fleecing block driver node
is a kind of snapshot, it's immutable from reader point of view.

Let's add a parameter for backup to not insert filter but instead
unshare writes on source. This way backup job becomes a simple copying
process.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json  | 11 ++-
 include/block/block_int.h |  1 +
 block/backup.c| 61 +++
 block/replication.c   |  2 +-
 blockdev.c|  1 +
 5 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index a904755e98..30d44683bf 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1436,6 +1436,15 @@
 #above node specified by @drive. If this option is not 
given,
 #a node name is autogenerated. (Since: 4.2)
 #
+# @immutable-source: If true, assume source is immutable, and don't insert 
filter
+#as no copy-before-write operations are needed. It will
+#fail if there are existing writers on source node.
+#Any attempt to add writer to source node during backup 
will
+#also fail. @filter-node-name must not be set.
+#If false, insert copy-before-write filter above source 
node
+#(see also @filter-node-name parameter).
+#Default is false. (Since 6.2)
+#
 # @x-perf: Performance options. (Since 6.0)
 #
 # Features:
@@ -1455,7 +1464,7 @@
 '*on-source-error': 'BlockdevOnError',
 '*on-target-error': 'BlockdevOnError',
 '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
-'*filter-node-name': 'str',
+'*filter-node-name': 'str', '*immutable-source': 'bool',
 '*x-perf': { 'type': 'BackupPerf',
  'features': [ 'unstable' ] } } }
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index c43315ae6e..0270af29ae 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1348,6 +1348,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 BitmapSyncMode bitmap_mode,
 bool compress,
 const char *filter_node_name,
+bool immutable_source,
 BackupPerf *perf,
 BlockdevOnError on_source_error,
 BlockdevOnError on_target_error,
diff --git a/block/backup.c b/block/backup.c
index 21d5983779..104f8fd835 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -34,6 +34,14 @@ typedef struct BackupBlockJob {
 BlockDriverState *cbw;
 BlockDriverState *source_bs;
 BlockDriverState *target_bs;
+BlockBackend *source_blk;
+BlockBackend *target_blk;
+/*
+ * Note that if backup runs with filter (immutable-source parameter is
+ * false), @cbw is set but @source_blk and @target_blk are NULL.
+ * Otherwise if backup runs without filter (immutable-source parameter is
+ * true), @cbw is NULL but @source_blk and @target_blk are set.
+ */
 
 BdrvDirtyBitmap *sync_bitmap;
 
@@ -102,7 +110,17 @@ static void backup_clean(Job *job)
 {
 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
 block_job_remove_all_bdrv(&s->common);
-bdrv_cbw_drop(s->cbw);
+if (s->cbw) {
+assert(!s->source_blk && !s->target_blk);
+bdrv_cbw_drop(s->cbw);
+} else {
+block_copy_state_free(s->bcs);
+s->bcs = NULL;
+blk_unref(s->source_blk);
+s->source_blk = NULL;
+blk_unref(s->target_blk);
+s->target_blk = NULL;
+}
 }
 
 void backup_do_checkpoint(BlockJob *job, Error **errp)
@@ -357,6 +375,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
   BitmapSyncMode bitmap_mode,
   bool compress,
   const char *filter_node_name,
+  bool immutable_source,
   BackupPerf *perf,
   BlockdevOnError on_source_error,
   BlockdevOnError on_target_error,
@@ -369,6 +388,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 int64_t cluster_size;
 BlockDriverState *cbw = NULL;
 BlockCopyState *bcs = NULL;
+BlockBackend *source_blk = NULL, *target_blk = NULL;
 
 assert(bs);
 assert(target);
@@ -377,6 +397,12 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
 assert(sync_bitmap || sync_mode != MIRROR_SYNC_MODE_BITMAP);
 
+if

Re: [PATCH v4 2/3] s390x/cpumodel: Bump up QEMU model to a stripped-down IBM z15 GA1

2022-02-16 Thread David Hildenbrand

On 16.02.22 21:03, David Miller wrote:
> TCG implements everything we need to run basic z15 OS+software
> 
> Signed-off-by: David Miller 

Booting Fedora34 with an upstream kernel (compiled for z15) did work.

Reviewed-by: David Hildenbrand 

Thanks!


-- 
Thanks,

David / dhildenb

[PATCH v4 2/3] s390x/cpumodel: Bump up QEMU model to a stripped-down IBM z15 GA1

2022-02-16 Thread David Miller

TCG implements everything we need to run basic z15 OS+software

Signed-off-by: David Miller 
---
 hw/s390x/s390-virtio-ccw.c  | 3 +++
 target/s390x/cpu_models.c   | 6 +++---
 target/s390x/gen-features.c | 7 +--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 84e3e63c43..90480e7cf9 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -802,7 +802,10 @@ DEFINE_CCW_MACHINE(7_0, "7.0", true);
 
 static void ccw_machine_6_2_instance_options(MachineState *machine)
 {
+static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V6_2 };
+
 ccw_machine_7_0_instance_options(machine);
+s390_set_qemu_cpu_model(0x3906, 14, 2, qemu_cpu_feat);
 }
 
 static void ccw_machine_6_2_class_options(MachineClass *mc)
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 11e06cc51f..89f83e81d5 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -85,9 +85,9 @@ static S390CPUDef s390_cpu_defs[] = {
 CPUDEF_INIT(0x3932, 16, 1, 47, 0x0800U, "gen16b", "IBM 3932 GA1"),
 };
 
-#define QEMU_MAX_CPU_TYPE 0x3906
-#define QEMU_MAX_CPU_GEN 14
-#define QEMU_MAX_CPU_EC_GA 2
+#define QEMU_MAX_CPU_TYPE 0x8561
+#define QEMU_MAX_CPU_GEN 15
+#define QEMU_MAX_CPU_EC_GA 1
 static const S390FeatInit qemu_max_cpu_feat_init = { S390_FEAT_LIST_QEMU_MAX };
 static S390FeatBitmap qemu_max_cpu_feat;
 
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index a3f30f69d9..22846121c4 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -731,16 +731,18 @@ static uint16_t qemu_V6_0[] = {
 S390_FEAT_ESOP,
 };
 
-static uint16_t qemu_LATEST[] = {
+static uint16_t qemu_V6_2[] = {
 S390_FEAT_INSTRUCTION_EXEC_PROT,
 S390_FEAT_MISC_INSTRUCTION_EXT2,
 S390_FEAT_MSA_EXT_8,
 S390_FEAT_VECTOR_ENH,
 };
 
+static uint16_t qemu_LATEST[] = {
+S390_FEAT_MISC_INSTRUCTION_EXT3,
+};
 /* add all new definitions before this point */
 static uint16_t qemu_MAX[] = {
-S390_FEAT_MISC_INSTRUCTION_EXT3,
 /* generates a dependency warning, leave it out for now */
 S390_FEAT_MSA_EXT_5,
 };
@@ -863,6 +865,7 @@ static FeatGroupDefSpec QemuFeatDef[] = {
 QEMU_FEAT_INITIALIZER(V4_0),
 QEMU_FEAT_INITIALIZER(V4_1),
 QEMU_FEAT_INITIALIZER(V6_0),
+QEMU_FEAT_INITIALIZER(V6_2),
 QEMU_FEAT_INITIALIZER(LATEST),
 QEMU_FEAT_INITIALIZER(MAX),
 };
-- 
2.32.0

[PATCH v4 18/18] iotests/image-fleecing: test push backup with fleecing

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/tests/image-fleecing | 121 ++--
 tests/qemu-iotests/tests/image-fleecing.out |  63 ++
 2 files changed, 152 insertions(+), 32 deletions(-)

diff --git a/tests/qemu-iotests/tests/image-fleecing 
b/tests/qemu-iotests/tests/image-fleecing
index 33995612be..903cd50be9 100755
--- a/tests/qemu-iotests/tests/image-fleecing
+++ b/tests/qemu-iotests/tests/image-fleecing
@@ -49,9 +49,15 @@ remainder = [('0xd5', '0x108000',  '32k'), # Right-end of 
partial-left [1]
  ('0xdc', '32M',   '32k'), # Left-end of partial-right [2]
  ('0xcd', '0x3ff', '64k')] # patterns[3]
 
-def do_test(use_cbw, use_snapshot_access_filter, base_img_path,
-fleece_img_path, nbd_sock_path, vm,
+def do_test(vm, use_cbw, use_snapshot_access_filter, base_img_path,
+fleece_img_path, nbd_sock_path=None,
+target_img_path=None,
 bitmap=False):
+push_backup = target_img_path is not None
+assert (nbd_sock_path is not None) != push_backup
+if push_backup:
+assert use_cbw
+
 log('--- Setting up images ---')
 log('')
 
@@ -65,6 +71,9 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 else:
 assert qemu_img('create', '-f', 'qcow2', fleece_img_path, '64M') == 0
 
+if push_backup:
+assert qemu_img('create', '-f', 'qcow2', target_img_path, '64M') == 0
+
 for p in patterns:
 qemu_io('-f', iotests.imgfmt,
 '-c', 'write -P%s %s %s' % p, base_img_path)
@@ -139,27 +148,45 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 
 export_node = 'fl-access' if use_snapshot_access_filter else tmp_node
 
-log('')
-log('--- Setting up NBD Export ---')
-log('')
+if push_backup:
+log('')
+log('--- Starting actual backup ---')
+log('')
 
-nbd_uri = 'nbd+unix:///%s?socket=%s' % (export_node, nbd_sock_path)
-log(vm.qmp('nbd-server-start',
-   {'addr': {'type': 'unix',
- 'data': {'path': nbd_sock_path}}}))
+log(vm.qmp('blockdev-add', **{
+'driver': iotests.imgfmt,
+'node-name': 'target',
+'file': {
+'driver': 'file',
+'filename': target_img_path
+}
+}))
+log(vm.qmp('blockdev-backup', device=export_node,
+   sync='full', target='target',
+   immutable_source=True,
+   job_id='push-backup', speed=1))
+else:
+log('')
+log('--- Setting up NBD Export ---')
+log('')
 
-log(vm.qmp('nbd-server-add', device=export_node))
+nbd_uri = 'nbd+unix:///%s?socket=%s' % (export_node, nbd_sock_path)
+log(vm.qmp('nbd-server-start',
+   {'addr': { 'type': 'unix',
+  'data': { 'path': nbd_sock_path } } }))
 
-log('')
-log('--- Sanity Check ---')
-log('')
+log(vm.qmp('nbd-server-add', device=export_node))
 
-for p in patterns + zeroes:
-cmd = 'read -P%s %s %s' % p
-log(cmd)
-out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd, 
nbd_uri)
-if ret != 0:
-print(out)
+log('')
+log('--- Sanity Check ---')
+log('')
+
+for p in patterns + zeroes:
+cmd = 'read -P%s %s %s' % p
+log(cmd)
+out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd, 
nbd_uri)
+if ret != 0:
+print(out)
 
 log('')
 log('--- Testing COW ---')
@@ -170,6 +197,20 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 log(cmd)
 log(vm.hmp_qemu_io(qom_path, cmd, qdev=True))
 
+if push_backup:
+# Check that previous operations were done during backup, not after
+result = vm.qmp('query-block-jobs')
+if len(result['return']) != 1:
+log('Backup finished too fast, COW is not tested')
+
+result = vm.qmp('block-job-set-speed', device='push-backup', speed=0)
+assert result == {'return': {}}
+
+log(vm.event_wait(name='BLOCK_JOB_COMPLETED',
+  match={'data': {'device': 'push-backup'}}),
+  filters=[iotests.filter_qmp_event])
+log(vm.qmp('blockdev-del', node_name='target'))
+
 log('')
 log('--- Verifying Data ---')
 log('')
@@ -177,15 +218,19 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 for p in patterns + zeroes:
 cmd = 'read -P%s %s %s' % p
 log(cmd)
-out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd, 
nbd_uri)
-if ret != 0:
-print(out)
+if push_backup:
+assert qemu_io_silent('-r', '-c', cmd, target_img_path) == 0
+else:
+out, ret = qemu_io_pipe_and_status('-r', '-f',

Re: [PATCH v4 1/3] s390x/tcg: Implement Miscellaneous-Instruction-Extensions Facility 3 for the s390x

2022-02-16 Thread David Hildenbrand

>  
> +/* SELECT */
> +C(0xb9f0, SELR,RRF_a, MIE3, r2, r3, new, r1_32, loc, 0)
> +C(0xb9e3, SELGR,   RRF_a, MIE3, r2, r3, r1, 0, loc, 0)
> +/* SELECT HIGH */
> +C(0xb9c0, SELFHR,  RRF_a, MIE3, r2, r3, new, r1_32h, loc, 0)
> +

Heh, note how I inverted r2 and r3 in my proposal? That's because op_loc
expects them in different order! With the current order, mie3-sel
actually fails (good!).

Apart from that

Reviewed-by: David Hildenbrand 

Thanks!

-- 
Thanks,

David / dhildenb

[PATCH v4 16/18] block: blk_root(): return non-const pointer

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

In the following patch we'll want to pass blk children to block-copy.
Const pointers are not enough. So, return a non-const pointer from
blk_root().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/sysemu/block-backend.h | 2 +-
 block/block-backend.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index e5e1524f06..904d70f49c 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -277,7 +277,7 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, 
int64_t off_in,
int64_t bytes, BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags);
 
-const BdrvChild *blk_root(BlockBackend *blk);
+BdrvChild *blk_root(BlockBackend *blk);
 
 int blk_make_empty(BlockBackend *blk, Error **errp);
 
diff --git a/block/block-backend.c b/block/block-backend.c
index 4ff6b4d785..97913acfcd 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2464,7 +2464,7 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, 
int64_t off_in,
   bytes, read_flags, write_flags);
 }
 
-const BdrvChild *blk_root(BlockBackend *blk)
+BdrvChild *blk_root(BlockBackend *blk)
 {
 return blk->root;
 }
-- 
2.31.1

[PATCH v4 12/18] block: copy-before-write: realize snapshot-access API

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

Current scheme of image fleecing looks like this:

[guest][NBD export]
  |  |
  |root  | root
  v  v
[copy-before-write] -> [temp.qcow2]
  | target  |
  |file |backing
  v |
[active disk] <-+

 - On guest writes, the copy-before-write filter copies old data from the
   active disk to temp.qcow2. So the fleecing client (NBD export) reads
   changed regions from the temp.qcow2 image and unchanged regions from the
   active disk through the backing link.

This patch makes possible new image fleecing scheme:

[guest]   [NBD export]
   ||
   | root   | root
   v file   v
[copy-before-write]<--[x-snapshot-access]
   |   |
   | file  | target
   v   v
[active-disk] [temp.img]

 - copy-before-write does CBW operations and also provides
   snapshot-access API. The API may be accessed through
   x-snapshot-access driver.

Benefits of new scheme:

1. Access control: if a remote client tries to read data that is not covered
   by the original dirty bitmap used on copy-before-write open, the client
   gets -EACCES.

2. Discard support: if a remote client does a DISCARD, then in addition to
   discarding data in temp.img this informs the block-copy process not to
   copy these clusters. The next read from the discarded area will return
   -EACCES. This is significant: when the fleecing user reads data that was
   not yet copied to temp.img, we can avoid copying it on a further guest
   write.

3. Synchronisation between client reads and block-copy write is more
   efficient. In old scheme we just rely on BDRV_REQ_SERIALISING flag
   used for writes to temp.qcow2. New scheme is less blocking:
 - fleecing reads are never blocked: if data region is untouched or
   in-flight, we just read from active-disk, otherwise we read from
   temp.img
 - writes to temp.img are not blocked by fleecing reads
 - still, guest writes are of course blocked by in-flight fleecing
   reads that currently read from active-disk - it's the minimum
   necessary blocking

4. Temporary image may be of any format, as we don't rely on backing
   feature.

5. Permission relations are simplified. With old scheme we have to share
   write permission on target child of copy-before-write, otherwise
   backing link conflicts with copy-before-write file child write
   permissions. With new scheme we don't have backing link, and
   copy-before-write node may have unshared access to temporary node.
   (Not realized in this commit, will be in future).

6. Having control on fleecing reads we'll be able to implement
   alternative behavior on failed copy-before-write operations.
   Currently we just break guest request (that's a historical behavior
   of backup). But in some scenarios it's a bad behavior: better
   is to drop the backup as failed but don't break guest request.
   With new scheme we can simply unset some bits in a bitmap on CBW
   failure and further fleecing reads will fail with -EACCES, or something like
   this. (Not implemented in this commit, will be in future)
   Additional application for this is implementing timeout for CBW
   operations.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-before-write.c | 212 +-
 1 file changed, 211 insertions(+), 1 deletion(-)

diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 91a2288b66..a8c88f64eb 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -33,12 +33,37 @@
 #include "block/block-copy.h"
 
 #include "block/copy-before-write.h"
+#include "block/reqlist.h"
 
 #include "qapi/qapi-visit-block-core.h"
 
 typedef struct BDRVCopyBeforeWriteState {
 BlockCopyState *bcs;
 BdrvChild *target;
+
+/*
+ * @lock: protects access to @access_bitmap, @done_bitmap and
+ * @frozen_read_reqs
+ */
+CoMutex lock;
+
+/*
+ * @access_bitmap: represents areas allowed for reading by fleecing user.
+ * Reading from non-dirty areas leads to -EACCES.
+ */
+BdrvDirtyBitmap *access_bitmap;
+
+/*
+ * @done_bitmap: represents areas that were successfully copied to @target by
+ * copy-before-write operations.
+ */
+BdrvDirtyBitmap *done_bitmap;
+
+/*
+ * @frozen_read_reqs: current read requests for fleecing user in bs->file
+ * node. These areas must not be rewritten by guest.
+ */
+BlockReqList frozen_read_reqs;
 } BDRVCopyBeforeWriteState;
 
 static coroutine_fn int cbw_co_preadv(
@@ -48,10 +73,20 @@ static coroutine_fn int cbw_co_preadv(
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
+/*
+ * Do copy-before-write operation.
+ *
+ * On failure guest request must be failed too.
+ *
+ * On success, we also wait for all in-flight fleecing read requests in source
+ * node, and it's guaranteed that

[PATCH v4 1/3] s390x/tcg: Implement Miscellaneous-Instruction-Extensions Facility 3 for the s390x

2022-02-16 Thread David Miller

resolves: https://gitlab.com/qemu-project/qemu/-/issues/737
implements:
AND WITH COMPLEMENT   (NCRK, NCGRK)
NAND  (NNRK, NNGRK)
NOT EXCLUSIVE OR  (NXRK, NXGRK)
NOR   (NORK, NOGRK)
OR WITH COMPLEMENT(OCRK, OCGRK)
SELECT(SELR, SELGR)
SELECT HIGH   (SELFHR)
MOVE RIGHT TO LEFT(MVCRL)
POPULATION COUNT  (POPCNT)

Signed-off-by: David Miller 
---
 target/s390x/gen-features.c|  1 +
 target/s390x/helper.h  |  1 +
 target/s390x/tcg/insn-data.def | 30 +--
 target/s390x/tcg/mem_helper.c  | 20 +
 target/s390x/tcg/translate.c   | 53 --
 5 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 7cb1a6ec10..a3f30f69d9 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -740,6 +740,7 @@ static uint16_t qemu_LATEST[] = {
 
 /* add all new definitions before this point */
 static uint16_t qemu_MAX[] = {
+S390_FEAT_MISC_INSTRUCTION_EXT3,
 /* generates a dependency warning, leave it out for now */
 S390_FEAT_MSA_EXT_5,
 };
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 271b081e8c..69f69cf718 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -4,6 +4,7 @@ DEF_HELPER_FLAGS_4(nc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(oc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(xc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(mvc, TCG_CALL_NO_WG, void, env, i32, i64, i64)
+DEF_HELPER_FLAGS_4(mvcrl, TCG_CALL_NO_WG, void, env, i64, i64, i64)
 DEF_HELPER_FLAGS_4(mvcin, TCG_CALL_NO_WG, void, env, i32, i64, i64)
 DEF_HELPER_FLAGS_4(clc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_3(mvcl, i32, env, i32, i32)
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 1c3e115712..efb1d5bc19 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -105,6 +105,9 @@
 D(0xa507, NILL,RI_a,  Z,   r1_o, i2_16u, r1, 0, andi, 0, 0x1000)
 D(0x9400, NI,  SI,Z,   la1, i2_8u, new, 0, ni, nz64, MO_UB)
 D(0xeb54, NIY, SIY,   LD,  la1, i2_8u, new, 0, ni, nz64, MO_UB)
+/* AND WITH COMPLEMENT */
+C(0xb9f5, NCRK,RRF_a, MIE3, r2, r3, new, r1_32, andc, nz32)
+C(0xb9e5, NCGRK,   RRF_a, MIE3, r2, r3, r1, 0, andc, nz64)
 
 /* BRANCH AND LINK */
 C(0x0500, BALR,RR_a,  Z,   0, r2_nz, r1, 0, bal, 0)
@@ -640,6 +643,8 @@
 C(0xeb8e, MVCLU,   RSY_a, E2,  0, a2, 0, 0, mvclu, 0)
 /* MOVE NUMERICS */
 C(0xd100, MVN, SS_a,  Z,   la1, a2, 0, 0, mvn, 0)
+/* MOVE RIGHT TO LEFT */
+C(0xe50a, MVCRL,   SSE,  MIE3, la1, a2, 0, 0, mvcrl, 0)
 /* MOVE PAGE */
 C(0xb254, MVPG,RRE,   Z,   0, 0, 0, 0, mvpg, 0)
 /* MOVE STRING */
@@ -707,6 +712,16 @@
 F(0xed0f, MSEB,RXF,   Z,   e1, m2_32u, new, e1, mseb, 0, IF_BFP)
 F(0xed1f, MSDB,RXF,   Z,   f1, m2_64, new, f1, msdb, 0, IF_BFP)
 
+/* NAND */
+C(0xb974, NNRK,RRF_a, MIE3, r2, r3, new, r1_32, nand, nz32)
+C(0xb964, NNGRK,   RRF_a, MIE3, r2, r3, r1, 0, nand, nz64)
+/* NOR */
+C(0xb976, NORK,RRF_a, MIE3, r2, r3, new, r1_32, nor, nz32)
+C(0xb966, NOGRK,   RRF_a, MIE3, r2, r3, r1, 0, nor, nz64)
+/* NOT EXCLUSIVE OR */
+C(0xb977, NXRK,RRF_a, MIE3, r2, r3, new, r1_32, nxor, nz32)
+C(0xb967, NXGRK,   RRF_a, MIE3, r2, r3, r1, 0, nxor, nz64)
+
 /* OR */
 C(0x1600, OR,  RR_a,  Z,   r1, r2, new, r1_32, or, nz32)
 C(0xb9f6, ORK, RRF_a, DO,  r2, r3, new, r1_32, or, nz32)
@@ -725,6 +740,9 @@
 D(0xa50b, OILL,RI_a,  Z,   r1_o, i2_16u, r1, 0, ori, 0, 0x1000)
 D(0x9600, OI,  SI,Z,   la1, i2_8u, new, 0, oi, nz64, MO_UB)
 D(0xeb56, OIY, SIY,   LD,  la1, i2_8u, new, 0, oi, nz64, MO_UB)
+/* OR WITH COMPLEMENT */
+C(0xb975, OCRK,RRF_a, MIE3, r2, r3, new, r1_32, orc, nz32)
+C(0xb965, OCGRK,   RRF_a, MIE3, r2, r3, r1, 0, orc, nz64)
 
 /* PACK */
 /* Really format SS_b, but we pack both lengths into one argument
@@ -735,6 +753,9 @@
 /* PACK UNICODE */
 C(0xe100, PKU, SS_f,  E2,  la1, a2, 0, 0, pku, 0)
 
+/* POPULATION COUNT */
+C(0xb9e1, POPCNT,  RRF_c, PC,  0, r2_o, r1, 0, popcnt, nz64)
+
 /* PREFETCH */
 /* Implemented as nops of course.  */
 C(0xe336, PFD, RXY_b, GIE, 0, 0, 0, 0, 0, 0)
@@ -743,9 +764,6 @@
 /* Implemented as nop of course.  */
 C(0xb2e8, PPA, RRF_c, PPA, 0, 0, 0, 0, 0, 0)
 
-/* POPULATION COUNT */
-C(0xb9e1, POPCNT,  RRE,   PC,  0, r2_o, r1, 0, popcnt, nz64)
-
 /* ROTATE LEFT SINGLE LOGICAL */
 C(0xeb1d, RLL, RSY_a, Z,   r3_o, sh, new, r1_32, rll32, 0)
 C(0xeb1c, RLLG,RSY_a, Z,   r3_o, sh, r1, 0, rll64, 0)
@@ -765,6 +783,12 @@
 /* SEARCH STRING UNICODE */
 C(0xb9be, SRSTU,   RRE,   ETF3, 0, 0, 0, 0, srstu, 0)
 
+/* SELECT */
+C(0xb9f0, SELR,RRF_a, MIE3, r2, r3, new, r1_32, loc, 0)
+C(0xb9e3, SELGR,   RRF_a,

[PATCH v4 11/18] block: introduce snapshot-access filter

2022-02-16 Thread Vladimir Sementsov-Ogievskiy

The filter simply utilizes snapshot-access API of underlying block
node.

In further patches we want to use it like this:

[guest]   [NBD export]
   ||
   | root   | root
   v file   v
[copy-before-write]<--[snapshot-access]
   |   |
   | file  | target
   v   v
[active-disk] [temp.img]

This way, the NBD client will be able to read the snapshotted state of the
active disk while the active disk continues to be written by the guest. This
is known as "fleecing", and currently uses another scheme based on a qcow2
temporary image whose backing file is active-disk. The new scheme comes
with benefits - see next commit.

The other possible application is exporting internal snapshots of
qcow2, like this:

[guest]  [NBD export]
   |  |
   | root | root
   v   file   v
[qcow2]<-[snapshot-access]

For this, we'll need to implement snapshot-access API handlers in
qcow2 driver, and improve snapshot-access filter (and API) to make it
possible to select snapshot by name.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json|   4 +-
 block/snapshot-access.c | 132 
 MAINTAINERS |   1 +
 block/meson.build   |   1 +
 4 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 block/snapshot-access.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 3bab597506..a904755e98 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2914,13 +2914,14 @@
 # @blkreplay: Since 4.2
 # @compress: Since 5.0
 # @copy-before-write: Since 6.2
+# @snapshot-access: Since 7.0
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDriver',
   'data': [ 'blkdebug', 'blklogwrites', 'blkreplay', 'blkverify', 'bochs',
 'cloop', 'compress', 'copy-before-write', 'copy-on-read', 'dmg',
-'file', 'ftp', 'ftps', 'gluster',
+'file', 'snapshot-access', 'ftp', 'ftps', 'gluster',
 {'name': 'host_cdrom', 'if': 'HAVE_HOST_BLOCK_DEVICE' },
 {'name': 'host_device', 'if': 'HAVE_HOST_BLOCK_DEVICE' },
 'http', 'https', 'iscsi',
@@ -4267,6 +4268,7 @@
   'rbd':'BlockdevOptionsRbd',
   'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'CONFIG_REPLICATION' },
+  'snapshot-access': 'BlockdevOptionsGenericFormat',
   'ssh':'BlockdevOptionsSsh',
   'throttle':   'BlockdevOptionsThrottle',
   'vdi':'BlockdevOptionsGenericFormat',
diff --git a/block/snapshot-access.c b/block/snapshot-access.c
new file mode 100644
index 0000000000..77b87c1946
--- /dev/null
+++ b/block/snapshot-access.c
@@ -0,0 +1,132 @@
+/*
+ * snapshot_access block driver
+ *
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "block/block_int.h"
+
+static coroutine_fn int
+snapshot_access_co_preadv_part(BlockDriverState *bs,
+   int64_t offset, int64_t bytes,
+   QEMUIOVector *qiov, size_t qiov_offset,
+   BdrvRequestFlags flags)
+{
+if (flags) {
+return -ENOTSUP;
+}
+
+return bdrv_co_preadv_snapshot(bs->file, offset, bytes, qiov, qiov_offset);
+}
+
+static int coroutine_fn
+snapshot_access_co_block_status(BlockDriverState *bs,
+bool want_zero, int64_t offset,
+int64_t bytes, int64_t *pnum,
+int64_t *map, BlockDriverState **file)
+{
+return bdrv_co_snapshot_block_status(bs->file->bs, want_zero, offset,
+ bytes, pnum, map, file);
+}
+
+static int coroutine_fn snapshot_access_co_pdiscard(BlockDriverState *bs,
+ int64_t offset, int64_t bytes)
+{
+return bdrv_co_pdiscard_snapshot(bs->file->bs, offset, bytes);
+}
+
+static int coroutine_fn
+snapshot_access_co_pwrite_zeroes(BlockDriverState *bs,
+ int64_t offset, int64_t bytes,
+ BdrvRequestFlags flags)
+{
+return -ENOTSUP;

1 2 3 4 >

1 - 100 of 366 matches

Mail list logo