date:20240227

Re: [PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs

2024-02-27 Thread Marc-André Lureau

Hi

On Tue, Feb 27, 2024 at 4:38 PM Andrey Drobyshev
 wrote:
>
>
>
> On 2/26/24 20:50, Konstantin Kostiuk wrote:
> >
> > Best Regards,
> > Konstantin Kostiuk.
> >
> >
> > On Mon, Feb 26, 2024 at 7:02 PM Andrey Drobyshev
> > mailto:andrey.drobys...@virtuozzo.com>>
> > wrote:
> >
> > Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
> > GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
> > used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail).
> > These calculations might be obscure for the end user and require one to
> > actually get into QGA source to understand how they're obtained. Let's
> > just report the values f_blocks, f_bfree, f_bavail (in bytes) from
> > statvfs() as they are, letting the user decide how to process them
> > further.
> >
> > Originally-by: Yuri Pudgorodskiy  > >
> > Signed-off-by: Andrey Drobyshev  > >
> > ---
> >  qga/commands-posix.c | 16 +++-
> >  qga/qapi-schema.json | 11 +++
> >  2 files changed, 14 insertions(+), 13 deletions(-)
> >
> > diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> > index 26008db497..752ef509d0 100644
> > --- a/qga/commands-posix.c
> > +++ b/qga/commands-posix.c
> > @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo
> > *build_guest_fsinfo(struct FsMount *mount,
> > Error **errp)
> >  {
> >  GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
> > -struct statvfs buf;
> > -unsigned long used, nonroot_total, fr_size;
> > +struct statvfs st;
> >  char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
> >  mount->devmajor, mount->devminor);
> >
> > @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo
> > *build_guest_fsinfo(struct FsMount *mount,
> >  fs->type = g_strdup(mount->devtype);
> >  build_guest_fsinfo_for_device(devpath, fs, errp);
> >
> > -if (statvfs(fs->mountpoint, &buf) == 0) {
> > -fr_size = buf.f_frsize;
> > -used = buf.f_blocks - buf.f_bfree;
> > -nonroot_total = used + buf.f_bavail;
> > -fs->used_bytes = used * fr_size;
> > -fs->total_bytes = nonroot_total * fr_size;
> > +if (statvfs(fs->mountpoint, &st) == 0) {
> > +fs->total_bytes = st.f_blocks * st.f_frsize;
> > +fs->free_bytes = st.f_bfree * st.f_frsize;
> > +fs->avail_bytes = st.f_bavail * st.f_frsize;
> >
> >  fs->has_total_bytes = true;
> > -fs->has_used_bytes = true;
> > +fs->has_free_bytes = true;
> > +fs->has_avail_bytes = true;
> >  }
> >
> >  g_free(devpath);
> > diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> > index b8efe31897..1cce3c1df5 100644
> > --- a/qga/qapi-schema.json
> > +++ b/qga/qapi-schema.json
> > @@ -1030,9 +1030,12 @@
> >  #
> >  # @type: file system type string
> >  #
> > -# @used-bytes: file system used bytes (since 3.0)
> > +# @total-bytes: total file system size in bytes (since 8.3)
> >  #
> > -# @total-bytes: non-root file system total bytes (since 3.0)
> > +# @free-bytes: amount of free space in file system in bytes (since 8.3)
> >
> >
> > I don't agree with this as it breaks backward compatibility. If we want
> > to get
> > these changes we should release a new version with both old and new fields
> > and mark old as deprecated to get a time for everyone who uses this
> > API updates its solutions.
> >
> > A similar thing was with replacing the 'blacklist' command line.
> > https://gitlab.com/qemu-project/qemu/-/commit/582a098e6ca00dd42f317dad8affd13e5a20bc42
> >  
> > 
> > Currently, we support both 'blacklist' and 'block-rpcs' command line options
> > but the first one wrote a warning.
> >
>
> I agree that marking the old values as deprecated does make sense.
> Although my original intent with this patch is to make more sense of the
> existing names (e.g. total-bytes to indicate true fs size instead of
> just non-root fs).  If so, we'd eventually have to replace the original
> total-bytes value with the one having new semantics.  Or we could rename
> the existing value to smth like "total-bytes-nonroot".  But either way
> breaks backward compatibility after all.  How would you suggest to
> resolve it?


Why break backward compatibility? Don't break other systems (win32)
when you propose a patch.

QGA API aims to be cross-platform. Any system should be able to report
some kind of meaningful used and total disk space. I don't see much
reason to change that.

If we need Posix-specific values reported by statvfs(), we can have
extra optional

Re: [PATCH v2] ppc/pnv: Improve pervasive topology calculation for big-core

2024-02-27 Thread Cédric Le Goater


On 2/27/24 21:36, Caleb Schlossin wrote:

Big (SMT8) cores have a complicated function to map the core, thread ID
to pervasive topology (PIR). Fix this for power8, power9, and power10.

Signed-off-by: Caleb Schlossin 
---

Version 2 fixes the PIR calculation for core, thread ID
for power10 big cores (SMT8).


Looks good for SMT4 and this change prepares the ground for SMT8. We would need
a new CPU definition to activate big cores. It can come later.

Reviewed-by: Cédric Le Goater 

Thanks,

C.





  include/hw/ppc/pnv_chip.h |  2 +-
  include/hw/ppc/pnv_core.h |  1 +
  hw/ppc/pnv.c  | 71 ---
  hw/ppc/pnv_core.c |  8 ++---
  target/ppc/misc_helper.c  |  3 --
  5 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/include/hw/ppc/pnv_chip.h b/include/hw/ppc/pnv_chip.h
index af4cd7a8b8..8589f3291e 100644
--- a/include/hw/ppc/pnv_chip.h
+++ b/include/hw/ppc/pnv_chip.h
@@ -147,7 +147,7 @@ struct PnvChipClass {
  
  DeviceRealize parent_realize;
  
-uint32_t (*core_pir)(PnvChip *chip, uint32_t core_id);

+uint32_t (*chip_pir)(PnvChip *chip, uint32_t core_id, uint32_t thread_id);
  void (*intc_create)(PnvChip *chip, PowerPCCPU *cpu, Error **errp);
  void (*intc_reset)(PnvChip *chip, PowerPCCPU *cpu);
  void (*intc_destroy)(PnvChip *chip, PowerPCCPU *cpu);
diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
index 4db21229a6..c6d62fd145 100644
--- a/include/hw/ppc/pnv_core.h
+++ b/include/hw/ppc/pnv_core.h
@@ -36,6 +36,7 @@ struct PnvCore {
  /*< public >*/
  PowerPCCPU **threads;
  uint32_t pir;
+uint32_t hwid;
  uint64_t hrmor;
  PnvChip *chip;
  
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c

index 0b47b92baa..aa5aba60b4 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -141,8 +141,10 @@ static void pnv_dt_core(PnvChip *chip, PnvCore *pc, void 
*fdt)
  int smt_threads = CPU_CORE(pc)->nr_threads;
  CPUPPCState *env = &cpu->env;
  PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
+PnvChipClass *pnv_cc = PNV_CHIP_GET_CLASS(chip);
  g_autofree uint32_t *servers_prop = g_new(uint32_t, smt_threads);
  int i;
+uint32_t pir;
  uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
 0xffffffff, 0xffffffff};
  uint32_t tbfreq = PNV_TIMEBASE_FREQ;
@@ -158,15 +160,17 @@ static void pnv_dt_core(PnvChip *chip, PnvCore *pc, void 
*fdt)
  char *nodename;
  int cpus_offset = get_cpus_node(fdt);
  
-nodename = g_strdup_printf("%s@%x", dc->fw_name, pc->pir);

+pir = pnv_cc->chip_pir(chip, pc->hwid, 0);
+
+nodename = g_strdup_printf("%s@%x", dc->fw_name, pir);
  offset = fdt_add_subnode(fdt, cpus_offset, nodename);
  _FDT(offset);
  g_free(nodename);
  
  _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id", chip->chip_id)));
  
-_FDT((fdt_setprop_cell(fdt, offset, "reg", pc->pir)));

-_FDT((fdt_setprop_cell(fdt, offset, "ibm,pir", pc->pir)));
+_FDT((fdt_setprop_cell(fdt, offset, "reg", pir)));
+_FDT((fdt_setprop_cell(fdt, offset, "ibm,pir", pir)));
  _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
  
  _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));

@@ -241,15 +245,17 @@ static void pnv_dt_core(PnvChip *chip, PnvCore *pc, void 
*fdt)
  
  /* Build interrupt servers properties */

  for (i = 0; i < smt_threads; i++) {
-servers_prop[i] = cpu_to_be32(pc->pir + i);
+servers_prop[i] = cpu_to_be32(pnv_cc->chip_pir(chip, pc->hwid, i));
  }
  _FDT((fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
 servers_prop, sizeof(*servers_prop) * smt_threads)));
  }
  
-static void pnv_dt_icp(PnvChip *chip, void *fdt, uint32_t pir,

+static void pnv_dt_icp(PnvChip *chip, void *fdt, uint32_t hwid,
 uint32_t nr_threads)
  {
+PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
+uint32_t pir = pcc->chip_pir(chip, hwid, 0);
  uint64_t addr = PNV_ICP_BASE(chip) | (pir << 12);
  char *name;
  const char compat[] = "IBM,power8-icp\0IBM,ppc-xicp";
@@ -263,6 +269,7 @@ static void pnv_dt_icp(PnvChip *chip, void *fdt, uint32_t 
pir,
  rsize = sizeof(uint64_t) * 2 * nr_threads;
  reg = g_malloc(rsize);
  for (i = 0; i < nr_threads; i++) {
+/* We know P8 PIR is linear with thread id */
  reg[i * 2] = cpu_to_be64(addr | ((pir + i) * 0x1000));
  reg[i * 2 + 1] = cpu_to_be64(0x1000);
  }
@@ -315,7 +322,7 @@ static void pnv_chip_power8_dt_populate(PnvChip *chip, void 
*fdt)
  pnv_dt_core(chip, pnv_core, fdt);
  
  /* Interrupt Control Presenters (ICP). One per core. */

-pnv_dt_icp(chip, fdt, pnv_core->pir, CPU_CORE(pnv_core)->nr_threads);
+pnv_dt_icp(chip, fdt, pnv_core->hwid, CPU_CORE(pnv_core)->nr_threads);
  }
  
  if (chip->ram_size) {

@@ -995,9 +1002,10 @@ static void pnv_init(MachineState *machine)
   *   25:28  Core

Re: [PATCH v7 1/2] qom: new object to associate device to numa node

2024-02-27 Thread Markus Armbruster

 writes:

> From: Ankit Agrawal 
>
> NVIDIA GPUs support the MIG (Multi-Instance GPU) feature [1], which allows
> partitioning of the GPU device resources (including device memory) into
> several (upto 8) isolated instances. Each of the partitioned memory needs
> a dedicated NUMA node to operate. The partitions are not fixed and they
> can be created/deleted at runtime.
>
> Unfortunately Linux OS does not provide a means to dynamically create/destroy
> NUMA nodes and such feature implementation is not expected to be trivial. The
> nodes that OS discovers at the boot time while parsing SRAT remains fixed. So
> we utilize the Generic Initiator Affinity structures that allows association
> between nodes and devices. Multiple GI structures per BDF is possible,
> allowing creation of multiple nodes by exposing unique PXM in each of these
> structures.
>
> Implement the mechanism to build the GI affinity structures as Qemu currently
> does not. Introduce a new acpi-generic-initiator object to allow host admin
> link a device with an associated NUMA node. Qemu maintains this association
> and use this object to build the requisite GI Affinity Structure.
>
> When multiple numa nodes are associated with a device, it is required to
> create those many number of acpi-generic-initiator objects, each representing
> a unique device:node association.
>
> Following is one of a decoded GI affinity structure in VM ACPI SRAT.
> [0C8h 0200   1]Subtable Type : 05 [Generic Initiator Affinity]
> [0C9h 0201   1]   Length : 20
>
> [0CAh 0202   1]Reserved1 : 00
> [0CBh 0203   1]   Device Handle Type : 01
> [0CCh 0204   4] Proximity Domain : 0007
> [0D0h 0208  16]Device Handle : 00 00 20 00 00 00 00 00 00 00 
> 00
> 00 00 00 00 00
> [0E0h 0224   4]Flags (decoded below) : 0001
>  Enabled : 1
> [0E4h 0228   4]Reserved2 : 
>
> [0E8h 0232   1]Subtable Type : 05 [Generic Initiator Affinity]
> [0E9h 0233   1]   Length : 20
>
> An admin can provide a range of acpi-generic-initiator objects, each
> associating a device (by providing the id through pci-dev argument)
> to the desired numa node (using the node argument). Currently, only PCI
> device is supported.
>
> For the grace hopper system, create a range of 8 nodes and associate that
> with the device using the acpi-generic-initiator object. While a configuration
> of less than 8 nodes per device is allowed, such configuration will prevent
> utilization of the feature to the fullest. The following sample creates 8
> nodes per PCI device for a VM with 2 PCI devices and link them to the
> respective PCI device using acpi-generic-initiator objects:
>
> -numa node,nodeid=2 -numa node,nodeid=3 -numa node,nodeid=4 \
> -numa node,nodeid=5 -numa node,nodeid=6 -numa node,nodeid=7 \
> -numa node,nodeid=8 -numa node,nodeid=9 \
> -device 
> vfio-pci-nohotplug,host=0009:01:00.0,bus=pcie.0,addr=04.0,rombar=0,id=dev0 \
> -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=2 \
> -object acpi-generic-initiator,id=gi1,pci-dev=dev0,node=3 \
> -object acpi-generic-initiator,id=gi2,pci-dev=dev0,node=4 \
> -object acpi-generic-initiator,id=gi3,pci-dev=dev0,node=5 \
> -object acpi-generic-initiator,id=gi4,pci-dev=dev0,node=6 \
> -object acpi-generic-initiator,id=gi5,pci-dev=dev0,node=7 \
> -object acpi-generic-initiator,id=gi6,pci-dev=dev0,node=8 \
> -object acpi-generic-initiator,id=gi7,pci-dev=dev0,node=9 \
>
> -numa node,nodeid=10 -numa node,nodeid=11 -numa node,nodeid=12 \
> -numa node,nodeid=13 -numa node,nodeid=14 -numa node,nodeid=15 \
> -numa node,nodeid=16 -numa node,nodeid=17 \
> -device 
> vfio-pci-nohotplug,host=0009:01:01.0,bus=pcie.0,addr=05.0,rombar=0,id=dev1 \
> -object acpi-generic-initiator,id=gi8,pci-dev=dev1,node=10 \
> -object acpi-generic-initiator,id=gi9,pci-dev=dev1,node=11 \
> -object acpi-generic-initiator,id=gi10,pci-dev=dev1,node=12 \
> -object acpi-generic-initiator,id=gi11,pci-dev=dev1,node=13 \
> -object acpi-generic-initiator,id=gi12,pci-dev=dev1,node=14 \
> -object acpi-generic-initiator,id=gi13,pci-dev=dev1,node=15 \
> -object acpi-generic-initiator,id=gi14,pci-dev=dev1,node=16 \
> -object acpi-generic-initiator,id=gi15,pci-dev=dev1,node=17 \
>
> The performance benefits can be realized by providing the NUMA node distances
> appropriately (through libvirt tags or Qemu params). The admin can get the
> distance among nodes in hardware using `numactl -H`.
>
> [1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu
>
> Signed-off-by: Ankit Agrawal 
> ---
>  hw/acpi/acpi-generic-initiator.c | 70 
>  hw/acpi/meson.build  |  1 +
>  include/hw/acpi/acpi-generic-initiator.h | 32 +++
>  qapi/qom.json| 17 ++
>  4 files changed, 120 insertions(+)
>  create mode 100644

Re: [PATCH v2 1/7] hw/cxl/cxl-host: Fix missing ERRP_GUARD() in cxl_fixed_memory_window_config()

2024-02-27 Thread Markus Armbruster

Zhao Liu  writes:

> From: Zhao Liu 
>
> As the comment in qapi/error, dereferencing @errp requires
> ERRP_GUARD():
>
> * = Why, when and how to use ERRP_GUARD() =
> *
> * Without ERRP_GUARD(), use of the @errp parameter is restricted:
> * - It must not be dereferenced, because it may be null.
> ...
> * ERRP_GUARD() lifts these restrictions.
> *
> * To use ERRP_GUARD(), add it right at the beginning of the function.
> * @errp can then be used without worrying about the argument being
> * NULL or &error_fatal.
> *
> * Using it when it's not needed is safe, but please avoid cluttering
> * the source with useless code.
>
> But in cxl_fixed_memory_window_config(), @errp is dereferenced in 2
> places without ERRP_GUARD():
>
> fw->enc_int_ways = cxl_interleave_ways_enc(fw->num_targets, errp);
> if (*errp) {
> return;
> }
>
> and
>
> fw->enc_int_gran =
> cxl_interleave_granularity_enc(object->interleave_granularity,
>errp);
> if (*errp) {
> return;
> }

No need to quote the dereferences in full.

  But in cxl_fixed_memory_window_config(), @errp is dereferenced in 2
  places without ERRP_GUARD().

  In these two places, we check "*errp", because neither function

would do.

Same for the other commit messages.

Hardly worth a respin, though :)

> For the above 2 places, we check "*errp", because neither function
> returns a suitable error code. And since machine_set_cfmw() - the caller
> of cxl_fixed_memory_window_config() - doesn't get the NULL @errp
> parameter as the "set" method of object property,
> cxl_fixed_memory_window_config() hasn't triggered the bug that
> dereferencing the NULL @errp.
>
> To follow the requirement of @errp, add missing ERRP_GUARD() in
> cxl_fixed_memory_window_config().
>
> Suggested-by: Markus Armbruster 
> Signed-off-by: Zhao Liu 
> Reviewed-by: Markus Armbruster 
> ---
> Suggested by credit:
>  Markus: Referred his explanation about ERRP_GUARD().
> ---
> v2:
>  * Add the @errp dereference code in commit message to make review
>easier. (Markus)
> ---
>  hw/cxl/cxl-host.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
> index 2aa776c79c74..c5f5fcfd64d0 100644
> --- a/hw/cxl/cxl-host.c
> +++ b/hw/cxl/cxl-host.c
> @@ -26,6 +26,7 @@ static void cxl_fixed_memory_window_config(CXLState 
> *cxl_state,
> CXLFixedMemoryWindowOptions 
> *object,
> Error **errp)
>  {
> +ERRP_GUARD();
>  g_autofree CXLFixedWindow *fw = g_malloc0(sizeof(*fw));
>  strList *target;
>  int i;

Re: [PATCH V4 14/14] migration: options incompatible with cpr

2024-02-27 Thread Markus Armbruster

Steve Sistare  writes:

> Fail the migration request if options are set that are incompatible
> with cpr.
>
> Signed-off-by: Steve Sistare 
> ---
>  migration/migration.c | 17 +
>  qapi/migration.json   |  2 ++
>  2 files changed, 19 insertions(+)
>
> diff --git a/migration/migration.c b/migration/migration.c
> index 90a9094..7652fd4 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1953,6 +1953,23 @@ static bool migrate_prepare(MigrationState *s, bool 
> blk, bool blk_inc,
>  return false;
>  }
>  
> +if (migrate_mode_is_cpr(s)) {
> +const char *conflict = NULL;
> +
> +if (migrate_postcopy()) {
> +conflict = "postcopy";
> +} else if (migrate_background_snapshot()) {
> +conflict = "background snapshot";
> +} else if (migrate_colo()) {
> +conflict = "COLO";
> +}
> +
> +if (conflict) {
> +error_setg(errp, "Cannot use %s with CPR", conflict);
> +return false;
> +}
> +}
> +
>  if (blk || blk_inc) {
>  if (migrate_colo()) {
>  error_setg(errp, "No disk migration is required in COLO mode");
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 0990297..c6bfe2e 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -657,6 +657,8 @@
>  # shared backend must be be non-volatile across reboot, such as by 
> backing
>  # it with a dax device.
>  #
> +# cpr-reboot may not be used with postcopy, colo, or background-snapshot.
> +#

@cpr-reboot

COLO

Wrap the line:

   # @cpr-reboot may not be used with postcopy, COLO, or
   # background-snapshot.

This doesn't tell the reader what settings exactly do not work with
@cpr-reboot.

For instance "background-snapshot" is about enabling migration
capability @background-snapshot.  We could write something like "is
incompatible with enabling migration capability @background-snapshot".

Same for the other two.  Worthwhile?

>  # (since 8.2)
>  ##
>  { 'enum': 'MigMode',

Re: [PATCH V4 13/14] migration: update cpr-reboot description

2024-02-27 Thread Markus Armbruster

Steve Sistare  writes:

> Clarify qapi for cpr-reboot migration mode, and add vfio support.

The patch only affects documentation, but that's less than clear from
the commit message.  Suggest

  Improve documentation for migration mode @cpr-reboot.  In particular,
  document VFIO support.

> Signed-off-by: Steve Sistare 
> ---
>  qapi/migration.json | 35 ++-
>  1 file changed, 22 insertions(+), 13 deletions(-)
>
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 5a565d9..0990297 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -636,19 +636,28 @@
>  #
>  # @normal: the original form of migration. (since 8.2)
>  #
> -# @cpr-reboot: The migrate command saves state to a file, allowing one to
> -#  quit qemu, reboot to an updated kernel, and restart an updated
> -#  version of qemu.  The caller must specify a migration URI
> -#  that writes to and reads from a file.  Unlike normal mode,
> -#  the use of certain local storage options does not block the
> -#  migration, but the caller must not modify guest block devices
> -#  between the quit and restart.  To avoid saving guest RAM to 
> the
> -#  file, the memory backend must be shared, and the 
> @x-ignore-shared
> -#  migration capability must be set.  Guest RAM must be 
> non-volatile
> -#  across reboot, such as by backing it with a dax device, but 
> this
> -#  is not enforced.  The restarted qemu arguments must match 
> those
> -#  used to initially start qemu, plus the -incoming option.
> -#  (since 8.2)
> +# @cpr-reboot: The migrate command stops the VM and saves state to the URI.
> +# After quitting qemu, the user resumes by running qemu -incoming.

These two sentences apply to any migration mode, don't they?  Just
checking I understand.

> +#
> +# This mode allows the user to quit qemu, and restart an updated version
> +# of qemu.  The user may even update and reboot the OS before restarting,
> +# as long as the URI persists across a reboot.

Hmm, doesn't normal migration also support migrating to a newer QEMU?

> +#
> +# Unlike normal mode, the use of certain local storage options does not
> +# block the migration, but the user must not modify guest block devices
> +# between the quit and restart.

"Must not modify the contents of the guest block devices"?

> +#
> +# This mode supports vfio devices provided the user first puts the guest

"VFIO devices"

> +# in the suspended runstate, such as by issuing guest-suspend-ram to the
> +# qemu guest agent.
> +#
> +# Best performance is achieved when the memory backend is shared and the
> +# @x-ignore-shared migration capability is set, but this is not required.
> +# Further, if the user reboots before restarting such a configuration, 
> the
> +# shared backend must be be non-volatile across reboot, such as by 
> backing

Typo: "be be non-volatile"

Suggest "the shared memory must persist"

> +# it with a dax device.
> +#
> +# (since 8.2)
>  ##
>  { 'enum': 'MigMode',
>'data': [ 'normal', 'cpr-reboot' ] }

Thanks for adjusting indentation to conform to conventions.  Please
additionally reflow the text to limit line length to 70 characters.

Re: [PATCH v9 2/4] qmp: add dump machine type compatibility properties

2024-02-27 Thread Markus Armbruster

Maksim Davydov  writes:

> To control that creating new machine type doesn't affect the previous
> types (their compat_props) and to check complex compat_props inheritance
> we need qmp command to print machine type compatibility properties.
> This patch adds the ability to get list of all the compat_props of the
> corresponding supported machines for their comparison via new optional
> argument of "query-machines" command. Since information on compatibility
> properties can increase the command output by a factor of 40, add an
> argument to enable it, default off.
>
> Signed-off-by: Maksim Davydov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

QAPI schema
Acked-by: Markus Armbruster

[PATCH] Fixed tlb huge page loading issue

2024-02-27 Thread Xianglai Li

The lddir and ldpte instruction emulation has
a problem with the use of large page processing above level 2.
The page size is not correctly calculated,
resulting in the wrong page size of the table entry found by tlb.

Signed-off-by: Xianglai Li 
---
 target/loongarch/cpu.h|  1 +
 target/loongarch/tcg/tlb_helper.c | 21 -
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index ec37579fd6..eab3e41c71 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -292,6 +292,7 @@ typedef struct CPUArchState {
 uint32_t fcsr0_mask;
 
 uint32_t cpucfg[21];
+uint32_t lddir_ps;
 
 uint64_t lladdr; /* LL virtual address compared against SC */
 uint64_t llval;
diff --git a/target/loongarch/tcg/tlb_helper.c 
b/target/loongarch/tcg/tlb_helper.c
index a08c08b05a..3594c800b3 100644
--- a/target/loongarch/tcg/tlb_helper.c
+++ b/target/loongarch/tcg/tlb_helper.c
@@ -38,6 +38,7 @@ static void raise_mmu_exception(CPULoongArchState *env, 
target_ulong address,
 cs->exception_index = EXCCODE_PIF;
 }
 env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR, 
1);
+env->lddir_ps = 0;
 break;
 case TLBRET_INVALID:
 /* TLB match with no valid bit */
@@ -488,13 +489,6 @@ target_ulong helper_lddir(CPULoongArchState *env, 
target_ulong base,
 uint64_t dir_base, dir_width;
 bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1;
 
-badvaddr = env->CSR_TLBRBADV;
-base = base & TARGET_PHYS_MASK;
-
-/* 0:64bit, 1:128bit, 2:192bit, 3:256bit */
-shift = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTEWIDTH);
-shift = (shift + 1) * 3;
-
 if (huge) {
 return base;
 }
@@ -519,9 +513,18 @@ target_ulong helper_lddir(CPULoongArchState *env, 
target_ulong base,
 do_raise_exception(env, EXCCODE_INE, GETPC());
 return 0;
 }
+
+/* 0:64bit, 1:128bit, 2:192bit, 3:256bit */
+shift = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTEWIDTH);
+shift = (shift + 1) * 3;
+badvaddr = env->CSR_TLBRBADV;
+base = base & TARGET_PHYS_MASK;
 index = (badvaddr >> dir_base) & ((1 << dir_width) - 1);
 phys = base | index << shift;
 ret = ldq_phys(cs->as, phys) & TARGET_PHYS_MASK;
+if (ret & BIT_ULL(LOONGARCH_PAGE_HUGE_SHIFT)) {
+env->lddir_ps = dir_base;
+}
 return ret;
 }
 
@@ -538,13 +541,13 @@ void helper_ldpte(CPULoongArchState *env, target_ulong 
base, target_ulong odd,
 base = base & TARGET_PHYS_MASK;
 
 if (huge) {
-/* Huge Page. base is paddr */
 tmp0 = base ^ (1 << LOONGARCH_PAGE_HUGE_SHIFT);
 /* Move Global bit */
 tmp0 = ((tmp0 & (1 << LOONGARCH_HGLOBAL_SHIFT))  >>
 LOONGARCH_HGLOBAL_SHIFT) << R_TLBENTRY_G_SHIFT |
 (tmp0 & (~(1 << LOONGARCH_HGLOBAL_SHIFT)));
-ps = ptbase + ptwidth - 1;
+
+ps = env->lddir_ps - 1;
 if (odd) {
 tmp0 += MAKE_64BIT_MASK(ps, 1);
 }
-- 
2.39.1

Re: [EXT] Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem multiplier

2024-02-27 Thread Michael S. Tsirkin

On Wed, Feb 28, 2024 at 06:13:03AM +, Srujana Challa wrote:
> > Subject: [EXT] Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem
> > multiplier
> > 
> > External Email
> > 
> > --
> > On Fri, Feb 23, 2024 at 10:56:17AM +0530, Srujana Challa wrote:
> > > Currently, virtio_pci_queue_mem_mult function always returns 4K when
> > > VIRTIO_PCI_FLAG_PAGE_PER_VQ is set. But this won't work for vhost vdpa
> > > when host has page size other than 4K.
> > > This patch introduces a new property(host-page-per-vq) for vdpa use
> > > case to fix the same.
> > >
> > > Signed-off-by: Srujana Challa 
> > 
> > Looks good. I'd like to fail realize if both
> >(proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) and
> >(proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ) so users do not
> > start depending on this combination.
> Could you confirm if we can add assertion for this case in 
> virtio_pci_mem_mult() function?

No, reporting an error would be better since it's user-triggerable -
it is not nice to report user errors through assertions,
assertions are for conditions that can not be reached.


> > 
> > 
> > 
> > > ---
> > > v2->v3:
> > > - Modified property name, page-per-vdpa-vq to host-page-per-vq.
> > >
> > > v1->v2:
> > > - Introduced a new property to get virtqueue mem multiplier for
> > >   vdpa use case.
> > >
> > >  hw/virtio/virtio-pci.c | 10 --
> > >  include/hw/virtio/virtio-pci.h |  5 +
> > >  2 files changed, 13 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index
> > > 1a7039fb0c..f29e60830b 100644
> > > --- a/hw/virtio/virtio-pci.c
> > > +++ b/hw/virtio/virtio-pci.c
> > > @@ -320,8 +320,12 @@ static bool
> > > virtio_pci_ioeventfd_enabled(DeviceState *d)
> > >
> > >  static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy
> > > *proxy)  {
> > > -return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
> > > -QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
> > > +if (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
> > > +return QEMU_VIRTIO_PCI_QUEUE_MEM_MULT;
> > > +else if (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
> > > +return qemu_real_host_page_size();
> > > +else
> > > +return 4;
> > >  }
> > >
> > >  static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier
> > > *notifier, @@ -2301,6 +2305,8 @@ static Property virtio_pci_properties[] =
> > {
> > >  VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
> > >  DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
> > >  VIRTIO_PCI_FLAG_AER_BIT, false),
> > > +DEFINE_PROP_BIT("host-page-per-vq", VirtIOPCIProxy, flags,
> > > +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT, false),
> > >  DEFINE_PROP_END_OF_LIST(),
> > >  };
> > >
> > > diff --git a/include/hw/virtio/virtio-pci.h
> > > b/include/hw/virtio/virtio-pci.h index 59d88018c1..81b6de4291 100644
> > > --- a/include/hw/virtio/virtio-pci.h
> > > +++ b/include/hw/virtio/virtio-pci.h
> > > @@ -43,6 +43,7 @@ enum {
> > >  VIRTIO_PCI_FLAG_INIT_FLR_BIT,
> > >  VIRTIO_PCI_FLAG_AER_BIT,
> > >  VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
> > > +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT,
> > >  };
> > >
> > >  /* Need to activate work-arounds for buggy guests at vmstate load. */
> > > @@ -89,6 +90,10 @@ enum {  #define
> > VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
> > >(1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
> > >
> > > +/* page per vdpa vq flag to be used for vhost vdpa backends */
> > > +#define VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ \
> > > +(1 << VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT)
> > > +
> > >  typedef struct {
> > >  MSIMessage msg;
> > >  int virq;
> > > --
> > > 2.25.1

Re: [PATCH] qapi: Craft the BlockdevCreateOptionsLUKS comment

2024-02-27 Thread Markus Armbruster

Yong Huang  writes:

> On Wed, Feb 21, 2024 at 4:26 PM Markus Armbruster  wrote:
>
>> Yong Huang  writes:
>>
>> > On Wed, Feb 21, 2024 at 2:43 PM Markus Armbruster 
>> wrote:
>> >
>> >> Hyman Huang  writes:
>> >>
>> >> > Add comment in detail for commit 433957bb7f (qapi:
>> >> > Make parameter 'file' optional for
>> >> > BlockdevCreateOptionsLUKS).
>> >> >
>> >> > Signed-off-by: Hyman Huang 
>> >> > ---
>> >> >  qapi/block-core.json | 20 +++-
>> >> >  1 file changed, 19 insertions(+), 1 deletion(-)
>> >> >
>> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
>> >> > index ab5a93a966..42b0840d43 100644
>> >> > --- a/qapi/block-core.json
>> >> > +++ b/qapi/block-core.json
>> >> > @@ -4973,7 +4973,25 @@
>> >> >  ##
>> >> >  # @BlockdevCreateOptionsLUKS:
>> >> >  #
>> >> > -# Driver specific image creation options for LUKS.
>> >> > +# Driver specific image creation options for LUKS. Note that
>> >> > +# @file is required if @preallocation is specified and equals
>> >> > +# PREALLOC_MODE_ON. The following three scenarios determine how
>> >> > +# creation logic behaves when @preallocation is either equal to
>> >> > +# PREALLOC_MODE_OFF or is not given:
>> >> > +#
>> >> > +#  1) When @file is given only, format the block device referenced
>> >> > +# by @file as the LUKS specification and trunk it to the @size.
>> >>
>> >> Do you mean "truncate it to @size"?
>> >>
>> > Yes, :( sorry for the spelling mistake.
>>
>> Writing good documentation in a second language is *hard*.  All we can
>> reasonably expect from contributors is that they try their best.  And then we
>> improve the text together in review.  Just like we do for code :)
>>
>> >> > +# In this case, the @size should reflect amount of space made
>> >> > +# available to the guest, so the trunk size must take account
>> >> > +# of that which will be used by the crypto header.
>> >> > +#
>> >> > +#  2) When @header is given only, just format the block device
>> >> > +# referenced by @header as the LUKS specification.
>> >> > +#
>> >> > +#  3) When both @file and @header are given, block device
>> >> > +# referenced by @file should be trunked to @size, and block
>> >> > +# device referenced by @header should be formatted as the LUKS
>> >> > +# specification.
>> >> >  #
>> >> >  # @file: Node to create the image format on, mandatory except when
>> >> >  #'preallocation' is not requested
>> >>
>> >> Let's see whether I understand.
>> >>
>> >> blockdev-create with "driver": "luks" can work in three different ways:
>> >>
>> >> 1. Create an image with a LUKS header
>> >>
>> >> 2. Create just a detached LUKS header
>> >>
>> >> 3. Create an image and a detached LUKS header
>> >>
>> >> Correct?
>> >>
>> >
>> > Yes
>> >
>> >
>> >> @file and @header are BlockdevRef, which means they refer to existing
>> >> images with arbitrary driver.  Could be "file", "qcow2", or anything.
>> >>
>> >> Correct?
>> >>
>> > Yes
>> >
>> >
>> >>
>> >> To get 1., specify @file, but not @header.
>> >>
>> >> To get 2., specify @header, but not @file.
>> >>
>> >> To get 3., specify both.
>> >>
>> >> Specifying neither is an error.
>> >>
>> >> Correct?
>> >>
>> >
>> > Yes
>> >
>> >
>> >> In any case, @size is the logical size of the image (how much data it
>> >> can hold).
>> >>
>> >
>> > Yes
>> >
>> >
>> >>
>> >> With 1., the actual image size is a bit larger due to the LUKS header.
>> >> The @file image is resized to that size: if it's shorter, it's grown, if
>> >> it's longer, it's truncated.
>> >>
>> >
>> > Yes
>> >
>> >
>> >> With 2., @size is merely recorded in the detached LUKS header.
>> >>
>> >
>> > In LUKS1 specification, payload data size is not contained in the header,
>> > so in this case, @size is not recorded in the detached LUKS header.
>> > The creation logic just does the LUKS header formatting only.
>>
>> Is @size unused then?
>>
>
> IIUC, yes. Creation logic will ignore the @size. See the following code
> in function block_crypto_co_create_luks:
>
> if (luks_opts->header) {
> /* LUKS volume with detached header */
> hdr_bs = bdrv_co_open_blockdev_ref(luks_opts->header, errp);
> if (hdr_bs == NULL) {
> return -EIO;
> }
>
> cflags |= QCRYPTO_BLOCK_CREATE_DETACHED;
>
> /* Format the LUKS header node, here just ignore the size
>   * and passed zero to block_crypto_co_create_generic */
> ret = block_crypto_co_create_generic(hdr_bs, 0, &create_opts,
>  PREALLOC_MODE_OFF, cflags, errp);
> if (ret < 0) {
> goto fail;
> }
>
> /* Format the LUKS payload node */
> if (luks_opts->file) {
> ret = block_crypto_co_format_luks_payload(luks_opts, errp);
> if (ret < 0) {
> goto fail;
> }
> }

@size is a required argument, but silently ignored when @header is
present and @file is absent (2. Create just a detached LUKS header).

Re: [PATCH 0/6] Add ivshmem-flat device

2024-02-27 Thread Markus Armbruster

Gustavo Romero  writes:

[...]

> This patchset introduces a new device, ivshmem-flat, which is similar to the
> current ivshmem device but does not require a PCI bus. It implements the 
> ivshmem
> status and control registers as MMRs and the shared memory as a directly
> accessible memory region in the VM memory layout. It's meant to be used on
> machines like those with Cortex-M MCUs, which usually lack a PCI bus, e.g.,
> lm3s6965evb and mps2-an385. Additionally, it has the benefit of requiring a 
> tiny
> 'device driver,' which is helpful on some RTOSes, like Zephyr, that run on
> memory-constrained resource targets.
>
> The patchset includes a QTest for the ivshmem-flat device, however, it's also
> possible to experiment with it in two ways:
>
> (a) using two Cortex-M VMs running Zephyr; or
> (b) using one aarch64 VM running Linux with the ivshmem PCI device and another
> arm (Cortex-M) VM running Zephyr with the new ivshmem-flat device.
>
> Please note that for running the ivshmem-flat QTests the following patch, 
> which
> is not committed to the tree yet, must be applied:
>
> https://lists.nongnu.org/archive/html/qemu-devel/2023-11/msg03176.html

What problem are you trying to solve with ivshmem?

Shared memory is not a solution to any communication problem, it's
merely a building block for building such solutions: you invariably have
to layer some protocol on top.  What do you intend to put on top of
ivshmem?

[...]

RE: [EXT] Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem multiplier

2024-02-27 Thread Srujana Challa

> Subject: [EXT] Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem
> multiplier
> 
> External Email
> 
> --
> On Fri, Feb 23, 2024 at 10:56:17AM +0530, Srujana Challa wrote:
> > Currently, virtio_pci_queue_mem_mult function always returns 4K when
> > VIRTIO_PCI_FLAG_PAGE_PER_VQ is set. But this won't work for vhost vdpa
> > when host has page size other than 4K.
> > This patch introduces a new property(host-page-per-vq) for vdpa use
> > case to fix the same.
> >
> > Signed-off-by: Srujana Challa 
> 
> Looks good. I'd like to fail realize if both
>(proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) and
>(proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ) so users do not
> start depending on this combination.
Could you confirm whether we can add an assertion for this case in the
virtio_pci_queue_mem_mult() function?

> 
> 
> 
> > ---
> > v2->v3:
> > - Modified property name, page-per-vdpa-vq to host-page-per-vq.
> >
> > v1->v2:
> > - Introduced a new property to get virtqueue mem multiplier for
> >   vdpa use case.
> >
> >  hw/virtio/virtio-pci.c | 10 --
> >  include/hw/virtio/virtio-pci.h |  5 +
> >  2 files changed, 13 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index
> > 1a7039fb0c..f29e60830b 100644
> > --- a/hw/virtio/virtio-pci.c
> > +++ b/hw/virtio/virtio-pci.c
> > @@ -320,8 +320,12 @@ static bool
> > virtio_pci_ioeventfd_enabled(DeviceState *d)
> >
> >  static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy
> > *proxy)  {
> > -return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
> > -QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
> > +if (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
> > +return QEMU_VIRTIO_PCI_QUEUE_MEM_MULT;
> > +else if (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
> > +return qemu_real_host_page_size();
> > +else
> > +return 4;
> >  }
> >
> >  static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier
> > *notifier, @@ -2301,6 +2305,8 @@ static Property virtio_pci_properties[] =
> {
> >  VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
> >  DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
> >  VIRTIO_PCI_FLAG_AER_BIT, false),
> > +DEFINE_PROP_BIT("host-page-per-vq", VirtIOPCIProxy, flags,
> > +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT, false),
> >  DEFINE_PROP_END_OF_LIST(),
> >  };
> >
> > diff --git a/include/hw/virtio/virtio-pci.h
> > b/include/hw/virtio/virtio-pci.h index 59d88018c1..81b6de4291 100644
> > --- a/include/hw/virtio/virtio-pci.h
> > +++ b/include/hw/virtio/virtio-pci.h
> > @@ -43,6 +43,7 @@ enum {
> >  VIRTIO_PCI_FLAG_INIT_FLR_BIT,
> >  VIRTIO_PCI_FLAG_AER_BIT,
> >  VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
> > +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT,
> >  };
> >
> >  /* Need to activate work-arounds for buggy guests at vmstate load. */
> > @@ -89,6 +90,10 @@ enum {  #define
> VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
> >(1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
> >
> > +/* page per vdpa vq flag to be used for vhost vdpa backends */
> > +#define VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ \
> > +(1 << VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT)
> > +
> >  typedef struct {
> >  MSIMessage msg;
> >  int virq;
> > --
> > 2.25.1

RE: [RFC 4/8] hw/core: Add cache topology options in -smp

2024-02-27 Thread JeeHeng Sia




> -Original Message-
> From: Zhao Liu 
> Sent: Tuesday, February 20, 2024 5:25 PM
> To: Daniel P . Berrangé ; Eduardo Habkost 
> ; Marcel Apfelbaum
> ; Philippe Mathieu-Daudé ; 
> Yanan Wang ;
> Michael S . Tsirkin ; Paolo Bonzini ; 
> Richard Henderson ;
> Eric Blake ; Markus Armbruster ; 
> Marcelo Tosatti ; Alex Bennée
> ; Peter Maydell ; Jonathan 
> Cameron ;
> JeeHeng Sia 
> Cc: qemu-devel@nongnu.org; k...@vger.kernel.org; qemu-ri...@nongnu.org; 
> qemu-...@nongnu.org; Zhenyu Wang
> ; Dapeng Mi ; Yongwei Ma 
> ; Zhao Liu
> 
> Subject: [RFC 4/8] hw/core: Add cache topology options in -smp
> 
> From: Zhao Liu 
> 
> Add "l1d-cache", "l1i-cache", "l2-cache", and "l3-cache" options in
> -smp to define the cache topology for SMP system.
> 
> Signed-off-by: Zhao Liu 
> ---
>  hw/core/machine-smp.c | 128 ++
>  hw/core/machine.c |   4 ++
>  qapi/machine.json |  14 -
>  system/vl.c   |  15 +
>  4 files changed, 160 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/core/machine-smp.c b/hw/core/machine-smp.c
> index 8a8296b0d05b..2cbd19f4aa57 100644
> --- a/hw/core/machine-smp.c
> +++ b/hw/core/machine-smp.c
> @@ -61,6 +61,132 @@ static char *cpu_hierarchy_to_string(MachineState *ms)
>  return g_string_free(s, false);
>  }
> 
> +static bool machine_check_topo_support(MachineState *ms,
> +   CPUTopoLevel topo)
> +{
> +MachineClass *mc = MACHINE_GET_CLASS(ms);
> +
> +if (topo == CPU_TOPO_LEVEL_MODULE && !mc->smp_props.modules_supported) {
> +return false;
> +}
> +
> +if (topo == CPU_TOPO_LEVEL_CLUSTER && !mc->smp_props.clusters_supported) 
> {
> +return false;
> +}
> +
> +if (topo == CPU_TOPO_LEVEL_DIE && !mc->smp_props.dies_supported) {
> +return false;
> +}
> +
> +if (topo == CPU_TOPO_LEVEL_BOOK && !mc->smp_props.books_supported) {
> +return false;
> +}
> +
> +if (topo == CPU_TOPO_LEVEL_DRAWER && !mc->smp_props.drawers_supported) {
> +return false;
> +}
> +
> +return true;
> +}
> +
> +static int smp_cache_string_to_topology(MachineState *ms,
> +char *topo_str,
> +CPUTopoLevel *topo,
> +Error **errp)
> +{
> +*topo = string_to_cpu_topo(topo_str);
> +
> +if (*topo == CPU_TOPO_LEVEL_MAX || *topo == CPU_TOPO_LEVEL_INVALID) {
> +error_setg(errp, "Invalid cache topology level: %s. The cache "
> +   "topology should match the CPU topology level", topo_str);
> +return -1;
> +}
> +
> +if (!machine_check_topo_support(ms, *topo)) {
> +error_setg(errp, "Invalid cache topology level: %s. The topology "
> +   "level is not supported by this machine", topo_str);
> +return -1;
> +}
> +
> +return 0;
> +}
> +
> +static void machine_parse_smp_cache_config(MachineState *ms,
> +   const SMPConfiguration *config,
> +   Error **errp)
> +{
> +MachineClass *mc = MACHINE_GET_CLASS(ms);
> +
> +if (config->l1d_cache) {
> +if (!mc->smp_props.l1_separated_cache_supported) {
> +error_setg(errp, "L1 D-cache topology not "
> +   "supported by this machine");
> +return;
> +}
> +
> +if (smp_cache_string_to_topology(ms, config->l1d_cache,
> +&ms->smp_cache.l1d, errp)) {
> +return;
> +}
> +}
> +
> +if (config->l1i_cache) {
> +if (!mc->smp_props.l1_separated_cache_supported) {
> +error_setg(errp, "L1 I-cache topology not "
> +   "supported by this machine");
> +return;
> +}
> +
> +if (smp_cache_string_to_topology(ms, config->l1i_cache,
> +&ms->smp_cache.l1i, errp)) {
> +return;
> +}
> +}
> +
> +if (config->l2_cache) {
> +if (!mc->smp_props.l2_unified_cache_supported) {
> +error_setg(errp, "L2 cache topology not "
> +   "supported by this machine");
> +return;
> +}
> +
> +if (smp_cache_string_to_topology(ms, config->l2_cache,
> +&ms->smp_cache.l2, errp)) {
> +return;
> +}
> +
> +if (ms->smp_cache.l1d > ms->smp_cache.l2 ||
> +ms->smp_cache.l1i > ms->smp_cache.l2) {
> +error_setg(errp, "Invalid L2 cache topology. "
> +   "L2 cache topology level should not be "
> +   "lower than L1 D-cache/L1 I-cache");
> +return;
> +}
> +}
> +
> +if (config->l3_cache) {
> +if (!mc->smp_props.l2_unified_cache_supported) {
> +error_setg(errp, "L3 cache topology not "
> +   "supported by this machine");
> +return;
> +}
> +
> +

Re: [PATCH v7 1/2] qom: new object to associate device to numa node

2024-02-27 Thread Ankit Agrawal

>> diff --git a/include/hw/acpi/acpi-generic-initiator.h 
>> b/include/hw/acpi/acpi-generic-initiator.h
>> new file mode 100644
>> index 00..2f183b029a
>> --- /dev/null
>> +++ b/include/hw/acpi/acpi-generic-initiator.h
>
>> +typedef struct AcpiGenericInitiatorClass {
>> +    ObjectClass parent_class;
>Too indented.

Yes, will fix it.

>> +} AcpiGenericInitiatorClass;

[PULL 21/25] migration: update cpr-reboot description

2024-02-27 Thread peterx

From: Steve Sistare 

Clarify qapi for cpr-reboot migration mode, and add vfio support.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Link: 
https://lore.kernel.org/r/1708622920-68779-14-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 qapi/migration.json | 35 ++-
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 7303e57e8e..bee5e71fe3 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -636,19 +636,28 @@
 #
 # @normal: the original form of migration. (since 8.2)
 #
-# @cpr-reboot: The migrate command saves state to a file, allowing one to
-#  quit qemu, reboot to an updated kernel, and restart an updated
-#  version of qemu.  The caller must specify a migration URI
-#  that writes to and reads from a file.  Unlike normal mode,
-#  the use of certain local storage options does not block the
-#  migration, but the caller must not modify guest block devices
-#  between the quit and restart.  To avoid saving guest RAM to the
-#  file, the memory backend must be shared, and the 
@x-ignore-shared
-#  migration capability must be set.  Guest RAM must be 
non-volatile
-#  across reboot, such as by backing it with a dax device, but this
-#  is not enforced.  The restarted qemu arguments must match those
-#  used to initially start qemu, plus the -incoming option.
-#  (since 8.2)
+# @cpr-reboot: The migrate command stops the VM and saves state to the URI.
+# After quitting qemu, the user resumes by running qemu -incoming.
+#
+# This mode allows the user to quit qemu, and restart an updated version
+# of qemu.  The user may even update and reboot the OS before restarting,
+# as long as the URI persists across a reboot.
+#
+# Unlike normal mode, the use of certain local storage options does not
+# block the migration, but the user must not modify guest block devices
+# between the quit and restart.
+#
+# This mode supports vfio devices provided the user first puts the guest
+# in the suspended runstate, such as by issuing guest-suspend-ram to the
+# qemu guest agent.
+#
+# Best performance is achieved when the memory backend is shared and the
+# @x-ignore-shared migration capability is set, but this is not required.
+# Further, if the user reboots before restarting such a configuration, the
+# shared backend must be non-volatile across reboot, such as by backing
+# it with a dax device.
+#
+# (since 8.2)
 ##
 { 'enum': 'MigMode',
   'data': [ 'normal', 'cpr-reboot' ] }
-- 
2.43.0

[PULL 17/25] migration: per-mode notifiers

2024-02-27 Thread peterx

From: Steve Sistare 

Keep a separate list of migration notifiers for each migration mode.

Suggested-by: Peter Xu 
Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-8-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h |  6 ++
 migration/migration.c| 22 +-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index e36a1f3ec4..4dc06a92b7 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -86,6 +86,12 @@ typedef int (*MigrationNotifyFunc)(NotifierWithReturn 
*notify,
 void migration_add_notifier(NotifierWithReturn *notify,
 MigrationNotifyFunc func);
 
+/*
+ * Same as migration_add_notifier, but applies to the specified @mode.
+ */
+void migration_add_notifier_mode(NotifierWithReturn *notify,
+ MigrationNotifyFunc func, MigMode mode);
+
 void migration_remove_notifier(NotifierWithReturn *notify);
 void migration_call_notifiers(MigrationState *s, MigrationEventType type);
 bool migration_in_setup(MigrationState *);
diff --git a/migration/migration.c b/migration/migration.c
index 33149c462c..925103b61a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -69,8 +69,13 @@
 #include "qemu/sockets.h"
 #include "sysemu/kvm.h"
 
-static NotifierWithReturnList migration_state_notifiers =
-NOTIFIER_WITH_RETURN_LIST_INITIALIZER(migration_state_notifiers);
+#define NOTIFIER_ELEM_INIT(array, elem)\
+[elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
+
+static NotifierWithReturnList migration_state_notifiers[] = {
+NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
+NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
+};
 
 /* Messages sent on the return path from destination to source */
 enum mig_rp_message_type {
@@ -1463,11 +1468,17 @@ static void migrate_fd_cancel(MigrationState *s)
 }
 }
 
+void migration_add_notifier_mode(NotifierWithReturn *notify,
+ MigrationNotifyFunc func, MigMode mode)
+{
+notify->notify = (NotifierWithReturnFunc)func;
+notifier_with_return_list_add(&migration_state_notifiers[mode], notify);
+}
+
 void migration_add_notifier(NotifierWithReturn *notify,
 MigrationNotifyFunc func)
 {
-notify->notify = (NotifierWithReturnFunc)func;
-notifier_with_return_list_add(&migration_state_notifiers, notify);
+migration_add_notifier_mode(notify, func, MIG_MODE_NORMAL);
 }
 
 void migration_remove_notifier(NotifierWithReturn *notify)
@@ -1480,10 +1491,11 @@ void migration_remove_notifier(NotifierWithReturn 
*notify)
 
 void migration_call_notifiers(MigrationState *s, MigrationEventType type)
 {
+MigMode mode = s->parameters.mode;
 MigrationEvent e;
 
 e.type = type;
-notifier_with_return_list_notify(&migration_state_notifiers, &e, 0);
+notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 0);
 }
 
 bool migration_in_setup(MigrationState *s)
-- 
2.43.0

[PULL 13/25] migration: convert to NotifierWithReturn

2024-02-27 Thread peterx

From: Steve Sistare 

Change all migration notifiers to type NotifierWithReturn, so notifiers
can return an error status in a future patch.  For now, pass NULL for the
notifier error parameter, and do not check the return value.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-4-git-send-email-steven.sist...@oracle.com
[peterx: dropped unexpected update to roms/seabios-hppa]
Signed-off-by: Peter Xu 
---
 include/hw/vfio/vfio-common.h  |  2 +-
 include/hw/virtio/virtio-net.h |  2 +-
 include/migration/misc.h   |  6 +++---
 include/qemu/notify.h  |  1 +
 hw/net/virtio-net.c|  4 +++-
 hw/vfio/migration.c|  4 +++-
 migration/migration.c  | 16 
 net/vhost-vdpa.c   |  6 --
 ui/spice-core.c|  8 +---
 9 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b7ef7d02b..4a6c262f77 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -62,7 +62,7 @@ typedef struct VFIORegion {
 typedef struct VFIOMigration {
 struct VFIODevice *vbasedev;
 VMChangeStateEntry *vm_state;
-Notifier migration_state;
+NotifierWithReturn migration_state;
 uint32_t device_state;
 int data_fd;
 void *data_buffer;
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index 55977f01f0..eaee8f4243 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -221,7 +221,7 @@ struct VirtIONet {
 DeviceListener primary_listener;
 QDict *primary_opts;
 bool primary_opts_from_json;
-Notifier migration_state;
+NotifierWithReturn migration_state;
 VirtioNetRssData rss_data;
 struct NetRxPkt *rx_pkt;
 struct EBPFRSSContext ebpf_rss;
diff --git a/include/migration/misc.h b/include/migration/misc.h
index 5e65c18f1a..b62e351d96 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,9 +60,9 @@ void migration_object_init(void);
 void migration_shutdown(void);
 bool migration_is_idle(void);
 bool migration_is_active(MigrationState *);
-void migration_add_notifier(Notifier *notify,
-void (*func)(Notifier *notifier, void *data));
-void migration_remove_notifier(Notifier *notify);
+void migration_add_notifier(NotifierWithReturn *notify,
+NotifierWithReturnFunc func);
+void migration_remove_notifier(NotifierWithReturn *notify);
 void migration_call_notifiers(MigrationState *s);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
diff --git a/include/qemu/notify.h b/include/qemu/notify.h
index 9a85631864..abf18dbf59 100644
--- a/include/qemu/notify.h
+++ b/include/qemu/notify.h
@@ -45,6 +45,7 @@ bool notifier_list_empty(NotifierList *list);
 /* Same as Notifier but allows .notify() to return errors */
 typedef struct NotifierWithReturn NotifierWithReturn;
 
+/* Return int to allow for different failure modes and recovery actions */
 typedef int (*NotifierWithReturnFunc)(NotifierWithReturn *notifier, void *data,
   Error **errp);
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 5a79bc3a3a..75f4e8664d 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3534,11 +3534,13 @@ static void 
virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s)
 }
 }
 
-static void virtio_net_migration_state_notifier(Notifier *notifier, void *data)
+static int virtio_net_migration_state_notifier(NotifierWithReturn *notifier,
+   void *data, Error **errp)
 {
 MigrationState *s = data;
 VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
 virtio_net_handle_migration_primary(n, s);
+return 0;
 }
 
 static bool failover_hide_primary_device(DeviceListener *listener,
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 70e6b1a709..6b6acc4140 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -754,7 +754,8 @@ static void vfio_vmstate_change(void *opaque, bool running, 
RunState state)
   mig_state_to_str(new_state));
 }
 
-static void vfio_migration_state_notifier(Notifier *notifier, void *data)
+static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
+ void *data, Error **errp)
 {
 MigrationState *s = data;
 VFIOMigration *migration = container_of(notifier, VFIOMigration,
@@ -770,6 +771,7 @@ static void vfio_migration_state_notifier(Notifier 
*notifier, void *data)
 case MIGRATION_STATUS_FAILED:
 vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
 }
+return 0;
 }
 
 static void vfio_migration_free(VFIODevice *vbasedev)
diff --git a/migration/migration.c b/migration/migration.c
index

[PULL 25/25] migration: Use migrate_has_error() in close_return_path_on_source()

2024-02-27 Thread peterx

From: Cédric Le Goater 

close_return_path_on_source() retrieves the migration error from the
QEMUFile '->to_dst_file' to know if a shutdown is required. This
shutdown is required to exit the return-path thread.

Avoid relying on '->to_dst_file' and use migrate_has_error() instead.

(using to_dst_file is a heuristic to infer whether
rp_state.from_dst_file might be stuck on a recvmsg(). Using a generic
method for detecting errors is more reliable. We also want to reduce
dependency on QEMUFile::last_error)

Suggested-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
Reviewed-by: Peter Xu 
[added some words about the motivation for this patch]
Signed-off-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240226203122.22894-3-faro...@suse.de
Signed-off-by: Peter Xu 
---
 migration/migration.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 7ba2b60e46..bab68bcbef 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2429,8 +2429,7 @@ static bool close_return_path_on_source(MigrationState 
*ms)
  * cause it to unblock if it's stuck waiting for the destination.
  */
 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
-if (ms->to_dst_file && ms->rp_state.from_dst_file &&
-qemu_file_get_error(ms->to_dst_file)) {
+if (migrate_has_error(ms) && ms->rp_state.from_dst_file) {
 qemu_file_shutdown(ms->rp_state.from_dst_file);
 }
 }
-- 
2.43.0

[PULL 16/25] migration: MigrationNotifyFunc

2024-02-27 Thread peterx

From: Steve Sistare 

Define MigrationNotifyFunc to improve type safety and simplify migration
notifiers.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-7-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h | 5 -
 hw/net/virtio-net.c  | 4 +---
 hw/vfio/migration.c  | 3 +--
 migration/migration.c| 4 ++--
 net/vhost-vdpa.c | 6 ++
 ui/spice-core.c  | 4 +---
 6 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index e6150009e0..e36a1f3ec4 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -72,6 +72,9 @@ typedef struct MigrationEvent {
 MigrationEventType type;
 } MigrationEvent;
 
+typedef int (*MigrationNotifyFunc)(NotifierWithReturn *notify,
+   MigrationEvent *e, Error **errp);
+
 /*
  * Register the notifier @notify to be called when a migration event occurs
  * for MIG_MODE_NORMAL, as specified by the MigrationEvent passed to @func.
@@ -81,7 +84,7 @@ typedef struct MigrationEvent {
  *- MIG_EVENT_PRECOPY_FAILED
  */
 void migration_add_notifier(NotifierWithReturn *notify,
-NotifierWithReturnFunc func);
+MigrationNotifyFunc func);
 
 void migration_remove_notifier(NotifierWithReturn *notify);
 void migration_call_notifiers(MigrationState *s, MigrationEventType type);
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e803f98c3a..a3c711b56d 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3535,10 +3535,8 @@ static void 
virtio_net_handle_migration_primary(VirtIONet *n, MigrationEvent *e)
 }
 
 static int virtio_net_migration_state_notifier(NotifierWithReturn *notifier,
-   void *data, Error **errp)
+   MigrationEvent *e, Error **errp)
 {
-MigrationEvent *e = data;
-
 VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
 virtio_net_handle_migration_primary(n, e);
 return 0;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 869d8417d6..50140eda87 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -755,9 +755,8 @@ static void vfio_vmstate_change(void *opaque, bool running, 
RunState state)
 }
 
 static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
- void *data, Error **errp)
+ MigrationEvent *e, Error **errp)
 {
-MigrationEvent *e = data;
 VFIOMigration *migration = container_of(notifier, VFIOMigration,
 migration_state);
 VFIODevice *vbasedev = migration->vbasedev;
diff --git a/migration/migration.c b/migration/migration.c
index 8f7f2d92f4..33149c462c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1464,9 +1464,9 @@ static void migrate_fd_cancel(MigrationState *s)
 }
 
 void migration_add_notifier(NotifierWithReturn *notify,
-NotifierWithReturnFunc func)
+MigrationNotifyFunc func)
 {
-notify->notify = func;
+notify->notify = (NotifierWithReturnFunc)func;
 notifier_with_return_list_add(&migration_state_notifiers, notify);
 }
 
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index a29d18a9ef..e6bdb4562d 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -323,11 +323,9 @@ static void 
vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
 }
 
 static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier,
- void *data, Error **errp)
+ MigrationEvent *e, Error **errp)
 {
-MigrationEvent *e = data;
-VhostVDPAState *s = container_of(notifier, VhostVDPAState,
- migration_state);
+VhostVDPAState *s = container_of(notifier, VhostVDPAState, 
migration_state);
 
 if (e->type == MIG_EVENT_PRECOPY_SETUP) {
 vhost_vdpa_net_log_global_enable(s, true);
diff --git a/ui/spice-core.c b/ui/spice-core.c
index 0a59876da2..15be640286 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
@@ -569,10 +569,8 @@ static SpiceInfo *qmp_query_spice_real(Error **errp)
 }
 
 static int migration_state_notifier(NotifierWithReturn *notifier,
-void *data, Error **errp)
+MigrationEvent *e, Error **errp)
 {
-MigrationEvent *e = data;
-
 if (!spice_have_target_host) {
 return 0;
 }
-- 
2.43.0

[PULL 22/25] migration: options incompatible with cpr

2024-02-27 Thread peterx

From: Steve Sistare 

Fail the migration request if options are set that are incompatible
with cpr.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Link: 
https://lore.kernel.org/r/1708622920-68779-15-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 qapi/migration.json   |  2 ++
 migration/migration.c | 17 +
 2 files changed, 19 insertions(+)

diff --git a/qapi/migration.json b/qapi/migration.json
index bee5e71fe3..0b33a71ab4 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -657,6 +657,8 @@
 # shared backend must be non-volatile across reboot, such as by backing
 # it with a dax device.
 #
+# cpr-reboot may not be used with postcopy, colo, or background-snapshot.
+#
 # (since 8.2)
 ##
 { 'enum': 'MigMode',
diff --git a/migration/migration.c b/migration/migration.c
index 90a90947fb..7652fd4d14 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1953,6 +1953,23 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
 return false;
 }
 
+if (migrate_mode_is_cpr(s)) {
+const char *conflict = NULL;
+
+if (migrate_postcopy()) {
+conflict = "postcopy";
+} else if (migrate_background_snapshot()) {
+conflict = "background snapshot";
+} else if (migrate_colo()) {
+conflict = "COLO";
+}
+
+if (conflict) {
+error_setg(errp, "Cannot use %s with CPR", conflict);
+return false;
+}
+}
+
 if (blk || blk_inc) {
 if (migrate_colo()) {
 error_setg(errp, "No disk migration is required in COLO mode");
-- 
2.43.0

[PULL 19/25] migration: notifier error checking

2024-02-27 Thread peterx

From: Steve Sistare 

Check the status returned by migration notifiers for event type
MIG_EVENT_PRECOPY_SETUP, and report errors.  None of the notifiers
return an error status at this time.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Link: 
https://lore.kernel.org/r/1708622920-68779-10-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h |  8 +++-
 migration/migration.c| 25 -
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 4dc06a92b7..e4933b815b 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -72,6 +72,11 @@ typedef struct MigrationEvent {
 MigrationEventType type;
 } MigrationEvent;
 
+/*
+ * A MigrationNotifyFunc may return an error code and an Error object,
+ * but only when @e->type is MIG_EVENT_PRECOPY_SETUP.  The code is an int
+ * to allow for different failure modes and recovery actions.
+ */
 typedef int (*MigrationNotifyFunc)(NotifierWithReturn *notify,
MigrationEvent *e, Error **errp);
 
@@ -93,7 +98,8 @@ void migration_add_notifier_mode(NotifierWithReturn *notify,
  MigrationNotifyFunc func, MigMode mode);
 
 void migration_remove_notifier(NotifierWithReturn *notify);
-void migration_call_notifiers(MigrationState *s, MigrationEventType type);
+int migration_call_notifiers(MigrationState *s, MigrationEventType type,
+ Error **errp);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
diff --git a/migration/migration.c b/migration/migration.c
index 6a115d28b8..37c836b0b0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1376,7 +1376,7 @@ static void migrate_fd_cleanup(MigrationState *s)
 }
 type = migration_has_failed(s) ? MIG_EVENT_PRECOPY_FAILED :
  MIG_EVENT_PRECOPY_DONE;
-migration_call_notifiers(s, type);
+migration_call_notifiers(s, type, NULL);
 block_cleanup_parameters();
 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
 }
@@ -1489,13 +1489,18 @@ void migration_remove_notifier(NotifierWithReturn 
*notify)
 }
 }
 
-void migration_call_notifiers(MigrationState *s, MigrationEventType type)
+int migration_call_notifiers(MigrationState *s, MigrationEventType type,
+ Error **errp)
 {
 MigMode mode = s->parameters.mode;
 MigrationEvent e;
+int ret;
 
 e.type = type;
-notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 0);
+ret = notifier_with_return_list_notify(&migration_state_notifiers[mode],
+   &e, errp);
+assert(!ret || type == MIG_EVENT_PRECOPY_SETUP);
+return ret;
 }
 
 bool migration_in_setup(MigrationState *s)
@@ -2549,7 +2554,7 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
  * at the transition to postcopy and after the device state; in particular
  * spice needs to trigger a transition now
  */
-migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
+migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, NULL);
 
 migration_downtime_end(ms);
 
@@ -2569,11 +2574,10 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
 
 ret = qemu_file_get_error(ms->to_dst_file);
 if (ret) {
-error_setg(errp, "postcopy_start: Migration stream errored");
-migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
-  MIGRATION_STATUS_FAILED);
+error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
+bql_lock();
+goto fail;
 }
-
 trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
 
 return ret;
@@ -2594,6 +2598,7 @@ fail:
 error_report_err(local_err);
 }
 }
+migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
 bql_unlock();
 return -1;
 }
@@ -3613,7 +3618,9 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 rate_limit = migrate_max_bandwidth();
 
 /* Notify before starting migration thread */
-migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
+if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, &local_err)) {
+goto fail;
+}
 }
 
 migration_rate_set(rate_limit);
-- 
2.43.0

[PULL 24/25] migration: Join the return path thread before releasing to_dst_file

2024-02-27 Thread peterx

From: Fabiano Rosas 

The return path thread might hang at a blocking system call. Before
joining the thread we might need to issue a shutdown() on the socket
file descriptor to release it. To determine whether the shutdown() is
necessary we look at the QEMUFile error.

Make sure we only clean up the QEMUFile after the return path has been
waited for.

This fixes a hang when qemu_savevm_state_setup() produced an error
that was detected by migration_detect_error(). That skips
migration_completion() so close_return_path_on_source() would get
stuck waiting for the RP thread to terminate.

Reported-by: Cédric Le Goater 
Tested-by: Cédric Le Goater 
Signed-off-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240226203122.22894-2-faro...@suse.de
Signed-off-by: Peter Xu 
---
 migration/migration.c | 22 +-
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index ccb13fa94a..7ba2b60e46 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1342,6 +1342,8 @@ static void migrate_fd_cleanup(MigrationState *s)
 
 qemu_savevm_state_cleanup();
 
+close_return_path_on_source(s);
+
 if (s->to_dst_file) {
 QEMUFile *tmp;
 
@@ -1366,12 +1368,6 @@ static void migrate_fd_cleanup(MigrationState *s)
 qemu_fclose(tmp);
 }
 
-/*
- * We already cleaned up to_dst_file, so errors from the return
- * path might be due to that, ignore them.
- */
-close_return_path_on_source(s);
-
 assert(!migration_is_active(s));
 
 if (s->state == MIGRATION_STATUS_CANCELLING) {
@@ -2914,6 +2910,13 @@ static MigThrError postcopy_pause(MigrationState *s)
 while (true) {
 QEMUFile *file;
 
+/*
+ * We're already pausing, so ignore any errors on the return
+ * path and just wait for the thread to finish. It will be
+ * re-created when we resume.
+ */
+close_return_path_on_source(s);
+
 /*
  * Current channel is possibly broken. Release it.  Note that this is
  * guaranteed even without lock because to_dst_file should only be
@@ -2933,13 +2936,6 @@ static MigThrError postcopy_pause(MigrationState *s)
 qemu_file_shutdown(file);
 qemu_fclose(file);
 
-/*
- * We're already pausing, so ignore any errors on the return
- * path and just wait for the thread to finish. It will be
- * re-created when we resume.
- */
-close_return_path_on_source(s);
-
 migrate_set_state(>state, s->state,
   MIGRATION_STATUS_POSTCOPY_PAUSED);
 
-- 
2.43.0

[PULL 15/25] migration: remove postcopy_after_devices

2024-02-27 Thread peterx

From: Steve Sistare 

postcopy_after_devices and migration_in_postcopy_after_devices are no
longer used, so delete them.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Link: 
https://lore.kernel.org/r/1708622920-68779-6-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h | 1 -
 migration/migration.h| 2 --
 migration/migration.c| 7 ---
 3 files changed, 10 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 9e4abae97f..e6150009e0 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -89,7 +89,6 @@ bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
 /* ...and after the device transmission */
-bool migration_in_postcopy_after_devices(MigrationState *);
 /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */
 bool migration_in_incoming_postcopy(void);
 /* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */
diff --git a/migration/migration.h b/migration/migration.h
index f2c8b8f286..aef8afbe1f 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -348,8 +348,6 @@ struct MigrationState {
 
 /* Flag set once the migration has been asked to enter postcopy */
 bool start_postcopy;
-/* Flag set after postcopy has sent the device state */
-bool postcopy_after_devices;
 
 /* Flag set once the migration thread is running (and needs joining) */
 bool migration_thread_running;
diff --git a/migration/migration.c b/migration/migration.c
index 4650c21f67..8f7f2d92f4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1527,11 +1527,6 @@ bool migration_postcopy_is_alive(int state)
 }
 }
 
-bool migration_in_postcopy_after_devices(MigrationState *s)
-{
-return migration_in_postcopy() && s->postcopy_after_devices;
-}
-
 bool migration_in_incoming_postcopy(void)
 {
 PostcopyState ps = postcopy_state_get();
@@ -1613,7 +1608,6 @@ int migrate_init(MigrationState *s, Error **errp)
 s->expected_downtime = 0;
 s->setup_time = 0;
 s->start_postcopy = false;
-s->postcopy_after_devices = false;
 s->migration_thread_running = false;
 error_free(s->error);
 s->error = NULL;
@@ -2543,7 +2537,6 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
  * at the transition to postcopy and after the device state; in particular
  * spice needs to trigger a transition now
  */
-ms->postcopy_after_devices = true;
 migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
 
 migration_downtime_end(ms);
-- 
2.43.0

[PULL 20/25] migration: stop vm for cpr

2024-02-27 Thread peterx

From: Steve Sistare 

When migration for cpr is initiated, stop the vm and set state
RUN_STATE_FINISH_MIGRATE before ram is saved.  This eliminates the
possibility of ram and device state being out of sync, and guarantees
that a guest in the suspended state remains suspended, because qmp_cont
rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Link: 
https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h |  1 +
 migration/migration.h|  2 --
 migration/migration.c| 51 
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index e4933b815b..5d1aa593ed 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,6 +60,7 @@ void migration_object_init(void);
 void migration_shutdown(void);
 bool migration_is_idle(void);
 bool migration_is_active(MigrationState *);
+bool migrate_mode_is_cpr(MigrationState *);
 
 typedef enum MigrationEventType {
 MIG_EVENT_PRECOPY_SETUP,
diff --git a/migration/migration.h b/migration/migration.h
index aef8afbe1f..65c0b61cbd 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
  */
 void migration_rp_kick(MigrationState *s);
 
-int migration_stop_vm(RunState state);
-
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 37c836b0b0..90a90947fb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -167,11 +167,19 @@ static gint page_request_addr_cmp(gconstpointer ap, 
gconstpointer bp)
 return (a > b) - (a < b);
 }
 
-int migration_stop_vm(RunState state)
+static int migration_stop_vm(MigrationState *s, RunState state)
 {
-int ret = vm_stop_force_state(state);
+int ret;
+
+migration_downtime_start(s);
+
+s->vm_old_state = runstate_get();
+global_state_store();
+
+ret = vm_stop_force_state(state);
 
 trace_vmstate_downtime_checkpoint("src-vm-stopped");
+trace_migration_completion_vm_stop(ret);
 
 return ret;
 }
@@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s)
 s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
 }
 
+bool migrate_mode_is_cpr(MigrationState *s)
+{
+return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+}
+
 int migrate_init(MigrationState *s, Error **errp)
 {
 int ret;
@@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
 bql_lock();
 trace_postcopy_start_set_run();
 
-migration_downtime_start(ms);
-
-global_state_store();
-ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
 if (ret < 0) {
 goto fail;
 }
@@ -2652,15 +2662,12 @@ static int migration_completion_precopy(MigrationState 
*s,
 int ret;
 
 bql_lock();
-migration_downtime_start(s);
-
-s->vm_old_state = runstate_get();
-global_state_store();
 
-ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
-trace_migration_completion_vm_stop(ret);
-if (ret < 0) {
-goto out_unlock;
+if (!migrate_mode_is_cpr(s)) {
+ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+if (ret < 0) {
+goto out_unlock;
+}
 }
 
 ret = migration_maybe_pause(s, current_active_state,
@@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque)
 s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
 
 trace_migration_thread_setup_complete();
-migration_downtime_start(s);
 
 bql_lock();
 
-s->vm_old_state = runstate_get();
-
-global_state_store();
-/* Forcibly stop VM before saving state of vCPUs and devices */
-if (migration_stop_vm(RUN_STATE_PAUSED)) {
+if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
 goto fail;
 }
 /*
@@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 Error *local_err = NULL;
 uint64_t rate_limit;
 bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+int ret;
 
 /*
  * If there's a previous error, free it and prepare for another one.
@@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 return;
 }
 
+if (migrate_mode_is_cpr(s)) {
+ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+if (ret < 0) {
+error_setg(_err, "migration_stop_vm failed, error %d", -ret);
+goto fail;
+}
+}
+
 if (migrate_background_snapshot()) {
 qemu_thread_create(>thread, "bg_snapshot",
 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
-- 
2.43.0

[PULL 23/25] migration: Fix qmp_query_migrate mbps value

2024-02-27 Thread peterx

From: Fabiano Rosas 

The QMP command query_migrate might see incorrect throughput numbers
if it runs after we've set the migration completion status but before
migration_calculate_complete() has updated s->total_time and s->mbps.

The migration status would show COMPLETED, but the throughput value
would be the one from the last iteration and not the one from the
whole migration. This will usually be a larger value due to the time
period being smaller (one iteration).

Move migration_calculate_complete() earlier so that the status
MIGRATION_STATUS_COMPLETED is only emitted after the final counters
update. Keep everything under the BQL so the QMP thread sees the
updates as atomic.

Rename migration_calculate_complete to migration_completion_end to
reflect its new purpose of also updating s->state.

Signed-off-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240226143335.14282-1-faro...@suse.de
Signed-off-by: Peter Xu 
---
 migration/migration.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 7652fd4d14..ccb13fa94a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -107,6 +107,7 @@ static int migration_maybe_pause(MigrationState *s,
  int new_state);
 static void migrate_fd_cancel(MigrationState *s);
 static bool close_return_path_on_source(MigrationState *s);
+static void migration_completion_end(MigrationState *s);
 
 static void migration_downtime_start(MigrationState *s)
 {
@@ -2787,8 +2788,7 @@ static void migration_completion(MigrationState *s)
 migrate_set_state(>state, MIGRATION_STATUS_ACTIVE,
   MIGRATION_STATUS_COLO);
 } else {
-migrate_set_state(>state, current_active_state,
-  MIGRATION_STATUS_COMPLETED);
+migration_completion_end(s);
 }
 
 return;
@@ -2825,8 +2825,7 @@ static void bg_migration_completion(MigrationState *s)
 goto fail;
 }
 
-migrate_set_state(>state, current_active_state,
-  MIGRATION_STATUS_COMPLETED);
+migration_completion_end(s);
 return;
 
 fail:
@@ -3028,18 +3027,28 @@ static MigThrError 
migration_detect_error(MigrationState *s)
 }
 }
 
-static void migration_calculate_complete(MigrationState *s)
+static void migration_completion_end(MigrationState *s)
 {
 uint64_t bytes = migration_transferred_bytes();
 int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 int64_t transfer_time;
 
+/*
+ * Take the BQL here so that query-migrate on the QMP thread sees:
+ * - atomic update of s->total_time and s->mbps;
+ * - correct ordering of s->mbps update vs. s->state;
+ */
+bql_lock();
 migration_downtime_end(s);
 s->total_time = end_time - s->start_time;
 transfer_time = s->total_time - s->setup_time;
 if (transfer_time) {
 s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
 }
+
+migrate_set_state(>state, s->state,
+  MIGRATION_STATUS_COMPLETED);
+bql_unlock();
 }
 
 static void update_iteration_initial_status(MigrationState *s)
@@ -3186,7 +3195,6 @@ static void migration_iteration_finish(MigrationState *s)
 bql_lock();
 switch (s->state) {
 case MIGRATION_STATUS_COMPLETED:
-migration_calculate_complete(s);
 runstate_set(RUN_STATE_POSTMIGRATE);
 break;
 case MIGRATION_STATUS_COLO:
@@ -3230,9 +3238,6 @@ static void bg_migration_iteration_finish(MigrationState 
*s)
 bql_lock();
 switch (s->state) {
 case MIGRATION_STATUS_COMPLETED:
-migration_calculate_complete(s);
-break;
-
 case MIGRATION_STATUS_ACTIVE:
 case MIGRATION_STATUS_FAILED:
 case MIGRATION_STATUS_CANCELLED:
-- 
2.43.0

[PULL 18/25] migration: refactor migrate_fd_connect failures

2024-02-27 Thread peterx

From: Steve Sistare 

Move common code for the error path in migrate_fd_connect to a shared
fail label.  No functional change.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-9-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 migration/migration.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 925103b61a..6a115d28b8 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3627,11 +3627,7 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 if (migrate_postcopy_ram() || migrate_return_path()) {
 if (open_return_path_on_source(s)) {
 error_setg(_err, "Unable to open return-path for postcopy");
-migrate_set_state(>state, s->state, MIGRATION_STATUS_FAILED);
-migrate_set_error(s, local_err);
-error_report_err(local_err);
-migrate_fd_cleanup(s);
-return;
+goto fail;
 }
 }
 
@@ -3660,6 +3656,13 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 migration_thread, s, QEMU_THREAD_JOINABLE);
 }
 s->migration_thread_running = true;
+return;
+
+fail:
+migrate_set_error(s, local_err);
+migrate_set_state(>state, s->state, MIGRATION_STATUS_FAILED);
+error_report_err(local_err);
+migrate_fd_cleanup(s);
 }
 
 static void migration_class_init(ObjectClass *klass, void *data)
-- 
2.43.0

[PULL 12/25] migration: remove error from notifier data

2024-02-27 Thread peterx

From: Steve Sistare 

Remove the error object from opaque data passed to notifiers.
Use the new error parameter passed to the notifier instead.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-3-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h | 1 -
 migration/postcopy-ram.h | 1 -
 hw/virtio/vhost-user.c   | 8 
 migration/postcopy-ram.c | 1 -
 migration/ram.c  | 1 -
 5 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 1bc8902e6d..5e65c18f1a 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -31,7 +31,6 @@ typedef enum PrecopyNotifyReason {
 
 typedef struct PrecopyNotifyData {
 enum PrecopyNotifyReason reason;
-Error **errp;
 } PrecopyNotifyData;
 
 void precopy_infrastructure_init(void);
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
index 442ab89752..ecae941211 100644
--- a/migration/postcopy-ram.h
+++ b/migration/postcopy-ram.h
@@ -128,7 +128,6 @@ enum PostcopyNotifyReason {
 
 struct PostcopyNotifyData {
 enum PostcopyNotifyReason reason;
-Error **errp;
 };
 
 void postcopy_add_notifier(NotifierWithReturn *nn);
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index f502345f37..a1eea8547e 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2096,20 +2096,20 @@ static int 
vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
 if (!virtio_has_feature(dev->protocol_features,
 VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
 /* TODO: Get the device name into this error somehow */
-error_setg(pnd->errp,
+error_setg(errp,
"vhost-user backend not capable of postcopy");
 return -ENOENT;
 }
 break;
 
 case POSTCOPY_NOTIFY_INBOUND_ADVISE:
-return vhost_user_postcopy_advise(dev, pnd->errp);
+return vhost_user_postcopy_advise(dev, errp);
 
 case POSTCOPY_NOTIFY_INBOUND_LISTEN:
-return vhost_user_postcopy_listen(dev, pnd->errp);
+return vhost_user_postcopy_listen(dev, errp);
 
 case POSTCOPY_NOTIFY_INBOUND_END:
-return vhost_user_postcopy_end(dev, pnd->errp);
+return vhost_user_postcopy_end(dev, errp);
 
 default:
 /* We ignore notifications we don't know */
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 3ab2f6b8fd..0273dc6a94 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -77,7 +77,6 @@ int postcopy_notify(enum PostcopyNotifyReason reason, Error 
**errp)
 {
 struct PostcopyNotifyData pnd;
 pnd.reason = reason;
-pnd.errp = errp;
 
 return notifier_with_return_list_notify(_notifier_list,
 , errp);
diff --git a/migration/ram.c b/migration/ram.c
index 5b6b09edd9..45a00b45ed 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -426,7 +426,6 @@ int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 {
 PrecopyNotifyData pnd;
 pnd.reason = reason;
-pnd.errp = errp;
 
 return notifier_with_return_list_notify(_notifier_list, , 
errp);
 }
-- 
2.43.0

[PULL 05/25] migration/multifd: Release recv sem_sync earlier

2024-02-27 Thread peterx

From: Fabiano Rosas 

Now that multifd_recv_terminate_threads() is called only once, release
the recv side sem_sync earlier like we do for the send side.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Link: https://lore.kernel.org/r/20240220224138.24759-6-faro...@suse.de
Signed-off-by: Peter Xu 
---
 migration/multifd.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index fba00b9e8f..43f0820996 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -1104,6 +1104,12 @@ static void multifd_recv_terminate_threads(Error *err)
 for (i = 0; i < migrate_multifd_channels(); i++) {
 MultiFDRecvParams *p = _recv_state->params[i];
 
+/*
+ * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
+ * however try to wakeup it without harm in cleanup phase.
+ */
+qemu_sem_post(>sem_sync);
+
 /*
  * We could arrive here for two reasons:
  *  - normal quit, i.e. everything went fine, just finished
@@ -1162,12 +1168,6 @@ void multifd_recv_cleanup(void)
 for (i = 0; i < migrate_multifd_channels(); i++) {
 MultiFDRecvParams *p = _recv_state->params[i];
 
-/*
- * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
- * however try to wakeup it without harm in cleanup phase.
- */
-qemu_sem_post(>sem_sync);
-
 if (p->thread_created) {
 qemu_thread_join(>thread);
 }
-- 
2.43.0

[PULL 14/25] migration: MigrationEvent for notifiers

2024-02-27 Thread peterx

From: Steve Sistare 

Passing MigrationState to notifiers is unsound because they could access
unstable migration state internals or even modify the state.  Instead, pass
the minimal info needed in a new MigrationEvent struct, which could be
extended in the future if needed.

Suggested-by: Peter Xu 
Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-5-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/migration/misc.h | 23 ++-
 hw/net/virtio-net.c  | 11 ++-
 hw/vfio/migration.c  | 10 +++---
 migration/migration.c| 17 -
 net/vhost-vdpa.c |  6 +++---
 ui/spice-core.c  |  9 -
 hw/vfio/trace-events |  2 +-
 7 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index b62e351d96..9e4abae97f 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,10 +60,31 @@ void migration_object_init(void);
 void migration_shutdown(void);
 bool migration_is_idle(void);
 bool migration_is_active(MigrationState *);
+
+typedef enum MigrationEventType {
+MIG_EVENT_PRECOPY_SETUP,
+MIG_EVENT_PRECOPY_DONE,
+MIG_EVENT_PRECOPY_FAILED,
+MIG_EVENT_MAX
+} MigrationEventType;
+
+typedef struct MigrationEvent {
+MigrationEventType type;
+} MigrationEvent;
+
+/*
+ * Register the notifier @notify to be called when a migration event occurs
+ * for MIG_MODE_NORMAL, as specified by the MigrationEvent passed to @func.
+ * Notifiers may receive events in any of the following orders:
+ *- MIG_EVENT_PRECOPY_SETUP -> MIG_EVENT_PRECOPY_DONE
+ *- MIG_EVENT_PRECOPY_SETUP -> MIG_EVENT_PRECOPY_FAILED
+ *- MIG_EVENT_PRECOPY_FAILED
+ */
 void migration_add_notifier(NotifierWithReturn *notify,
 NotifierWithReturnFunc func);
+
 void migration_remove_notifier(NotifierWithReturn *notify);
-void migration_call_notifiers(MigrationState *s);
+void migration_call_notifiers(MigrationState *s, MigrationEventType type);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 75f4e8664d..e803f98c3a 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3504,7 +3504,7 @@ out:
 return !err;
 }
 
-static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState 
*s)
+static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationEvent 
*e)
 {
 bool should_be_hidden;
 Error *err = NULL;
@@ -3516,7 +3516,7 @@ static void virtio_net_handle_migration_primary(VirtIONet 
*n, MigrationState *s)
 
 should_be_hidden = qatomic_read(>failover_primary_hidden);
 
-if (migration_in_setup(s) && !should_be_hidden) {
+if (e->type == MIG_EVENT_PRECOPY_SETUP && !should_be_hidden) {
 if (failover_unplug_primary(n, dev)) {
 vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
 qapi_event_send_unplug_primary(dev->id);
@@ -3524,7 +3524,7 @@ static void virtio_net_handle_migration_primary(VirtIONet 
*n, MigrationState *s)
 } else {
 warn_report("couldn't unplug primary device");
 }
-} else if (migration_has_failed(s)) {
+} else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
 /* We already unplugged the device let's plug it back */
 if (!failover_replug_primary(n, dev, )) {
 if (err) {
@@ -3537,9 +3537,10 @@ static void 
virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s)
 static int virtio_net_migration_state_notifier(NotifierWithReturn *notifier,
void *data, Error **errp)
 {
-MigrationState *s = data;
+MigrationEvent *e = data;
+
 VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
-virtio_net_handle_migration_primary(n, s);
+virtio_net_handle_migration_primary(n, e);
 return 0;
 }
 
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 6b6acc4140..869d8417d6 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -757,18 +757,14 @@ static void vfio_vmstate_change(void *opaque, bool 
running, RunState state)
 static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
  void *data, Error **errp)
 {
-MigrationState *s = data;
+MigrationEvent *e = data;
 VFIOMigration *migration = container_of(notifier, VFIOMigration,
 migration_state);
 VFIODevice *vbasedev = migration->vbasedev;
 
-trace_vfio_migration_state_notifier(vbasedev->name,
-MigrationStatus_str(s->state));
+trace_vfio_migration_state_notifier(vbasedev->name, e->type);
 
-switch (s->state) {
-case MIGRATION_STATUS_CANCELLING:
-

[PULL 11/25] notify: pass error to notifier with return

2024-02-27 Thread peterx

From: Steve Sistare 

Pass an error object as the third parameter to "notifier with return"
notifiers, so clients no longer need to bundle an error object in the
opaque data.  The new parameter is used in a later patch.

Signed-off-by: Steve Sistare 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
Link: 
https://lore.kernel.org/r/1708622920-68779-2-git-send-email-steven.sist...@oracle.com
Signed-off-by: Peter Xu 
---
 include/qemu/notify.h  | 7 +--
 hw/virtio/vhost-user.c | 2 +-
 hw/virtio/virtio-balloon.c | 3 ++-
 migration/postcopy-ram.c   | 2 +-
 migration/ram.c| 2 +-
 util/notify.c  | 5 +++--
 6 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/qemu/notify.h b/include/qemu/notify.h
index bcfa70fb2e..9a85631864 100644
--- a/include/qemu/notify.h
+++ b/include/qemu/notify.h
@@ -45,12 +45,15 @@ bool notifier_list_empty(NotifierList *list);
 /* Same as Notifier but allows .notify() to return errors */
 typedef struct NotifierWithReturn NotifierWithReturn;
 
+typedef int (*NotifierWithReturnFunc)(NotifierWithReturn *notifier, void *data,
+  Error **errp);
+
 struct NotifierWithReturn {
 /**
  * Return 0 on success (next notifier will be invoked), otherwise
  * notifier_with_return_list_notify() will stop and return the value.
  */
-int (*notify)(NotifierWithReturn *notifier, void *data);
+NotifierWithReturnFunc notify;
 QLIST_ENTRY(NotifierWithReturn) node;
 };
 
@@ -69,6 +72,6 @@ void notifier_with_return_list_add(NotifierWithReturnList 
*list,
 void notifier_with_return_remove(NotifierWithReturn *notifier);
 
 int notifier_with_return_list_notify(NotifierWithReturnList *list,
- void *data);
+ void *data, Error **errp);
 
 #endif
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index f214df804b..f502345f37 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2084,7 +2084,7 @@ static int vhost_user_postcopy_end(struct vhost_dev *dev, 
Error **errp)
 }
 
 static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
-void *opaque)
+void *opaque, Error **errp)
 {
 struct PostcopyNotifyData *pnd = opaque;
 struct vhost_user *u = container_of(notifier, struct vhost_user,
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 486fe3da32..89f853fa9e 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -633,7 +633,8 @@ static void virtio_balloon_free_page_done(VirtIOBalloon *s)
 }
 
 static int
-virtio_balloon_free_page_hint_notify(NotifierWithReturn *n, void *data)
+virtio_balloon_free_page_hint_notify(NotifierWithReturn *n, void *data,
+ Error **errp)
 {
 VirtIOBalloon *dev = container_of(n, VirtIOBalloon, free_page_hint_notify);
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 893ec8fa89..3ab2f6b8fd 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -80,7 +80,7 @@ int postcopy_notify(enum PostcopyNotifyReason reason, Error 
**errp)
 pnd.errp = errp;
 
 return notifier_with_return_list_notify(_notifier_list,
-);
+, errp);
 }
 
 /*
diff --git a/migration/ram.c b/migration/ram.c
index 4649a81204..5b6b09edd9 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -428,7 +428,7 @@ int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 pnd.reason = reason;
 pnd.errp = errp;
 
-return notifier_with_return_list_notify(_notifier_list, );
+return notifier_with_return_list_notify(_notifier_list, , 
errp);
 }
 
 uint64_t ram_bytes_remaining(void)
diff --git a/util/notify.c b/util/notify.c
index 76bab212ae..c6e158ffb3 100644
--- a/util/notify.c
+++ b/util/notify.c
@@ -61,13 +61,14 @@ void notifier_with_return_remove(NotifierWithReturn 
*notifier)
 QLIST_REMOVE(notifier, node);
 }
 
-int notifier_with_return_list_notify(NotifierWithReturnList *list, void *data)
+int notifier_with_return_list_notify(NotifierWithReturnList *list, void *data,
+ Error **errp)
 {
 NotifierWithReturn *notifier, *next;
 int ret = 0;
 
 QLIST_FOREACH_SAFE(notifier, >notifiers, node, next) {
-ret = notifier->notify(notifier, data);
+ret = notifier->notify(notifier, data, errp);
 if (ret != 0) {
 break;
 }
-- 
2.43.0

[PULL 10/25] migration/multifd: Drop unnecessary helper to destroy IOC

2024-02-27 Thread peterx

From: Peter Xu 

Both socket_send_channel_destroy() and multifd_send_channel_destroy() are
unnecessary wrappers to destroy an IOC, as the only thing to do is to
release the final IOC reference.  We have plenty of code that destroys an
IOC using direct unref() already; keep that style.

Reviewed-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240222095301.171137-6-pet...@redhat.com
Signed-off-by: Peter Xu 
---
 migration/socket.h  | 1 -
 migration/multifd.c | 7 +--
 migration/socket.c  | 7 ---
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/migration/socket.h b/migration/socket.h
index 5f52eddd4c..46c233ecd2 100644
--- a/migration/socket.h
+++ b/migration/socket.h
@@ -23,7 +23,6 @@
 
 void socket_send_channel_create(QIOTaskFunc f, void *data);
 QIOChannel *socket_send_channel_create_sync(Error **errp);
-int socket_send_channel_destroy(QIOChannel *send);
 
 void socket_start_incoming_migration(SocketAddress *saddr, Error **errp);
 
diff --git a/migration/multifd.c b/migration/multifd.c
index fa33fd98b4..6c07f19af1 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -659,16 +659,11 @@ static void multifd_send_terminate_threads(void)
 }
 }
 
-static int multifd_send_channel_destroy(QIOChannel *send)
-{
-return socket_send_channel_destroy(send);
-}
-
 static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
 {
 if (p->c) {
 migration_ioc_unregister_yank(p->c);
-multifd_send_channel_destroy(p->c);
+object_unref(OBJECT(p->c));
 p->c = NULL;
 }
 qemu_sem_destroy(>sem);
diff --git a/migration/socket.c b/migration/socket.c
index 3184c7c3c1..9ab89b1e08 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -60,13 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp)
 return QIO_CHANNEL(sioc);
 }
 
-int socket_send_channel_destroy(QIOChannel *send)
-{
-/* Remove channel */
-object_unref(OBJECT(send));
-return 0;
-}
-
 struct SocketConnectData {
 MigrationState *s;
 char *hostname;
-- 
2.43.0

[PULL 06/25] migration/multifd: Cleanup TLS iochannel referencing

2024-02-27 Thread peterx

From: Peter Xu 

Commit a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to
blocking handshake") introduced a thread for TLS channels, which resolves
the issue of blocking the main thread.  However, in the same commit
p->c is slightly abused just to be able to pass the pointer "p" into
the thread.

That's the major reason we'll need to conditionally free the io channel in
the fault paths.

To clean it up, use a separate structure to pass both "p" and "tioc"
to the TLS handshake thread.  Then we can make it a rule that p->c will
never be set until the channel is completely set up.  With that, we can drop
the tricky conditional unref of the io channel in the error path.

Reviewed-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240222095301.171137-2-pet...@redhat.com
Signed-off-by: Peter Xu 
---
 migration/multifd.c | 37 +++--
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 43f0820996..84a6b9e58f 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -891,16 +891,22 @@ out:
 
 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque);
 
+typedef struct {
+MultiFDSendParams *p;
+QIOChannelTLS *tioc;
+} MultiFDTLSThreadArgs;
+
 static void *multifd_tls_handshake_thread(void *opaque)
 {
-MultiFDSendParams *p = opaque;
-QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c);
+MultiFDTLSThreadArgs *args = opaque;
 
-qio_channel_tls_handshake(tioc,
+qio_channel_tls_handshake(args->tioc,
   multifd_new_send_channel_async,
-  p,
+  args->p,
   NULL,
   NULL);
+g_free(args);
+
 return NULL;
 }
 
@@ -910,6 +916,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams 
*p,
 {
 MigrationState *s = migrate_get_current();
 const char *hostname = s->hostname;
+MultiFDTLSThreadArgs *args;
 QIOChannelTLS *tioc;
 
 tioc = migration_tls_client_create(ioc, hostname, errp);
@@ -924,11 +931,14 @@ static bool multifd_tls_channel_connect(MultiFDSendParams 
*p,
 object_unref(OBJECT(ioc));
 trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname);
 qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing");
-p->c = QIO_CHANNEL(tioc);
+
+args = g_new0(MultiFDTLSThreadArgs, 1);
+args->tioc = tioc;
+args->p = p;
 
 p->tls_thread_created = true;
 qemu_thread_create(>tls_thread, "multifd-tls-handshake-worker",
-   multifd_tls_handshake_thread, p,
+   multifd_tls_handshake_thread, args,
QEMU_THREAD_JOINABLE);
 return true;
 }
@@ -941,6 +951,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
 
 migration_ioc_register_yank(ioc);
 p->registered_yank = true;
+/* Setup p->c only if the channel is completely setup */
 p->c = ioc;
 
 p->thread_created = true;
@@ -994,14 +1005,12 @@ out:
 
 trace_multifd_new_send_channel_async_error(p->id, local_err);
 multifd_send_set_error(local_err);
-if (!p->c) {
-/*
- * If no channel has been created, drop the initial
- * reference. Otherwise cleanup happens at
- * multifd_send_channel_destroy()
- */
-object_unref(OBJECT(ioc));
-}
+/*
+ * For error cases (TLS or non-TLS), IO channel is always freed here
+ * rather than when cleanup multifd: since p->c is not set, multifd
+ * cleanup code doesn't even know its existence.
+ */
+object_unref(OBJECT(ioc));
 error_free(local_err);
 }
 
-- 
2.43.0

[PULL 09/25] migration/multifd: Cleanup outgoing_args in state destroy

2024-02-27 Thread peterx

From: Peter Xu 

outgoing_args is a global cache of socket address to be reused in multifd.
Freeing the cache in per-channel destructor is more or less a hack.  Move
it to multifd_send_cleanup_state() so it only gets checked once.  Use a
small helper to do so because it's internal to socket.c.

Reviewed-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240222095301.171137-5-pet...@redhat.com
Signed-off-by: Peter Xu 
---
 migration/socket.h  |  2 ++
 migration/multifd.c |  1 +
 migration/socket.c  | 12 
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/migration/socket.h b/migration/socket.h
index 5e4c33b8ea..5f52eddd4c 100644
--- a/migration/socket.h
+++ b/migration/socket.h
@@ -29,4 +29,6 @@ void socket_start_incoming_migration(SocketAddress *saddr, 
Error **errp);
 
 void socket_start_outgoing_migration(MigrationState *s,
  SocketAddress *saddr, Error **errp);
+void socket_cleanup_outgoing_migration(void);
+
 #endif
diff --git a/migration/multifd.c b/migration/multifd.c
index af89e05915..fa33fd98b4 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -689,6 +689,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams 
*p, Error **errp)
 
 static void multifd_send_cleanup_state(void)
 {
+socket_cleanup_outgoing_migration();
 qemu_sem_destroy(_send_state->channels_created);
 qemu_sem_destroy(_send_state->channels_ready);
 g_free(multifd_send_state->params);
diff --git a/migration/socket.c b/migration/socket.c
index 98e3ea1514..3184c7c3c1 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -64,10 +64,6 @@ int socket_send_channel_destroy(QIOChannel *send)
 {
 /* Remove channel */
 object_unref(OBJECT(send));
-if (outgoing_args.saddr) {
-qapi_free_SocketAddress(outgoing_args.saddr);
-outgoing_args.saddr = NULL;
-}
 return 0;
 }
 
@@ -137,6 +133,14 @@ void socket_start_outgoing_migration(MigrationState *s,
  NULL);
 }
 
+void socket_cleanup_outgoing_migration(void)
+{
+if (outgoing_args.saddr) {
+qapi_free_SocketAddress(outgoing_args.saddr);
+outgoing_args.saddr = NULL;
+}
+}
+
 static void socket_accept_incoming_migration(QIONetListener *listener,
  QIOChannelSocket *cioc,
  gpointer opaque)
-- 
2.43.0

[PULL 07/25] migration/multifd: Drop registered_yank

2024-02-27 Thread peterx

From: Peter Xu 

With a clear definition of the p->c protocol, where we only set it up if the
channel is fully established (TLS or non-TLS), the registered_yank boolean
has the same meaning as "p->c != NULL".

Drop registered_yank by checking p->c instead.

Reviewed-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240222095301.171137-3-pet...@redhat.com
Signed-off-by: Peter Xu 
---
 migration/multifd.h | 2 --
 migration/multifd.c | 7 +++
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/migration/multifd.h b/migration/multifd.h
index 8a1cad0996..b3fe27ae93 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -78,8 +78,6 @@ typedef struct {
 bool tls_thread_created;
 /* communication channel */
 QIOChannel *c;
-/* is the yank function registered */
-bool registered_yank;
 /* packet allocated len */
 uint32_t packet_len;
 /* guest page size */
diff --git a/migration/multifd.c b/migration/multifd.c
index 84a6b9e58f..1d039a4840 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -666,11 +666,11 @@ static int multifd_send_channel_destroy(QIOChannel *send)
 
 static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
 {
-if (p->registered_yank) {
+if (p->c) {
 migration_ioc_unregister_yank(p->c);
+multifd_send_channel_destroy(p->c);
+p->c = NULL;
 }
-multifd_send_channel_destroy(p->c);
-p->c = NULL;
 qemu_sem_destroy(>sem);
 qemu_sem_destroy(>sem_sync);
 g_free(p->name);
@@ -950,7 +950,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
 qio_channel_set_delay(ioc, false);
 
 migration_ioc_register_yank(ioc);
-p->registered_yank = true;
 /* Setup p->c only if the channel is completely setup */
 p->c = ioc;
 
-- 
2.43.0

[PULL 08/25] migration/multifd: Make multifd_channel_connect() return void

2024-02-27 Thread peterx

From: Peter Xu 

It never fails, drop the retval and also the Error**.

Suggested-by: Avihai Horon 
Reviewed-by: Fabiano Rosas 
Link: https://lore.kernel.org/r/20240222095301.171137-4-pet...@redhat.com
Signed-off-by: Peter Xu 
---
 migration/multifd.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 1d039a4840..af89e05915 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -943,9 +943,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams 
*p,
 return true;
 }
 
-static bool multifd_channel_connect(MultiFDSendParams *p,
-QIOChannel *ioc,
-Error **errp)
+static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc)
 {
 qio_channel_set_delay(ioc, false);
 
@@ -956,7 +954,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
 p->thread_created = true;
 qemu_thread_create(>thread, p->name, multifd_send_thread, p,
QEMU_THREAD_JOINABLE);
-return true;
 }
 
 /*
@@ -988,7 +985,8 @@ static void multifd_new_send_channel_async(QIOTask *task, 
gpointer opaque)
 return;
 }
 } else {
-ret = multifd_channel_connect(p, ioc, _err);
+multifd_channel_connect(p, ioc);
+ret = true;
 }
 
 out:
-- 
2.43.0

[PULL 01/25] docs/devel/migration.rst: Document the file transport

2024-02-27 Thread peterx

From: Fabiano Rosas 

When adding the support for file migration with the file: transport,
we missed adding documentation for it.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Link: https://lore.kernel.org/r/20240220224138.24759-2-faro...@suse.de
Signed-off-by: Peter Xu 
---
 docs/devel/migration/main.rst | 4 
 1 file changed, 4 insertions(+)

diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
index 331252a92c..8024275d6d 100644
--- a/docs/devel/migration/main.rst
+++ b/docs/devel/migration/main.rst
@@ -41,6 +41,10 @@ over any transport.
 - exec migration: do the migration using the stdin/stdout through a process.
 - fd migration: do the migration using a file descriptor that is
   passed to QEMU.  QEMU doesn't care how this file descriptor is opened.
+- file migration: do the migration using a file that is passed to QEMU
+  by path. A file offset option is supported to allow a management
+  application to add its own metadata to the start of the file without
+  QEMU interference.
 
 In addition, support is included for migration using RDMA, which
 transports the page data using ``RDMA``, where the hardware takes care of
-- 
2.43.0

[PULL 03/25] tests/qtest/migration: Add a fd + file test

2024-02-27 Thread peterx

From: Fabiano Rosas 

The fd URI supports an fd that is backed by a file. The code should
select between QIOChannelFile and QIOChannelSocket, depending on the
type of the fd. Add a test for that.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Link: https://lore.kernel.org/r/20240220224138.24759-4-faro...@suse.de
Signed-off-by: Peter Xu 
---
 tests/qtest/migration-test.c | 41 
 1 file changed, 41 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b729ce4d22..83512bce85 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2433,6 +2433,45 @@ static void test_migrate_precopy_fd_socket(void)
 };
 test_precopy_common();
 }
+
+static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to)
+{
+g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+int src_flags = O_CREAT | O_RDWR;
+int dst_flags = O_CREAT | O_RDWR;
+int fds[2];
+
+fds[0] = open(file, src_flags, 0660);
+assert(fds[0] != -1);
+
+fds[1] = open(file, dst_flags, 0660);
+assert(fds[1] != -1);
+
+
+qtest_qmp_fds_assert_success(to, [0], 1,
+ "{ 'execute': 'getfd',"
+ "  'arguments': { 'fdname': 'fd-mig' }}");
+
+qtest_qmp_fds_assert_success(from, [1], 1,
+ "{ 'execute': 'getfd',"
+ "  'arguments': { 'fdname': 'fd-mig' }}");
+
+close(fds[0]);
+close(fds[1]);
+
+return NULL;
+}
+
+static void test_migrate_precopy_fd_file(void)
+{
+MigrateCommon args = {
+.listen_uri = "defer",
+.connect_uri = "fd:fd-mig",
+.start_hook = migrate_precopy_fd_file_start,
+.finish_hook = test_migrate_fd_finish_hook
+};
+test_file_common(, true);
+}
 #endif /* _WIN32 */
 
 static void do_test_validate_uuid(MigrateStart *args, bool should_fail)
@@ -3529,6 +3568,8 @@ int main(int argc, char **argv)
 #ifndef _WIN32
 migration_test_add("/migration/precopy/fd/tcp",
test_migrate_precopy_fd_socket);
+migration_test_add("/migration/precopy/fd/file",
+   test_migrate_precopy_fd_file);
 #endif
 migration_test_add("/migration/validate_uuid", test_validate_uuid);
 migration_test_add("/migration/validate_uuid_error",
-- 
2.43.0

[PULL 02/25] tests/qtest/migration: Rename fd_proto test

2024-02-27 Thread peterx

From: Fabiano Rosas 

Next patch adds another fd test. Rename the existing one closer to
what's used on other tests, with the 'precopy' prefix.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Link: https://lore.kernel.org/r/20240220224138.24759-3-faro...@suse.de
Signed-off-by: Peter Xu 
---
 tests/qtest/migration-test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 8a5bb1752e..b729ce4d22 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2423,7 +2423,7 @@ static void test_migrate_fd_finish_hook(QTestState *from,
 qobject_unref(rsp);
 }
 
-static void test_migrate_fd_proto(void)
+static void test_migrate_precopy_fd_socket(void)
 {
 MigrateCommon args = {
 .listen_uri = "defer",
@@ -3527,7 +3527,8 @@ int main(int argc, char **argv)
 
 /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */
 #ifndef _WIN32
-migration_test_add("/migration/fd_proto", test_migrate_fd_proto);
+migration_test_add("/migration/precopy/fd/tcp",
+   test_migrate_precopy_fd_socket);
 #endif
 migration_test_add("/migration/validate_uuid", test_validate_uuid);
 migration_test_add("/migration/validate_uuid_error",
-- 
2.43.0

[PULL 04/25] migration/multifd: Remove p->quit from recv side

2024-02-27 Thread peterx

From: Fabiano Rosas 

Like we did on the sending side, replace the p->quit per-channel flag
with a global atomic 'exiting' flag.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Link: https://lore.kernel.org/r/20240220224138.24759-5-faro...@suse.de
Signed-off-by: Peter Xu 
---
 migration/multifd.c | 41 -
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index adfe8c9a0a..fba00b9e8f 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -79,6 +79,19 @@ struct {
 MultiFDMethods *ops;
 } *multifd_send_state;
 
+struct {
+MultiFDRecvParams *params;
+/* number of created threads */
+int count;
+/* syncs main thread and channels */
+QemuSemaphore sem_sync;
+/* global number of generated multifd packets */
+uint64_t packet_num;
+int exiting;
+/* multifd ops */
+MultiFDMethods *ops;
+} *multifd_recv_state;
+
 /* Multifd without compression */
 
 /**
@@ -440,6 +453,11 @@ static bool multifd_send_should_exit(void)
 return qatomic_read(_send_state->exiting);
 }
 
+static bool multifd_recv_should_exit(void)
+{
+return qatomic_read(_recv_state->exiting);
+}
+
 /*
  * The migration thread can wait on either of the two semaphores.  This
  * function can be used to kick the main thread out of waiting on either of
@@ -1063,24 +1081,16 @@ bool multifd_send_setup(void)
 return true;
 }
 
-struct {
-MultiFDRecvParams *params;
-/* number of created threads */
-int count;
-/* syncs main thread and channels */
-QemuSemaphore sem_sync;
-/* global number of generated multifd packets */
-uint64_t packet_num;
-/* multifd ops */
-MultiFDMethods *ops;
-} *multifd_recv_state;
-
 static void multifd_recv_terminate_threads(Error *err)
 {
 int i;
 
 trace_multifd_recv_terminate_threads(err != NULL);
 
+if (qatomic_xchg(_recv_state->exiting, 1)) {
+return;
+}
+
 if (err) {
 MigrationState *s = migrate_get_current();
 migrate_set_error(s, err);
@@ -1094,8 +1104,6 @@ static void multifd_recv_terminate_threads(Error *err)
 for (i = 0; i < migrate_multifd_channels(); i++) {
 MultiFDRecvParams *p = _recv_state->params[i];
 
-qemu_mutex_lock(>mutex);
-p->quit = true;
 /*
  * We could arrive here for two reasons:
  *  - normal quit, i.e. everything went fine, just finished
@@ -1105,7 +1113,6 @@ static void multifd_recv_terminate_threads(Error *err)
 if (p->c) {
 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
 }
-qemu_mutex_unlock(>mutex);
 }
 }
 
@@ -1210,7 +1217,7 @@ static void *multifd_recv_thread(void *opaque)
 while (true) {
 uint32_t flags;
 
-if (p->quit) {
+if (multifd_recv_should_exit()) {
 break;
 }
 
@@ -1274,6 +1281,7 @@ int multifd_recv_setup(Error **errp)
 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 qatomic_set(_recv_state->count, 0);
+qatomic_set(_recv_state->exiting, 0);
 qemu_sem_init(_recv_state->sem_sync, 0);
 multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];
 
@@ -1282,7 +1290,6 @@ int multifd_recv_setup(Error **errp)
 
 qemu_mutex_init(>mutex);
 qemu_sem_init(>sem_sync, 0);
-p->quit = false;
 p->id = i;
 p->packet_len = sizeof(MultiFDPacket_t)
   + sizeof(uint64_t) * page_count;
-- 
2.43.0

[PULL 00/25] Migration next patches

2024-02-27 Thread peterx

From: Peter Xu 

The following changes since commit 158a054c4d1a40179f5e83cd7e1cfe65de457b92:

  Merge tag 'pull-target-arm-20240227-1' of 
https://git.linaro.org/people/pmaydell/qemu-arm into staging (2024-02-27 
15:34:41 +)

are available in the Git repository at:

  https://gitlab.com/peterx/qemu.git tags/migration-next-pull-request

for you to fetch changes up to 9425ef3f990a42b98329d5059362f40714e70442:

  migration: Use migrate_has_error() in close_return_path_on_source() 
(2024-02-28 11:31:28 +0800)


Migration pull request

- Fabiano's fixed-ram patches (1-5 only)
- Peter's cleanups on multifd tls IOC referencing
- Steve's cpr patches for vfio (migration patches only)
- Fabiano's fix on mbps stats racing with COMPLETE state
- Fabiano's fix on return path thread hang



Cédric Le Goater (1):
  migration: Use migrate_has_error() in close_return_path_on_source()

Fabiano Rosas (7):
  docs/devel/migration.rst: Document the file transport
  tests/qtest/migration: Rename fd_proto test
  tests/qtest/migration: Add a fd + file test
  migration/multifd: Remove p->quit from recv side
  migration/multifd: Release recv sem_sync earlier
  migration: Fix qmp_query_migrate mbps value
  migration: Join the return path thread before releasing to_dst_file

Peter Xu (5):
  migration/multifd: Cleanup TLS iochannel referencing
  migration/multifd: Drop registered_yank
  migration/multifd: Make multifd_channel_connect() return void
  migration/multifd: Cleanup outgoing_args in state destroy
  migration/multifd: Drop unnecessary helper to destroy IOC

Steve Sistare (12):
  notify: pass error to notifier with return
  migration: remove error from notifier data
  migration: convert to NotifierWithReturn
  migration: MigrationEvent for notifiers
  migration: remove postcopy_after_devices
  migration: MigrationNotifyFunc
  migration: per-mode notifiers
  migration: refactor migrate_fd_connect failures
  migration: notifier error checking
  migration: stop vm for cpr
  migration: update cpr-reboot description
  migration: options incompatible with cpr

 docs/devel/migration/main.rst  |   4 +
 qapi/migration.json|  37 ---
 include/hw/vfio/vfio-common.h  |   2 +-
 include/hw/virtio/virtio-net.h |   2 +-
 include/migration/misc.h   |  47 +++-
 include/qemu/notify.h  |   8 +-
 migration/migration.h  |   4 -
 migration/multifd.h|   2 -
 migration/postcopy-ram.h   |   1 -
 migration/socket.h |   3 +-
 hw/net/virtio-net.c|  13 ++-
 hw/vfio/migration.c|  13 +--
 hw/virtio/vhost-user.c |  10 +-
 hw/virtio/virtio-balloon.c |   3 +-
 migration/migration.c  | 196 +
 migration/multifd.c| 111 ++-
 migration/postcopy-ram.c   |   3 +-
 migration/ram.c|   3 +-
 migration/socket.c |  19 ++--
 net/vhost-vdpa.c   |  14 +--
 tests/qtest/migration-test.c   |  46 +++-
 ui/spice-core.c|  17 ++-
 util/notify.c  |   5 +-
 hw/vfio/trace-events   |   2 +-
 24 files changed, 354 insertions(+), 211 deletions(-)

-- 
2.43.0

Re: [PATCH v4 23/29] plugins: add an API to read registers

2024-02-27 Thread Akihiko Odaki


On 2024/02/27 23:43, Alex Bennée wrote:

We can only request a list of registers once the vCPU has been
initialised, so the user needs to call the get function either on
vCPU initialisation or during the translation phase.

We don't expose the reg number to the plugin instead hiding it behind
an opaque handle. For now this is just the gdb_regnum encapsulated in
an anonymous GPOINTER but in future as we add more state for plugins
to track we can expand it.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1706
Cc: Akihiko Odaki 
Message-Id: <20240103173349.398526-39-alex.ben...@linaro.org>
Based-on: <20231025093128.33116-18-akihiko.od...@daynix.com>
Signed-off-by: Alex Bennée 
Reviewed-by: Pierrick Bouvier 

---
v4
   - the get/read_registers functions are now implicitly for current
   vCPU only to avoid accidental cpu != current_cpu uses.
v5
   - make reg_handles as per-CPUPluginState variable.
v6
   - for now just wrap gdb_regnum
v7
   - minor style fixes
---
  include/qemu/qemu-plugin.h   | 48 +--
  plugins/api.c| 55 
  plugins/qemu-plugins.symbols |  2 ++
  3 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h
index 93981f8f89f..6c5580f4428 100644
--- a/include/qemu/qemu-plugin.h
+++ b/include/qemu/qemu-plugin.h
@@ -11,6 +11,7 @@
  #ifndef QEMU_QEMU_PLUGIN_H
  #define QEMU_QEMU_PLUGIN_H
  
+#include 

  #include 
  #include 
  #include 
@@ -229,8 +230,8 @@ struct qemu_plugin_insn;
   * @QEMU_PLUGIN_CB_R_REGS: callback reads the CPU's regs
   * @QEMU_PLUGIN_CB_RW_REGS: callback reads and writes the CPU's regs
   *
- * Note: currently unused, plugins cannot read or change system
- * register state.
+ * Note: currently QEMU_PLUGIN_CB_RW_REGS is unused, plugins cannot change
+ * system register state.
   */
  enum qemu_plugin_cb_flags {
  QEMU_PLUGIN_CB_NO_REGS,
@@ -707,4 +708,47 @@ uint64_t qemu_plugin_end_code(void);
  QEMU_PLUGIN_API
  uint64_t qemu_plugin_entry_code(void);
  
+/** struct qemu_plugin_register - Opaque handle for register access */

+struct qemu_plugin_register;
+
+/**
+ * typedef qemu_plugin_reg_descriptor - register descriptions
+ *
+ * @handle: opaque handle for retrieving value with qemu_plugin_read_register
+ * @name: register name
+ * @feature: optional feature descriptor, can be NULL
+ */
+typedef struct {
+struct qemu_plugin_register *handle;
+const char *name;
+const char *feature;
+} qemu_plugin_reg_descriptor;
+
+/**
+ * qemu_plugin_get_registers() - return register list for current vCPU
+ *
+ * Returns a potentially empty GArray of qemu_plugin_reg_descriptor.
+ * Caller frees the array (but not the const strings).
+ *
+ * Should be used from a qemu_plugin_register_vcpu_init_cb() callback
+ * after the vCPU is initialised, i.e. in the vCPU context.
+ */


Qualify with QEMU_PLUGIN_API, which was apparently added after this 
patch was authored.



+GArray *qemu_plugin_get_registers(void);
+
+/**
+ * qemu_plugin_read_register() - read register for current vCPU
+ *
+ * @handle: a @qemu_plugin_reg_handle handle
+ * @buf: A GByteArray for the data owned by the plugin
+ *
+ * This function is only available in a context that register read access is
+ * explicitly requested via the QEMU_PLUGIN_CB_R_REGS flag.
+ *
+ * Returns the size of the read register. The content of @buf is in target byte
+ * order. On failure returns -1


Add a period after -1.


+ */
+int qemu_plugin_read_register(struct qemu_plugin_register *handle,
+  GByteArray *buf);
+
+
  #endif /* QEMU_QEMU_PLUGIN_H */
diff --git a/plugins/api.c b/plugins/api.c
index 54df72c1c00..908fe7e6fa3 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -8,6 +8,7 @@
   *
   *  qemu_plugin_tb
   *  qemu_plugin_insn
+ *  qemu_plugin_register
   *
   * Which can then be passed back into the API to do additional things.
   * As such all the public functions in here are exported in
@@ -35,10 +36,12 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/main-loop.h"
  #include "qemu/plugin.h"
  #include "qemu/log.h"
  #include "tcg/tcg.h"
  #include "exec/exec-all.h"
+#include "exec/gdbstub.h"
  #include "exec/ram_addr.h"
  #include "disas/disas.h"
  #include "plugin.h"
@@ -410,3 +413,55 @@ uint64_t qemu_plugin_entry_code(void)
  #endif
  return entry;
  }
+
+/*
+ * Create register handles.
+ *
+ * We need to create a handle for each register so the plugin
+ * infrastructure can call gdbstub to read a register. They are
+ * currently just a pointer encapsulation of the gdb_regnum but in


s/gdb_regnum/gdb_reg/

With all comments fixed,

Reviewed-by: Akihiko Odaki 


+ * future may hold internal plugin state so its important plugin
+ * authors are not tempted to treat them as numbers.
+ *
+ * We also construct a result array with those handles and some
+ * ancillary data the plugin might find useful.
+ */
+
+static GArray

Re: [PATCH v4 21/29] gdbstub: expose api to find registers

2024-02-27 Thread Akihiko Odaki


On 2024/02/27 23:43, Alex Bennée wrote:

Expose an internal API to QEMU to return all the registers for a vCPU.
The list contains the details required to call gdb_read_register().

Based-on: <20231025093128.33116-15-akihiko.od...@daynix.com>
Cc: Akihiko Odaki 
Message-Id: <20240103173349.398526-38-alex.ben...@linaro.org>
Signed-off-by: Alex Bennée 


Reviewed-by: Akihiko Odaki 



---
v3
   - rm unused api functions left over
---
  include/exec/gdbstub.h | 28 
  gdbstub/gdbstub.c  | 27 ++-
  2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
index da9ddfe54c5..eb14b91139b 100644
--- a/include/exec/gdbstub.h
+++ b/include/exec/gdbstub.h
@@ -111,6 +111,34 @@ void gdb_feature_builder_end(const GDBFeatureBuilder 
*builder);
   */
  const GDBFeature *gdb_find_static_feature(const char *xmlname);
  
+/**

+ * gdb_read_register() - Read a register associated with a CPU.
+ * @cpu: The CPU associated with the register.
+ * @buf: The buffer that the read register will be appended to.
+ * @reg: The register's number returned by gdb_find_feature_register().
+ *
+ * Return: The number of read bytes.
+ */
+int gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
+
+/**
+ * typedef GDBRegDesc - a register description from gdbstub
+ */
+typedef struct {
+int gdb_reg;
+const char *name;
+const char *feature_name;
+} GDBRegDesc;
+
+/**
+ * gdb_get_register_list() - Return list of all registers for CPU
+ * @cpu: The CPU being searched
+ *
+ * Returns a GArray of GDBRegDesc, caller frees array but not the
+ * const strings.
+ */
+GArray *gdb_get_register_list(CPUState *cpu);
+
  void gdb_set_stop_cpu(CPUState *cpu);
  
  /* in gdbstub-xml.c, generated by scripts/feature_to_c.py */

diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
index a55b5e6581a..2909bc8c69f 100644
--- a/gdbstub/gdbstub.c
+++ b/gdbstub/gdbstub.c
@@ -490,7 +490,32 @@ const GDBFeature *gdb_find_static_feature(const char 
*xmlname)
  g_assert_not_reached();
  }
  
-static int gdb_read_register(CPUState *cpu, GByteArray *buf, int reg)

+GArray *gdb_get_register_list(CPUState *cpu)
+{
+GArray *results = g_array_new(true, true, sizeof(GDBRegDesc));
+
+/* registers are only available once the CPU is initialised */
+if (!cpu->gdb_regs) {
+return results;
+}
+
+for (int f = 0; f < cpu->gdb_regs->len; f++) {
+GDBRegisterState *r = _array_index(cpu->gdb_regs, GDBRegisterState, 
f);
+for (int i = 0; i < r->feature->num_regs; i++) {
+const char *name = r->feature->regs[i];
+GDBRegDesc desc = {
+r->base_reg + i,
+name,
+r->feature->name
+};
+g_array_append_val(results, desc);
+}
+}
+
+return results;
+}
+
+int gdb_read_register(CPUState *cpu, GByteArray *buf, int reg)
  {
  CPUClass *cc = CPU_GET_CLASS(cpu);
  GDBRegisterState *r;

Re: [PATCH 4/4] replay: simple auto-snapshot mode for record

2024-02-27 Thread Pavel Dovgalyuk


On 26.02.2024 10:36, Nicholas Piggin wrote:

On Fri Aug 18, 2023 at 2:36 PM AEST, Pavel Dovgalyuk wrote:

On 14.08.2023 19:31, Nicholas Piggin wrote:

record makes an initial snapshot when the machine is created, to enable
reverse-debugging. Often the issue being debugged appears near the end of
the trace, so it is important for performance to keep snapshots close to
the end.

This implements a periodic snapshot mode that keeps a rolling set of
recent snapshots.

Arguably this should be done by the debugger or a program that talks to
QMP, but for setting up simple scenarios and tests, it is convenient to
have this feature.


I'm looking at resurrecting this to help add a bit of testing...

[snip]


+static void replay_snapshot_timer_cb(void *opaque)
+{
+Error *err = NULL;
+char *name;
+
+if (!replay_can_snapshot()) {
+/* Try again soon */
+timer_mod(replay_snapshot_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
+  replay_snapshot_periodic_delay / 10);
+return;
+}
+
+name = g_strdup_printf("%s-%d", replay_snapshot, replay_snapshot_count);
+if (!save_snapshot(name,
+   true, NULL, false, NULL, )) {
+error_report_err(err);
+error_report("Could not create periodic snapshot "
+ "for icount record, disabling");
+g_free(name);
+return;
+}
+g_free(name);
+replay_snapshot_count++;
+
+if (replay_snapshot_periodic_nr_keep >= 1 &&
+replay_snapshot_count > replay_snapshot_periodic_nr_keep) {
+int del_nr;
+
+del_nr = replay_snapshot_count - replay_snapshot_periodic_nr_keep - 1;
+name = g_strdup_printf("%s-%d", replay_snapshot, del_nr);
+if (!delete_snapshot(name, false, NULL, )) {
+error_report_err(err);
+error_report("Could not delete periodic snapshot "
+ "for icount record");
+}
+g_free(name);
+}
+
+timer_mod(replay_snapshot_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
+  replay_snapshot_periodic_delay);


I'm not sure that realtime is the best choice for such a timer.
Virtual machine may be stopped or slowed down for some reason.


My thinking was that, say if you snapshot every 10 seconds of real time
executed, then you should have an upper limit on the order of 10 seconds
to perform a reverse-debug operation (so long as you don't exceed your
nr_keep limit).


But in some cases savevm itself could take more than 10 seconds.
We'll have infinite saving in this case. That's why I propose using 
virtual clock with the QEMU_TIMER_ATTR_EXTERNAL attribute.




Is it worth worrying about complexity of slowdowns and vm pausing?
Maybe it could stop snapshotting on a host pause.


+}
+
   void replay_vmstate_init(void)
   {
   Error *err = NULL;
@@ -81,6 +128,16 @@ void replay_vmstate_init(void)
   error_report("Could not create snapshot for icount record");
   exit(1);
   }
+
+if (replay_snapshot_mode == REPLAY_SNAPSHOT_MODE_PERIODIC) {
+replay_snapshot_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
+ replay_snapshot_timer_cb,
+ NULL);
+timer_mod(replay_snapshot_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
+  replay_snapshot_periodic_delay);
+}
+


Please also delete placeholder comment for the snapshotting timer
in replay_enable function.


Will do.


   } else if (replay_mode == REPLAY_MODE_PLAY) {
   if (!load_snapshot(replay_snapshot, NULL, false, NULL, )) {
   error_report_err(err);
diff --git a/replay/replay.c b/replay/replay.c
index e64f71209a..fa5930700d 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -29,6 +29,10 @@
   ReplayMode replay_mode = REPLAY_MODE_NONE;
   char *replay_snapshot;
   
+ReplaySnapshotMode replay_snapshot_mode;

+uint64_t replay_snapshot_periodic_delay;
+int replay_snapshot_periodic_nr_keep;
+
   /* Name of replay file  */
   static char *replay_filename;
   ReplayState replay_state;
@@ -313,6 +317,27 @@ void replay_configure(QemuOpts *opts)
   }
   
   replay_snapshot = g_strdup(qemu_opt_get(opts, "rrsnapshot"));

+if (replay_snapshot && mode == REPLAY_MODE_RECORD) {


Could such snapshotting be useful in replay mode?


Does snapshotting do anything in replay mode? 


Yes, you can create as many snapshots as you want if 'snapshot=on'
option of the disk image was not used.


I assume if we did
snapshotting based on the machine timer then we'd have to support
it here so the timer events get replayed properly, at least. But
I was trying to get by with minimum complexity :)


Use QEMU_TIMER_ATTR_EXTERNAL attribute for the timer and then its
events will not affect the

[PATCH v1 07/11] vfio/iommufd: Implement host_iommu_device_create callback in iommufd mode

2024-02-27 Thread Zhenzhong Duan

This callback will be used to initialize base and public elements
in IOMMUFDDevice.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9bfddc1360..1c2f5da0d0 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -619,6 +619,15 @@ out_single:
 return ret;
 }
 
+static void vfio_cdev_host_iommu_device_create(VFIODevice *vbasedev)
+{
+IOMMUFDDevice *idev = g_malloc0(sizeof(IOMMUFDDevice));
+
+vbasedev->base_hdev = >base;
+
+iommufd_device_init(idev, vbasedev->iommufd, vbasedev->devid);
+}
+
 static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -628,6 +637,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->attach_device = iommufd_cdev_attach;
 vioc->detach_device = iommufd_cdev_detach;
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
+vioc->host_iommu_device_create = vfio_cdev_host_iommu_device_create;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1

[PATCH v1 05/11] vfio: Introduce host_iommu_device_create callback

2024-02-27 Thread Zhenzhong Duan

Introduce host_iommu_device_create callback and a wrapper for it.

This callback is used to allocate a host iommu device instance and
initialize it based on type.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 include/hw/vfio/vfio-container-base.h | 1 +
 hw/vfio/common.c  | 8 
 3 files changed, 10 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b6676c9f79..9fefea4b89 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -208,6 +208,7 @@ struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
+void host_iommu_device_create(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
 int vfio_kvm_device_del_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index b2813b0c11..dc003f6eb2 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -120,6 +120,7 @@ struct VFIOIOMMUClass {
 int (*attach_device)(const char *name, VFIODevice *vbasedev,
  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
+void (*host_iommu_device_create)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
bool start);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 059bfdc07a..41e9031c59 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1521,3 +1521,11 @@ void vfio_detach_device(VFIODevice *vbasedev)
 }
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
+
+void host_iommu_device_create(VFIODevice *vbasedev)
+{
+const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops;
+
+assert(ops->host_iommu_device_create);
+ops->host_iommu_device_create(vbasedev);
+}
-- 
2.34.1

[PATCH v1 04/11] vfio: Add HostIOMMUDevice handle into VFIODevice

2024-02-27 Thread Zhenzhong Duan

This handle points to either the IOMMULegacyDevice or the IOMMUFDDevice variant,
but never both.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8bfb9cbe94..b6676c9f79 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -130,6 +130,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *base_hdev;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
-- 
2.34.1

[PATCH v1 11/11] backends/iommufd: Introduce helper function iommufd_device_get_info()

2024-02-27 Thread Zhenzhong Duan

Introduce a helper function iommufd_device_get_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 23 ++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index d509ff88ef..518b97bfed 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include <linux/iommufd.h>
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -48,4 +49,7 @@ typedef struct IOMMUFDDevice {
 
 void iommufd_device_init(IOMMUFDDevice *idev,
  IOMMUFDBackend *iommufd, int devid);
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 6d280e4aea..69f3f75ea5 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include <sys/ioctl.h>
-#include <linux/iommufd.h>
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -240,3 +239,25 @@ void iommufd_device_init(IOMMUFDDevice *idev,
 idev->iommufd = iommufd;
 idev->devid = devid;
 }
+
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = idev->devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(idev->iommufd->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;
+}
+
+return ret;
+}
-- 
2.34.1

[PATCH v1 09/11] hw/pci: Introduce pci_device_set/unset_iommu_device()

2024-02-27 Thread Zhenzhong Duan

From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCIe device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate
implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 38 ++-
 hw/pci/pci.c | 62 +---
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index fa6313aabc..8fe6f746d7 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * Return true if HostIOMMUDevice is attached, or else return false
+ * with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 76080af580..8078307963 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2672,11 +2672,14 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **aliased_bus,
+   PCIBus **piommu_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2717,13 +2720,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
 }
 
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, NULL, &iommu_bus, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  dev->devfn, base_dev,
+  errp);
+}
+return 0;
+}
+
+void pci_device_unset_iommu_device(PCIDevice *dev)
+{
+PCIBus

[PATCH v1 10/11] vfio: Pass HostIOMMUDevice to vIOMMU

2024-02-27 Thread Zhenzhong Duan

Support both iommufd and legacy backend.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6cc7de5d10..ed9f386fde 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3112,11 +3112,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, vbasedev->base_hdev, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3133,7 +3139,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3142,13 +3148,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3234,6 +3240,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3263,6 +3271,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3277,7 +3286,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1

[PATCH v1 03/11] vfio: Introduce IOMMULegacyDevice

2024-02-27 Thread Zhenzhong Duan

Similar to IOMMUFDDevice, IOMMULegacyDevice represents a device in
legacy mode and can be used as a communication interface between
devices (i.e., VFIO, VDPA) and vIOMMU.

Currently it includes nothing legacy specific, but could be extended
with any wanted info of legacy mode when necessary.

IOMMULegacyDevice is deliberately not a QOM object because we don't want
it to be visible from the user interface.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b7ef7d02b..8bfb9cbe94 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -97,6 +98,11 @@ typedef struct VFIOIOMMUFDContainer {
 uint32_t ioas_id;
 } VFIOIOMMUFDContainer;
 
+/* Abstraction of host IOMMU legacy device */
+typedef struct IOMMULegacyDevice {
+HostIOMMUDevice base;
+} IOMMULegacyDevice;
+
 typedef struct VFIODeviceOps VFIODeviceOps;
 
 typedef struct VFIODevice {
-- 
2.34.1

[PATCH v1 08/11] vfio/pci: Allocate and initialize HostIOMMUDevice after attachment

2024-02-27 Thread Zhenzhong Duan

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 4fa387f043..6cc7de5d10 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3006,6 +3006,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 goto error;
 }
 
+/* Allocate and initialize HostIOMMUDevice after attachment succeed */
+host_iommu_device_create(vbasedev);
+
 vfio_populate_device(vdev, &err);
 if (err) {
 error_propagate(errp, err);
@@ -3244,6 +3247,7 @@ static void vfio_instance_finalize(Object *obj)
 
 vfio_display_finalize(vdev);
 vfio_bars_finalize(vdev);
+g_free(vdev->vbasedev.base_hdev);
 g_free(vdev->emulated_config_bits);
 g_free(vdev->rom);
 /*
-- 
2.34.1

[PATCH v1 00/11] Add a host IOMMU device abstraction

2024-02-27 Thread Zhenzhong Duan

Hi,

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part and a vIOMMU
check/sync part. This series implements the 1st part.

This split also makes it easier for the dirty tracking series [2] and the
virtio-iommu series [3] to depend on the 1st part.

PATCH1-3: Introduce HostIOMMUDevice and two sub classes
PATCH4: Define HostIOMMUDevice handle in VFIODevice
PATCH5-8: Introduce host_iommu_device_create callback to allocate and initialize 
HostIOMMUDevice
PATCH9-10: Introduce set/unset_iommu_device to pass HostIOMMUDevice to vIOMMU
PATCH11: a helper to get host IOMMU info

Because the direction is becoming clear from the community's suggestions, I'd
like to remove the rfc tag from this version.

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part1_v1

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] 
https://lore.kernel.org/qemu-devel/20240212135643.5858-1-joao.m.mart...@oracle.com/
[3] 
https://lore.kernel.org/qemu-devel/20240117080414.316890-1-eric.au...@redhat.com/

Thanks
Zhenzhong

Changelog:
v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (1):
  hw/pci: Introduce pci_device_set/unset_iommu_device()

Zhenzhong Duan (10):
  Introduce a common abstract struct HostIOMMUDevice
  backends/iommufd: Introduce IOMMUFDDevice
  vfio: Introduce IOMMULegacyDevice
  vfio: Add HostIOMMUDevice handle into VFIODevice
  vfio: Introduce host_iommu_device_create callback
  vfio/container: Implement host_iommu_device_create callback in legacy
mode
  vfio/iommufd: Implement host_iommu_device_create callback in iommufd
mode
  vfio/pci: Allocate and initialize HostIOMMUDevice after attachment
  vfio: Pass HostIOMMUDevice to vIOMMU
  backends/iommufd: Introduce helper function iommufd_device_get_info()

 include/hw/pci/pci.h  | 38 +++-
 include/hw/vfio/vfio-common.h |  8 
 include/hw/vfio/vfio-container-base.h |  1 +
 include/sysemu/host_iommu_device.h| 22 ++
 include/sysemu/iommufd.h  | 19 
 backends/iommufd.c| 32 +-
 hw/pci/pci.c  | 62 +--
 hw/vfio/common.c  |  8 
 hw/vfio/container.c   |  9 
 hw/vfio/iommufd.c | 10 +
 hw/vfio/pci.c | 24 ---
 11 files changed, 223 insertions(+), 10 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h

-- 
2.34.1

[PATCH v1 06/11] vfio/container: Implement host_iommu_device_create callback in legacy mode

2024-02-27 Thread Zhenzhong Duan

This callback will be used to initialize base and public elements in
IOMMULegacyDevice.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index bd25b9fbad..2e8ff32284 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1120,6 +1120,14 @@ out_single:
 return ret;
 }
 
+static void vfio_legacy_host_iommu_device_create(VFIODevice *vbasedev)
+{
+vbasedev->base_hdev = g_malloc0(sizeof(IOMMULegacyDevice));
+
+host_iommu_base_device_init(vbasedev->base_hdev, HID_LEGACY,
+sizeof(IOMMULegacyDevice));
+}
+
 static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -1132,6 +1140,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
 vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
+vioc->host_iommu_device_create = vfio_legacy_host_iommu_device_create;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1

[PATCH v1 01/11] Introduce a common abstract struct HostIOMMUDevice

2024-02-27 Thread Zhenzhong Duan

HostIOMMUDevice will be inherited by two sub classes,
legacy and iommufd currently.

Introduce a helper function host_iommu_base_device_init to initialize it.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 22 ++
 1 file changed, 22 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..fe80ab25fb
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,22 @@
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+typedef enum HostIOMMUDevice_Type {
+HID_LEGACY,
+HID_IOMMUFD,
+HID_MAX,
+} HostIOMMUDevice_Type;
+
+typedef struct HostIOMMUDevice {
+HostIOMMUDevice_Type type;
+size_t size;
+} HostIOMMUDevice;
+
+static inline void host_iommu_base_device_init(HostIOMMUDevice *dev,
+   HostIOMMUDevice_Type type,
+   size_t size)
+{
+dev->type = type;
+dev->size = size;
+}
+#endif
-- 
2.34.1

[PATCH v1 02/11] backends/iommufd: Introduce IOMMUFDDevice

2024-02-27 Thread Zhenzhong Duan

IOMMUFDDevice represents a device in iommufd and can be used as
a communication interface between devices (i.e., VFIO, VDPA) and
vIOMMU.

Currently it includes only public iommufd handle and device id
which could be used by vIOMMU to get hw IOMMU information.

There will also be some elements in private field in future,
i.e., capability bits for dirty tracking; when nested translation
is supported in future, vIOMMU is going to have more iommufd related
operations like allocate hwpt for a device, attach/detach hwpt, etc.
So IOMMUFDDevice will be further extended with those needs.

IOMMUFDDevice is deliberately not a QOM object because we don't want
it to be visible from the user interface.

Introduce a helper iommufd_device_init to initialize IOMMUFDDevice.

Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 15 +++
 backends/iommufd.c   |  9 +
 2 files changed, 24 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..d509ff88ef 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,18 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+
+/* Abstraction of host IOMMUFD device */
+typedef struct IOMMUFDDevice {
+/* private: */
+HostIOMMUDevice base;
+
+/* public: */
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+} IOMMUFDDevice;
+
+void iommufd_device_init(IOMMUFDDevice *idev,
+ IOMMUFDBackend *iommufd, int devid);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 1ef683c7b0..6d280e4aea 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -231,3 +231,12 @@ static void register_types(void)
 }
 
 type_init(register_types);
+
+void iommufd_device_init(IOMMUFDDevice *idev,
+ IOMMUFDBackend *iommufd, int devid)
+{
+host_iommu_base_device_init(&idev->base, HID_IOMMUFD,
+sizeof(IOMMUFDDevice));
+idev->iommufd = iommufd;
+idev->devid = devid;
+}
-- 
2.34.1

Re: [RFC PATCH 0/5] hw/i386/q35: Decouple virtual SMI# lines and wire them to ICH9 chipset

2024-02-27 Thread Laszlo Ersek

Hi Phil,

On 2/26/24 17:49, Philippe Mathieu-Daudé wrote:
> Hi,
>
> This is an experimental series to reduce calls to the
> cpu_interrupt() API from generic HW/. I'm trying to use
> the ICH9 chipset from a non-x86 machine. Without this
> experiment, we can not because cpu_interrupt() is target
> specific. Here the interrupt is decoupled using the QDev
> GPIO API. Even if the SMI# line is left unconnected, the
> device is still usable by a guest.
>
> Based-on: <20240226111416.39217-1-phi...@linaro.org>
>
> Philippe Mathieu-Daudé (5):
>   target/i386/cpu: Expose SMI# IRQ line via QDev
>   hw/i386/piix: Set CPU SMI# interrupt using QDev GPIO API
>   hw/ahci/ich9_tco: Set CPU SMI# interrupt using QDev GPIO API
>   hw/i386/q35: Wire virtual SMI# lines to ICH9 chipset
>   hw/isa: Build ich9_lpc.c once
>
>  include/hw/acpi/ich9.h|  1 +
>  include/hw/acpi/ich9_tco.h|  4 ++--
>  include/hw/i386/pc.h  |  2 --
>  include/hw/isa/ich9_lpc.h | 12 
>  include/hw/southbridge/ich9.h |  1 +
>  target/i386/cpu-internal.h|  1 +
>  hw/acpi/ich9.c|  3 ++-
>  hw/acpi/ich9_tco.c| 13 ++---
>  hw/i386/pc.c  |  9 -
>  hw/i386/pc_piix.c |  4 ++--
>  hw/i386/pc_q35.c  | 26 ++
>  hw/isa/ich9_lpc.c | 15 ---
>  hw/southbridge/ich9.c |  1 +
>  target/i386/cpu-sysemu.c  | 11 +++
>  target/i386/cpu.c |  2 ++
>  hw/isa/meson.build|  3 +--
>  16 files changed, 76 insertions(+), 32 deletions(-)
>

This series is over my head for a review, so the best I could offer
would be to test it.

However, even testing it seems like a challenge. First, I've found that,
when building QEMU at dccbaf0cc0f1, my usual libvirt guests don't start
-- I needed to search the web for the error message, and then apply the
revert series

  [PATCH 0/2] Revert "hw/i386/pc: Confine system flash handling to pc_sysfw"
  https://patchew.org/QEMU/20240226215909.30884-1-shen...@gmail.com/

With that, I managed to establish a "baseline" (test some OVMF SMM
stuff, such as UEFI variable services, ACPI S3 suspend/resume, VCPU
hotplug/hot-unplug).

Then I wanted to apply this series (on top of those reverts on top of
dccbaf0cc0f1). It doesn't apply.

Then I noticed you mentioned the dependency on:

  [PATCH v2 00/15] hw/southbridge: Extract ICH9 QOM container model
  https://patchew.org/QEMU/20240226111416.39217-1-phi...@linaro.org/

That only seems to make things more complicated:

- patchew says "Failed in applying to current master"

- in the blurb, you mention "Rebased on top of Bernhard patches";
however, the above reverts appear to undo some of those patches
precisely, so I'm unsure how stable that foundation should be
considered.

I'd prefer waiting until all these patches stabilized a bit, and the
foundation all went upstream, and then I'd have to apply (a new version
of) this particular series only, on the then-master branch, for testing.

Laszlo

Re: [PATCH] migration: Don't serialize migration while can't switchover

2024-02-27 Thread Peter Xu

On Wed, Feb 28, 2024 at 02:00:26AM +0200, Avihai Horon wrote:
> 
> On 27/02/2024 9:41, Peter Xu wrote:
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Thu, Feb 22, 2024 at 05:56:27PM +0200, Avihai Horon wrote:
> > > Currently, migration code serializes device data sending during pre-copy
> > > iterative phase. As noted in the code comment, this is done to prevent
> > > faster changing device from sending its data over and over.
> > Frankly speaking I don't understand the rational behind 90697be889 ("live
> > migration: Serialize vmstate saving in stage 2").  I don't even think I
> > noticed this logic before even if I worked on migration for a few years...
> > 
> > I was thinking all devices should always get its chance to run for some
> > period during iterations.  Do you know the reasoning behind?  And I must
> > confess I also know little on block migration, which seems to be relevant
> > to this change.  Anyway, I also copy Jan just in case he'll be able to chim
> > in.
> 
> I am not 100% sure either, but I can guess:
> This commit is pretty old (dates to 2009), so maybe back then the only
> iterative devices were block devices and RAM.
> Block devices didn't change as fast as RAM (and were probably much bigger
> than RAM), so it made sense to send them first and only then send RAM, which
> changed constantly.

Makes sense.  For some reason I read it the other way round previously.

> 
> > 
> > If there is a fast changing device, even if we don't proceed with other
> > device iterators and we stick with the current one, assuming it can finally
> > finish dumping all data, but then we'll proceed with other devices and the
> > fast changing device can again accumulate dirty information?
> 
> I guess this logic only makes sense for the case where we only have block
> devices and a RAM device, because the block devices wouldn't change that
> much?
> 
> > 
> > > However, with switchover-ack capability enabled, this behavior can be
> > > problematic and may prevent migration from converging. The problem lies
> > > in the fact that an earlier device may never finish sending its data and
> > > thus block other devices from sending theirs.
> > Yes, this is a problem.
> > 
> > > This bug was observed in several VFIO migration scenarios where some
> > > workload on the VM prevented RAM from ever reaching a hard zero, not
> > > allowing VFIO initial pre-copy data to be sent, and thus destination
> > > could not ack switchover. Note that the same scenario, but without
> > > switchover-ack, would converge.
> > > 
> > > Fix it by not serializing device data sending during pre-copy iterative
> > > phase if switchover was not acked yet.
> > I am still not fully convinced that it's even legal that one device can
> > consume all iterator's bandwidth, ignoring the rest..  Though again it's
> > not about this patch, but about commit 90697be889.
> 
> Yes, I agree. As I wrote above, maybe this was valid back then when the only
> iterative devices were block and RAM.
> 
> > 
> > I'm thinking whether we should allow each device to have its own portion of
> > chance to push data for each call to qemu_savevm_state_iterate(),
> > irrelevant of vfio's switchover-ack capability.
> 
> I wasn't sure for 100% why we have this logic in the first place, so I wrote
> my patch as little invasive as possible, keeping migration behavior as is
> (except for switchover-ack).
> But I tend to agree with you for three reasons:
> 
> 1. I think block migration is deprecated (see commits 66db46ca83b8,
> 40101f320d6b and 8846b5bfca76).
> Instead, users are instructed to use blockdev-mirror and co. [1]. If I'm not
> mistaken, this operates using a different infrastructure than migration.
> So this logic is not relevant anymore.
> 
> 2. As you pointed out earlier, the fast changing device can accumulate dirty
> data over and over. VFIO devices come after RAM, so this logic doesn't
> achieve its goal in this case (we may sync fast changing RAM over and over).
> 
> 3. My fix in this patch won't solve a similar problem that could happen,
> where a VFIO device with a lot of pre-copy data (not necessarily initial
> data) may never be able to send it, thus not realizing the full potential of
> pre-copy for VFIO.
> (I personally have not encountered this problem yet, but maybe it can happen
> with a vGPU).

Thanks for a summary.

> 
> 
> If you agree, I can send a v2 that simply removes this logic and gives every
> device an equal chance to send its data (like Joao showed) with some
> explanation why we do it.

Let's see whether others have an opinion, but to me I think we can give it
a shot.  In that case we can "break" in the previous "ret < 0" check
already.

One more thing to mention is then I think we need to calculate the case of
"all iterators returned 1" (aka, "all completes") scenario.  With the old
check it is guaranteed if the loop iterates over all iterators then all
iterators have completed.  Now we allow ret==0 to

RE: [PATCH rfcv2 18/18] intel_iommu: Block migration if cap is updated

2024-02-27 Thread Duan, Zhenzhong



>-Original Message-
>From: Joao Martins 
>Subject: Re: [PATCH rfcv2 18/18] intel_iommu: Block migration if cap is
>updated
>
>On 27/02/2024 02:41, Duan, Zhenzhong wrote:
>>
>>
>>> -Original Message-
>>> From: Joao Martins 
>>> Subject: Re: [PATCH rfcv2 18/18] intel_iommu: Block migration if cap is
>>> updated
>>>
>>> On 01/02/2024 07:28, Zhenzhong Duan wrote:
 When there is VFIO device and vIOMMU cap/ecap is updated based on
>>> host
 IOMMU cap/ecap, migration should be blocked.

 Signed-off-by: Zhenzhong Duan 
>>>
>>> Is this really needed considering migration with vIOMMU is already
>blocked
>>> anyways?
>>
>> VFIO device can be hot unplugged, then blocker due to vIOMMU is
>removed,
>> but we still need a blocker for cap/ecap update.
>>
>
>Right which then the blocker gets re-added after you add one VFIO device.
>The
>commit message refers xplicitly VFIO device, why would you care about
>blocking
>migration on vIOMMU without vfio devices present? Maybe there's another
>reason
>but that the commit messages doesn't cover? like guest MGAW being bigger
>than
>host MGAW or something like that?

If qemu starts with cold plugged vfio device, that vfio device may update 
cap/ecap.
Even if that vfio device is unplugged at runtime, the changed cap/ecap is kept.
In this case source and dest will have incompatible cap/ecap config.
So I block migration if there is cap/ecap update on source side.

This patch deals with the case where a cold-plugged vfio device is
unplugged at runtime and migration happens afterwards.

Thanks
Zhenzhong

[PATCH] hw/misc: zynq_slcr: set SLC_RST bit in REBOOT_STATUS register

2024-02-27 Thread Gregory Anders

When the CPU is reset using PSS_RST_CTRL in the SLCR, bit 19 in
REBOOT_STATUS should be set.

Refer to page 1602 of the Xilinx Zynq 7000 Technical Reference Manual.

Signed-off-by: Gregory Anders 
---
 hw/misc/zynq_slcr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/misc/zynq_slcr.c b/hw/misc/zynq_slcr.c
index d2ac2e77f2..a8f1792bf6 100644
--- a/hw/misc/zynq_slcr.c
+++ b/hw/misc/zynq_slcr.c
@@ -120,6 +120,7 @@ REG32(RS_AWDT_CTRL, 0x24c)
 REG32(RST_REASON, 0x250)
 
 REG32(REBOOT_STATUS, 0x258)
+FIELD(REBOOT_STATUS, SLC_RST, 19, 1)
 REG32(BOOT_MODE, 0x25c)
 
 REG32(APU_CTRL, 0x300)
@@ -562,6 +563,7 @@ static void zynq_slcr_write(void *opaque, hwaddr offset,
 switch (offset) {
 case R_PSS_RST_CTRL:
 if (FIELD_EX32(val, PSS_RST_CTRL, SOFT_RST)) {
+s->regs[R_REBOOT_STATUS] |= R_REBOOT_STATUS_SLC_RST_MASK;
 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
 }
 break;
-- 
2.43.2

Re: [PATCH 3/5] hw/isa: Embed TYPE_PORT92 in south bridges used in PC machines

2024-02-27 Thread BALATON Zoltan


On Tue, 27 Feb 2024, Bernhard Beschow wrote:

Am 27. Februar 2024 21:54:19 UTC schrieb BALATON Zoltan :

On Tue, 27 Feb 2024, Bernhard Beschow wrote:

Am 21. Februar 2024 11:53:21 UTC schrieb Mark Cave-Ayland 
:

On 18/02/2024 13:16, Bernhard Beschow wrote:

Port 92 is an integral part of the PIIX and ICH south bridges, so instantiate it
there. The isapc machine now needs to instantiate it explicitly, analogous to
the RTC.

Note that due to migration compatibility, port92 is optional in the south
bridges. It is always instantiated in the isapc machine for simplicity.

Signed-off-by: Bernhard Beschow 
---
  include/hw/i386/pc.h  |  2 +-
  include/hw/southbridge/ich9.h |  4 
  include/hw/southbridge/piix.h |  3 +++
  hw/i386/pc.c  | 18 --
  hw/i386/pc_piix.c |  9 +++--
  hw/i386/pc_q35.c  |  8 +---
  hw/isa/lpc_ich9.c |  9 +
  hw/isa/piix.c |  9 +
  hw/isa/Kconfig|  2 ++
  9 files changed, 52 insertions(+), 12 deletions(-)


I had a look at this (and did a bit of revision around 8042 and A20), and I am 
starting to wonder if the PORT92 device isn't something that belongs to the 
southbridge, but more specifically to the superio chip?


If there is agreement to model real hardware in QEMU, then I think that


I think there's no such agreement and QEMU is more lax about it both for 
historical reasons and to simplify machine models. Indeed, QEMU sometimes 
models non-existing machines (e.g. the mac99 or virt boards) that don't 
correspond to real hardware but allow guest OSes to boot. Even when modelling
real hardware it's often modelled just enough for guests to work and unused
details are omitted for simplicity. It is recommended to follow what real 
hardware does when modelling real hardware but not always required. Although it 
might help both with verifying a device model and to compose machines with 
these models to try to follow the real hardware.


Composing real machines and verifying device models is exactly what I'm 
after. I'm aware that QEMU provides virt machines such as the microvm, 
and from the context I didn't refer to these.


Even without pure virt machines currently a lot of QEMU machines don't 
exactly model real hardware. They may roughly follow real hardware but not 
exactly such as mac99 is a non-existent Mac and the pc machines also use 
some parts that don't exist in real life such as PIIX4-PIIX3 hybrid you've 
been working on resolving. Some of these however are restricted by 
backward compatibility requirements. But you're probably aware of all of that
but this means the argument that real hardware should be followed is not 
enough. At least it should not break backward compatibility too much and 
that's more important than exactly modelling real machine. Also having a 
simple model may be more important than modeling every detail even when 
not used just to follow real hardware.



port 92 belongs into any device model where the hardware has one. All our 
PC-like southbridges (PIIX, ICH, VIA) have port 92. Many FDC37 including 
the FDC37M81x as used in the Malta board have one, too -- where it must first 
be enabled.


So port92 is not a real hardware but a QEMU abstraction or model of some 
functionality found in some machines. Real chips probably implement this in 
different ways so we could either model this in these chips independently the 
same way as real hardware does or use the abstracted model anywhere in our 
machine model. Since this does not exist in real hardware as this abstract 
model it also does not belong anywhere so we are free to put it where it's most 
convenient or simple to do.


As mentioned already, port 92 is an integral part of PIIX, ICH, and VIA 
southbridges.


Mark argued that more specifically it's part of the superio within those 
southbridges. That makes sense, considering this port92 is related to
functionality that was in the keyboard controller before which is part of
the superio. I don't know PC hardware too well but reading about this fast 
gate A20 feature looks like original PC and XT had only a 1 MB address 
space but addresses above 1 MB wrapped to 0 and some software depended on 
that. Then AT added more memory but then it needed a way to control if 
addresses above 1 MB would wrap or access high memory. This was done with 
some free part of the keyboard controller but that was too slow so an 
alternative fast way was added with this port92 device. But then the old 
keyboard controller and this port92 stuff are interacting so may need to 
consider both. Apart from that all of this is not relevant to other 
machines that don't use this functionality.


QEMU decided to model it as a separate QOM object that is now instantiated 
by the machines that use it. This is not real hardware but a QEMU 
implementation detail. What's wrong with that? It seems you just want to 
simplify the pc machine creation and

Re: [PATCH] migration: Don't serialize migration while can't switchover

2024-02-27 Thread Avihai Horon




On 27/02/2024 9:41, Peter Xu wrote:

External email: Use caution opening links or attachments


On Thu, Feb 22, 2024 at 05:56:27PM +0200, Avihai Horon wrote:

Currently, migration code serializes device data sending during pre-copy
iterative phase. As noted in the code comment, this is done to prevent
faster changing device from sending its data over and over.

Frankly speaking I don't understand the rationale behind 90697be889 ("live
migration: Serialize vmstate saving in stage 2").  I don't even think I
noticed this logic before even if I worked on migration for a few years...

I was thinking all devices should always get its chance to run for some
period during iterations.  Do you know the reasoning behind?  And I must
confess I also know little on block migration, which seems to be relevant
to this change.  Anyway, I also copy Jan just in case he'll be able to chime
in.


I am not 100% sure either, but I can guess:
This commit is pretty old (dates to 2009), so maybe back then the only 
iterative devices were block devices and RAM.
Block devices didn't change as fast as RAM (and were probably much 
bigger than RAM), so it made sense to send them first and only then send 
RAM, which changed constantly.




If there is a fast changing device, even if we don't proceed with other
device iterators and we stick with the current one, assuming it can finally
finish dumping all data, but then we'll proceed with other devices and the
fast changing device can again accumulate dirty information?


I guess this logic only makes sense for the case where we only have 
block devices and a RAM device, because the block devices wouldn't 
change that much?





However, with switchover-ack capability enabled, this behavior can be
problematic and may prevent migration from converging. The problem lies
in the fact that an earlier device may never finish sending its data and
thus block other devices from sending theirs.

Yes, this is a problem.


This bug was observed in several VFIO migration scenarios where some
workload on the VM prevented RAM from ever reaching a hard zero, not
allowing VFIO initial pre-copy data to be sent, and thus destination
could not ack switchover. Note that the same scenario, but without
switchover-ack, would converge.

Fix it by not serializing device data sending during pre-copy iterative
phase if switchover was not acked yet.

I am still not fully convinced that it's even legal that one device can
consume all iterator's bandwidth, ignoring the rest..  Though again it's
not about this patch, but about commit 90697be889.


Yes, I agree. As I wrote above, maybe this was valid back then when the 
only iterative devices were block and RAM.




I'm thinking whether we should allow each device to have its own portion of
chance to push data for each call to qemu_savevm_state_iterate(),
irrelevant of vfio's switchover-ack capability.


I wasn't 100% sure why we have this logic in the first place, so I
wrote my patch to be as minimally invasive as possible, keeping migration
behavior as is (except for switchover-ack).

But I tend to agree with you for three reasons:

1. I think block migration is deprecated (see commits 66db46ca83b8, 
40101f320d6b and 8846b5bfca76).
Instead, users are instructed to use blockdev-mirror and co. [1]. If I'm 
not mistaken, this operates using a different infrastructure than migration.

So this logic is not relevant anymore.

2. As you pointed out earlier, the fast changing device can accumulate 
dirty data over and over. VFIO devices come after RAM, so this logic 
doesn't achieve its goal in this case (we may sync fast changing RAM 
over and over).


3. My fix in this patch won't solve a similar problem that could happen, 
where a VFIO device with a lot of pre-copy data (not necessarily initial 
data) may never be able to send it, thus not realizing the full 
potential of pre-copy for VFIO.
(I personally have not encountered this problem yet, but maybe it can 
happen with a vGPU).



If you agree, I can send a v2 that simply removes this logic and gives 
every device an equal chance to send its data (like Joao showed) with 
some explanation why we do it.
We could also give RAM precedence over other devices only during the 
first iteration of sending RAM (i.e., only until first dirty sync), but 
I don't know how much benefit this would give.


[1] https://qemu-project.gitlab.io/qemu/interop/live-block-operations.html




Fixes: 1b4adb10f898 ("migration: Implement switchover ack logic")
Signed-off-by: Avihai Horon 
---
  migration/savevm.h|  2 +-
  migration/migration.c |  4 ++--
  migration/savevm.c| 22 +++---
  3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/migration/savevm.h b/migration/savevm.h
index 74669733dd6..d4a368b522b 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -36,7 +36,7 @@ void qemu_savevm_state_setup(QEMUFile *f);
  bool qemu_savevm_state_guest_unplug_pending(void);
  int

Re: [PATCH v4 22/34] migration/multifd: Prepare multifd sync for fixed-ram migration

2024-02-27 Thread Peter Xu

On Tue, Feb 27, 2024 at 11:00:44AM -0300, Fabiano Rosas wrote:
> I don't really like the interleaving of file and socket logic at
> multifd_recv_sync_main(), but I can live with it.

The idea was to share the "wait" part and the semaphore.  If you don't like
the form of it, an alternative is we can provide three helpers (file_kick,
wait, socket_kick), then:

  if (file) {
file_kick();
wait();
  } else {
wait();
socket_kick();
  }

> 
> Waiting on multifd_recv_state->sem_sync is problematic because if the
> thread has an error, that will hang forever.
> 
> Actually, I don't even see this being handled in _current_ code
> anywhere, we probably have a bug there. I guess we need to add one more
> "post this sem just because" somewhere. multifd_recv_kick_main probably.

Might be because dest qemu is even less of a concern? As if something goes wrong on
dest, then src is probably already failing the migration, then libvirt or
upper layer can directly kill dest qemu (while we can't do that to src).
But yeah we should still fix it at some point.. to make dest qemu quit
gracefully in error cases, and it'll also help more in the future if
multifd will support postcopy, then both src/dst can't be killed.

-- 
Peter Xu

Re: [PATCH 3/5] hw/isa: Embed TYPE_PORT92 in south bridges used in PC machines

2024-02-27 Thread Bernhard Beschow




Am 27. Februar 2024 21:54:19 UTC schrieb BALATON Zoltan :
>On Tue, 27 Feb 2024, Bernhard Beschow wrote:
>> Am 21. Februar 2024 11:53:21 UTC schrieb Mark Cave-Ayland 
>> :
>>> On 18/02/2024 13:16, Bernhard Beschow wrote:
 Port 92 is an integral part of the PIIX and ICH south bridges, so 
 instantiate it
 there. The isapc machine now needs to instantiate it explicitly, 
 analogous to
 the RTC.
 
 Note that due to migration compatibility, port92 is optional in the south
 bridges. It is always instantiated in the isapc machine for simplicity.
 
 Signed-off-by: Bernhard Beschow 
 ---
   include/hw/i386/pc.h  |  2 +-
   include/hw/southbridge/ich9.h |  4 
   include/hw/southbridge/piix.h |  3 +++
   hw/i386/pc.c  | 18 --
   hw/i386/pc_piix.c |  9 +++--
   hw/i386/pc_q35.c  |  8 +---
   hw/isa/lpc_ich9.c |  9 +
   hw/isa/piix.c |  9 +
   hw/isa/Kconfig|  2 ++
   9 files changed, 52 insertions(+), 12 deletions(-)
>>> 
>>> I had a look at this (and did a bit of revision around 8042 and A20), and I 
>>> am starting to wonder if the PORT92 device isn't something that belongs to 
>>> the southbridge, but more specifically to the superio chip?
>> 
>> If there is agreement to model real hardware in QEMU, then I think that
>
>I think there's no such agreement and QEMU is more lax about it both for 
>historical reasons and to simplify machine models. Indeed, QEMU sometimes 
>models non-existing machines (e.g. the mac99 or virt boards) that don't 
>correspond to real hardware but allow guest OSes to boot. Even when modelling
>real hardware it's often modelled just enough for guests to work and unused
>details are omitted for simplicity. It is recommended to follow what real 
>hardware does when modelling real hardware but not always required. Although 
>it might help both with verifying a device model and to compose machines with 
>these models to try to follow the real hardware.

Composing real machines and verifying device models is exactly what I'm after. 
I'm aware that QEMU provides virt machines such as the microvm, and from the 
context I didn't refer to these.

>
>> port 92 belongs into any device model where the hardware has one. All our 
>> PC-like southbridges (PIIX, ICH, VIA) have port 92. Many FDC37 including 
>> the FDC37M81x as used in the Malta board have one, too -- where it must 
>> first be enabled.
>
>So port92 is not a real hardware but a QEMU abstraction or model of some 
>functionality found in some machines. Real chips probably implement this in 
>different ways so we could either model this in these chips independently the 
>same way as real hardware does or use the abstracted model anywhere in our 
>machine model. Since this does not exist in real hardware as this abstract 
>model it also does not belong anywhere so we are free to put it where it's 
>most convenient or simple to do.

As mentioned already, port 92 is an integral part of PIIX, ICH, and VIA 
southbridges. That's why I want to move it there. My goal is to create 
different PC machines in a data-driven manner which model real boards. I want 
to see how low-level guests interact with the hardware, including e.g. how they 
set up the memory map.

>
>>> A couple of thoughts as to why I came to this conclusion: firstly the 
>>> superio chip is generally considered to be a single integrated 
>>> implementation of legacy IO devices, so this feels like a natural home for 
>>> the PORT92 device.
>> 
>>> Secondly the value of the "has-port92" property is controlled by 
>>> pcms->i8042_enabled, and this value is already passed into functions such 
>>> as pc_superio_init() for example.
>> 
>> Right. There, it also controls the presence of port 92. If we move port 92
>> into the southbridges, we have to respect this command line switch there to 
>> preserve backward compatibility.
>> 
>> I wonder what `-M i8042` is supposed to do. If it is for modeling a 
>> stripped-down x86 system, why not use the microvm instead? How is it 
>> possible to omit an essential piece of hardware needed to boot x86 systems? 
>> Don't we need at least either one (i8042 or port 92)?
>
>Try git log -p 4ccd5fe22fe (found it via git blame and see what added that 
>property).

Alright, the intention was to omit the PS/2 controller in favor of USB. That 
doesn't mean that port 92 needs to be affected. I see an opportunity here to 
reduce the scope of the i8042 option which may help with data-driven machine 
creation in the future.

>
>>> I think this would also help reduce the changes required for the individual 
>>> machines, however the devil is always in the details particularly when 
>>> migration is involved.
>> 
>> As stated above, this series is more about modeling real hardware, in the 
>> hope that this will lend itself for

[PATCH] mips: do not list individual devices from configs/

2024-02-27 Thread Paolo Bonzini

Add new "select" and "imply" directives if needed.  The resulting
config-devices.mak files are the same as before.

Note that builds without default devices will become much smaller
than before; for this reason, it's necessary to use only the bare
minimum of USB functions, in particular only those that are inlined.
For this reason, usb_bus_find() must be removed, as it only exists
if CONFIG_USB is selected by a host controller.

Signed-off-by: Paolo Bonzini 
---
 configs/devices/mips-softmmu/common.mak  | 28 +++-
 configs/devices/mips64el-softmmu/default.mak |  3 ---
 .gitlab-ci.d/buildtest.yml   |  2 +-
 hw/display/Kconfig   |  2 +-
 hw/mips/Kconfig  | 20 +-
 5 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/configs/devices/mips-softmmu/common.mak 
b/configs/devices/mips-softmmu/common.mak
index 1a853841b27..416a5d353e8 100644
--- a/configs/devices/mips-softmmu/common.mak
+++ b/configs/devices/mips-softmmu/common.mak
@@ -1,28 +1,8 @@
 # Common mips*-softmmu CONFIG defines
 
-CONFIG_ISA_BUS=y
-CONFIG_PCI=y
-CONFIG_PCI_DEVICES=y
-CONFIG_VGA_ISA=y
-CONFIG_VGA_MMIO=y
-CONFIG_VGA_CIRRUS=y
-CONFIG_VMWARE_VGA=y
-CONFIG_SERIAL=y
-CONFIG_SERIAL_ISA=y
-CONFIG_PARALLEL=y
-CONFIG_I8254=y
-CONFIG_PCSPK=y
-CONFIG_PCKBD=y
-CONFIG_FDC=y
-CONFIG_I8257=y
-CONFIG_IDE_ISA=y
-CONFIG_PFLASH_CFI01=y
-CONFIG_I8259=y
-CONFIG_MC146818RTC=y
-CONFIG_MIPS_CPS=y
-CONFIG_MIPS_ITU=y
+# Uncomment the following lines to disable these optional devices:
+# CONFIG_PCI_DEVICES=n
+# CONFIG_TEST_DEVICES=n
+
 CONFIG_MALTA=y
-CONFIG_PCNET_PCI=y
 CONFIG_MIPSSIM=y
-CONFIG_SMBUS_EEPROM=y
-CONFIG_TEST_DEVICES=y
diff --git a/configs/devices/mips64el-softmmu/default.mak 
b/configs/devices/mips64el-softmmu/default.mak
index d5188f7ea58..88a37cf27f1 100644
--- a/configs/devices/mips64el-softmmu/default.mak
+++ b/configs/devices/mips64el-softmmu/default.mak
@@ -3,8 +3,5 @@
 include ../mips-softmmu/common.mak
 CONFIG_FULOONG=y
 CONFIG_LOONGSON3V=y
-CONFIG_ATI_VGA=y
-CONFIG_RTL8139_PCI=y
 CONFIG_JAZZ=y
-CONFIG_VT82C686=y
 CONFIG_MIPS_BOSTON=y
diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml
index a1c030337b1..901265af95d 100644
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -659,7 +659,7 @@ build-without-defaults:
   --disable-pie
   --disable-qom-cast-debug
   --disable-strip
-TARGETS: avr-softmmu mips64-softmmu s390x-softmmu sh4-softmmu
+TARGETS: avr-softmmu s390x-softmmu sh4-softmmu
   sparc64-softmmu hexagon-linux-user i386-linux-user s390x-linux-user
 MAKE_CHECK_ARGS: check
 
diff --git a/hw/display/Kconfig b/hw/display/Kconfig
index 07acb37dc66..234c7de027c 100644
--- a/hw/display/Kconfig
+++ b/hw/display/Kconfig
@@ -55,7 +55,7 @@ config VGA_MMIO
 
 config VMWARE_VGA
 bool
-default y if PCI_DEVICES && PC_PCI
+default y if PCI_DEVICES && (PC_PCI || MIPS)
 depends on PCI
 select VGA
 
diff --git a/hw/mips/Kconfig b/hw/mips/Kconfig
index e57db4f6412..5c83ef49cf6 100644
--- a/hw/mips/Kconfig
+++ b/hw/mips/Kconfig
@@ -1,8 +1,15 @@
 config MALTA
 bool
+imply PCNET_PCI
+imply PCI_DEVICES
+imply TEST_DEVICES
 select FDC37M81X
 select GT64120
+select MIPS_CPS
 select PIIX
+select PFLASH_CFI01
+select SERIAL
+select SMBUS_EEPROM
 
 config MIPSSIM
 bool
@@ -31,17 +38,26 @@ config JAZZ
 
 config FULOONG
 bool
+imply PCI_DEVICES
+imply TEST_DEVICES
+imply ATI_VGA
+imply RTL8139_PCI
 select PCI_BONITO
+select SMBUS_EEPROM
 select VT82C686
 
 config LOONGSON3V
 bool
+imply PCI_DEVICES
+imply TEST_DEVICES
+imply VIRTIO_PCI
+imply VIRTIO_NET
 imply VIRTIO_VGA
 imply QXL if SPICE
+imply USB_OHCI_PCI
 select SERIAL
 select GOLDFISH_RTC
 select LOONGSON_LIOINTC
-select PCI_DEVICES
 select PCI_EXPRESS_GENERIC_BRIDGE
 select MSI_NONBROKEN
 select FW_CFG_MIPS
@@ -53,6 +69,8 @@ config MIPS_CPS
 
 config MIPS_BOSTON
 bool
+imply PCI_DEVICES
+imply TEST_DEVICES
 select FITLOADER
 select MIPS_CPS
 select PCI_EXPRESS_XILINX
-- 
2.43.2

[QEMU][PATCH v3 0/7] Xen: support grant mappings.

2024-02-27 Thread Vikram Garhwal

Hi,
This patch series adds support for grant mappings as a pseudo RAM region for Xen.

Enabling grant mappings patches(first 6) are written by Juergen in 2021.

QEMU Virtio device provides emulated backends for Virtio frontend devices
in Xen.
Please set "iommu_platform=on" option when invoking QEMU. As this will set
VIRTIO_F_ACCESS_PLATFORM feature which will be used by virtio frontend in Xen
to know whether backend supports grants or not.

Changelog:
v2->v3:
Drop patch 1/7. This was done because device unplug is an x86-only case.
Add missing qemu_mutex_unlock() before return.
v1->v2:
Split patch 2/7 to keep physmem.c changes in a separate patch.
In patch "xen: add map and unmap callbacks for grant" add check for total
allowed grant < XEN_MAX_VIRTIO_GRANTS.
Fix formatting issues and re-based with master latest.

Regards,
Vikram

Juergen Gross (5):
  xen: add pseudo RAM region for grant mappings
  softmmu: let qemu_map_ram_ptr() use qemu_ram_ptr_length()
  xen: let xen_ram_addr_from_mapcache() return -1 in case of not found
entry
  memory: add MemoryRegion map and unmap callbacks
  xen: add map and unmap callbacks for grant region

Vikram Garhwal (2):
  softmmu: physmem: Split ram_block_add()
  hw: arm: Add grant mapping.

 hw/arm/xen_arm.c|   3 +
 hw/i386/xen/xen-hvm.c   |   3 +
 hw/xen/xen-hvm-common.c |   4 +-
 hw/xen/xen-mapcache.c   | 214 ++--
 include/exec/memory.h   |  21 
 include/exec/ram_addr.h |   1 +
 include/hw/xen/xen-hvm-common.h |   2 +
 include/hw/xen/xen_pvdev.h  |   3 +
 include/sysemu/xen-mapcache.h   |   3 +
 system/physmem.c| 179 +++---
 10 files changed, 351 insertions(+), 82 deletions(-)

-- 
2.17.1

[QEMU][PATCH v3 7/7] hw: arm: Add grant mapping.

2024-02-27 Thread Vikram Garhwal

Enable grant ram mapping support for Xenpvh machine on ARM.

Signed-off-by: Vikram Garhwal 
Reviewed-by: Stefano Stabellini 
---
 hw/arm/xen_arm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
index 32776d94df..b5993ef2a6 100644
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -125,6 +125,9 @@ static void xen_init_ram(MachineState *machine)
  GUEST_RAM1_BASE, ram_size[1]);
 memory_region_add_subregion(sysmem, GUEST_RAM1_BASE, &ram_hi);
 }
+
+DPRINTF("init grant ram mapping for XEN\n");
+ram_grants = *xen_init_grant_ram();
 }
 
 void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
-- 
2.17.1

[QEMU][PATCH v3 2/7] xen: add pseudo RAM region for grant mappings

2024-02-27 Thread Vikram Garhwal

From: Juergen Gross 

Add a memory region which can be used to automatically map granted
memory. It starts at 0x8000000000000000ULL in order to be able to
distinguish it from normal RAM.

For this reason the xen.ram memory region is expanded, which has no
further impact as it is used just as a container of the real RAM
regions and now the grant region.

Signed-off-by: Juergen Gross 
Signed-off-by: Vikram Garhwal 
Reviewed-by: Stefano Stabellini 
---
 hw/i386/xen/xen-hvm.c   |  3 +++
 hw/xen/xen-hvm-common.c |  4 ++--
 hw/xen/xen-mapcache.c   | 27 +++
 include/hw/xen/xen-hvm-common.h |  2 ++
 include/hw/xen/xen_pvdev.h  |  3 +++
 include/sysemu/xen-mapcache.h   |  3 +++
 6 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index f42621e674..67a8a6 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -172,6 +172,9 @@ static void xen_ram_init(PCMachineState *pcms,
  x86ms->above_4g_mem_size);
 memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi);
 }
+
+/* Add grant mappings as a pseudo RAM region. */
+ram_grants = *xen_init_grant_ram();
 }
 
 static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size)
diff --git a/hw/xen/xen-hvm-common.c b/hw/xen/xen-hvm-common.c
index baa1adb9f2..6e53d3bf81 100644
--- a/hw/xen/xen-hvm-common.c
+++ b/hw/xen/xen-hvm-common.c
@@ -9,7 +9,7 @@
 #include "hw/boards.h"
 #include "hw/xen/arch_hvm.h"
 
-MemoryRegion ram_memory;
+MemoryRegion ram_memory, ram_grants;
 
 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
Error **errp)
@@ -26,7 +26,7 @@ void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, 
MemoryRegion *mr,
 return;
 }
 
-if (mr == &ram_memory) {
+if (mr == &ram_memory || mr == &ram_grants) {
 return;
 }
 
diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index 4f956d048e..dfc412d138 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -14,7 +14,9 @@
 
 #include 
 
+#include "hw/xen/xen-hvm-common.h"
 #include "hw/xen/xen_native.h"
+#include "hw/xen/xen_pvdev.h"
 #include "qemu/bitmap.h"
 
 #include "sysemu/runstate.h"
@@ -590,3 +592,28 @@ uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
 mapcache_unlock();
 return p;
 }
+
+MemoryRegion *xen_init_grant_ram(void)
+{
+RAMBlock *block;
+
+memory_region_init(&ram_grants, NULL, "xen.grants",
+   XEN_MAX_VIRTIO_GRANTS * XC_PAGE_SIZE);
+block = g_malloc0(sizeof(*block));
+block->mr = &ram_grants;
+block->used_length = XEN_MAX_VIRTIO_GRANTS * XC_PAGE_SIZE;
+block->max_length = XEN_MAX_VIRTIO_GRANTS * XC_PAGE_SIZE;
+block->fd = -1;
+block->page_size = XC_PAGE_SIZE;
+block->host = (void *)XEN_GRANT_ADDR_OFF;
+block->offset = XEN_GRANT_ADDR_OFF;
+block->flags = RAM_PREALLOC;
+ram_grants.ram_block = block;
+ram_grants.ram = true;
+ram_grants.terminates = true;
+ram_block_add_list(block);
+memory_region_add_subregion(get_system_memory(), XEN_GRANT_ADDR_OFF,
+&ram_grants);
+
+return &ram_grants;
+}
diff --git a/include/hw/xen/xen-hvm-common.h b/include/hw/xen/xen-hvm-common.h
index 4b1d728f35..8deeff6bcf 100644
--- a/include/hw/xen/xen-hvm-common.h
+++ b/include/hw/xen/xen-hvm-common.h
@@ -16,6 +16,8 @@
 #include 
 
 extern MemoryRegion ram_memory;
+
+extern MemoryRegion ram_grants;
 extern MemoryListener xen_io_listener;
 extern DeviceListener xen_device_listener;
 
diff --git a/include/hw/xen/xen_pvdev.h b/include/hw/xen/xen_pvdev.h
index ddad4b9f36..0f1b5edfa9 100644
--- a/include/hw/xen/xen_pvdev.h
+++ b/include/hw/xen/xen_pvdev.h
@@ -80,4 +80,7 @@ int xen_pv_send_notify(struct XenLegacyDevice *xendev);
 void xen_pv_printf(struct XenLegacyDevice *xendev, int msg_level,
const char *fmt, ...)  G_GNUC_PRINTF(3, 4);
 
+#define XEN_GRANT_ADDR_OFF    0x8000000000000000ULL
+#define XEN_MAX_VIRTIO_GRANTS 65536
+
 #endif /* QEMU_HW_XEN_PVDEV_H */
diff --git a/include/sysemu/xen-mapcache.h b/include/sysemu/xen-mapcache.h
index c8e7c2f6cf..f4bedb1c11 100644
--- a/include/sysemu/xen-mapcache.h
+++ b/include/sysemu/xen-mapcache.h
@@ -10,6 +10,7 @@
 #define XEN_MAPCACHE_H
 
 #include "exec/cpu-common.h"
+#include "exec/ram_addr.h"
 
 typedef hwaddr (*phys_offset_to_gaddr_t)(hwaddr phys_offset,
  ram_addr_t size);
@@ -25,6 +26,8 @@ void xen_invalidate_map_cache(void);
 uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
  hwaddr new_phys_addr,
  hwaddr size);
+MemoryRegion *xen_init_grant_ram(void);
+
 #else
 
 static inline void xen_map_cache_init(phys_offset_to_gaddr_t f,
-- 
2.17.1

[QEMU][PATCH v3 4/7] xen: let xen_ram_addr_from_mapcache() return -1 in case of not found entry

2024-02-27 Thread Vikram Garhwal

From: Juergen Gross 

Today xen_ram_addr_from_mapcache() will either abort() or return 0 in
case it can't find a matching entry for a pointer value. Both cases
are bad, so change that to return an invalid address instead.

Signed-off-by: Juergen Gross 
Reviewed-by: Stefano Stabellini 
---
 hw/xen/xen-mapcache.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index dfc412d138..179b7e95b2 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -396,13 +396,8 @@ ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
 }
 }
 if (!found) {
-trace_xen_ram_addr_from_mapcache_not_found(ptr);
-QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
-trace_xen_ram_addr_from_mapcache_found(reventry->paddr_index,
-   reventry->vaddr_req);
-}
-abort();
-return 0;
+mapcache_unlock();
+return RAM_ADDR_INVALID;
 }
 
 entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
@@ -411,7 +406,7 @@ ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
 }
 if (!entry) {
 trace_xen_ram_addr_from_mapcache_not_in_cache(ptr);
-raddr = 0;
+raddr = RAM_ADDR_INVALID;
 } else {
 raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
  ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
-- 
2.17.1

[QEMU][PATCH v3 1/7] softmmu: physmem: Split ram_block_add()

2024-02-27 Thread Vikram Garhwal

Extract ram block list update to a new function ram_block_add_list(). This is
done to support grant mappings which adds a memory region for granted memory and
updates the ram_block list.

Signed-off-by: Juergen Gross 
Signed-off-by: Vikram Garhwal 
Reviewed-by: Stefano Stabellini 
---
 include/exec/ram_addr.h |  1 +
 system/physmem.c| 62 ++---
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 90676093f5..c0b5f9a7d0 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -139,6 +139,7 @@ void qemu_ram_free(RAMBlock *block);
 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp);
 
 void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length);
+void ram_block_add_list(RAMBlock *new_block);
 
 /* Clear whole block of mem */
 static inline void qemu_ram_block_writeback(RAMBlock *block)
diff --git a/system/physmem.c b/system/physmem.c
index e3ebc19eef..84f3022099 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1803,12 +1803,47 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
 }
 }
 
+static void ram_block_add_list_locked(RAMBlock *new_block)
+ {
+ RAMBlock *block;
+ RAMBlock *last_block = NULL;
+
+/*
+ * Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
+ * QLIST (which has an RCU-friendly variant) does not have insertion at
+ * tail, so save the last element in last_block.
+ */
+RAMBLOCK_FOREACH(block) {
+last_block = block;
+if (block->max_length < new_block->max_length) {
+break;
+}
+}
+if (block) {
+QLIST_INSERT_BEFORE_RCU(block, new_block, next);
+} else if (last_block) {
+QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
+} else { /* list is empty */
+QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
+}
+ram_list.mru_block = NULL;
+
+/* Write list before version */
+smp_wmb();
+ram_list.version++;
+}
+
+void ram_block_add_list(RAMBlock *new_block)
+{
+qemu_mutex_lock_ramlist();
+ram_block_add_list_locked(new_block);
+qemu_mutex_unlock_ramlist();
+}
+
 static void ram_block_add(RAMBlock *new_block, Error **errp)
 {
 const bool noreserve = qemu_ram_is_noreserve(new_block);
 const bool shared = qemu_ram_is_shared(new_block);
-RAMBlock *block;
-RAMBlock *last_block = NULL;
 ram_addr_t old_ram_size, new_ram_size;
 Error *err = NULL;
 
@@ -1846,28 +1881,9 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp)
 if (new_ram_size > old_ram_size) {
 dirty_memory_extend(old_ram_size, new_ram_size);
 }
-/* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
- * QLIST (which has an RCU-friendly variant) does not have insertion at
- * tail, so save the last element in last_block.
- */
-RAMBLOCK_FOREACH(block) {
-last_block = block;
-if (block->max_length < new_block->max_length) {
-break;
-}
-}
-if (block) {
-QLIST_INSERT_BEFORE_RCU(block, new_block, next);
-} else if (last_block) {
-QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
-} else { /* list is empty */
-QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
-}
-ram_list.mru_block = NULL;
 
-/* Write list before version */
-smp_wmb();
-ram_list.version++;
+ram_block_add_list_locked(new_block);
+
 qemu_mutex_unlock_ramlist();
 
 cpu_physical_memory_set_dirty_range(new_block->offset,
-- 
2.17.1

[QEMU][PATCH v3 3/7] softmmu: let qemu_map_ram_ptr() use qemu_ram_ptr_length()

2024-02-27 Thread Vikram Garhwal

From: Juergen Gross 

qemu_map_ram_ptr() and qemu_ram_ptr_length() share quite some code, so
modify qemu_ram_ptr_length() a little bit and use it for
qemu_map_ram_ptr(), too.

Signed-off-by: Juergen Gross 
Signed-off-by: Vikram Garhwal 
Reviewed-by: Stefano Stabellini 
---
 system/physmem.c | 56 
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/system/physmem.c b/system/physmem.c
index 84f3022099..949dcb20ba 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2163,43 +2163,17 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
 }
 #endif /* !_WIN32 */
 
-/* Return a host pointer to ram allocated with qemu_ram_alloc.
- * This should not be used for general purpose DMA.  Use address_space_map
- * or address_space_rw instead. For local memory (e.g. video ram) that the
- * device owns, use memory_region_get_ram_ptr.
- *
- * Called within RCU critical section.
- */
-void *qemu_map_ram_ptr(RAMBlock *block, ram_addr_t addr)
-{
-if (block == NULL) {
-block = qemu_get_ram_block(addr);
-addr -= block->offset;
-}
-
-if (xen_enabled() && block->host == NULL) {
-/* We need to check if the requested address is in the RAM
- * because we don't want to map the entire memory in QEMU.
- * In that case just map until the end of the page.
- */
-if (block->offset == 0) {
-return xen_map_cache(addr, 0, 0, false);
-}
-
-block->host = xen_map_cache(block->offset, block->max_length, 1, 
false);
-}
-return ramblock_ptr(block, addr);
-}
-
-/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
- * but takes a size argument.
+/*
+ * Return a host pointer to guest's ram.
  *
  * Called within RCU critical section.
  */
 static void *qemu_ram_ptr_length(RAMBlock *block, ram_addr_t addr,
  hwaddr *size, bool lock)
 {
-if (*size == 0) {
+hwaddr len = 0;
+
+if (size && *size == 0) {
 return NULL;
 }
 
@@ -2207,7 +2181,10 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
 block = qemu_get_ram_block(addr);
 addr -= block->offset;
 }
-*size = MIN(*size, block->max_length - addr);
+if (size) {
+*size = MIN(*size, block->max_length - addr);
+len = *size;
+}
 
 if (xen_enabled() && block->host == NULL) {
 /* We need to check if the requested address is in the RAM
@@ -2215,7 +2192,7 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
  * In that case just map the requested area.
  */
 if (block->offset == 0) {
-return xen_map_cache(addr, *size, lock, lock);
+return xen_map_cache(addr, len, lock, lock);
 }
 
 block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
@@ -2224,6 +2201,19 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
 return ramblock_ptr(block, addr);
 }
 
+/*
+ * Return a host pointer to ram allocated with qemu_ram_alloc.
+ * This should not be used for general purpose DMA.  Use address_space_map
+ * or address_space_rw instead. For local memory (e.g. video ram) that the
+ * device owns, use memory_region_get_ram_ptr.
+ *
+ * Called within RCU critical section.
+ */
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
+{
+return qemu_ram_ptr_length(ram_block, addr, NULL, false);
+}
+
 /* Return the offset of a hostpointer within a ramblock */
 ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
 {
-- 
2.17.1

[QEMU][PATCH v3 6/7] xen: add map and unmap callbacks for grant region

2024-02-27 Thread Vikram Garhwal

From: Juergen Gross 

Add the callbacks for mapping/unmapping guest memory via grants to the
special grant memory region.

Signed-off-by: Juergen Gross 
Signed-off-by: Vikram Garhwal 
---
 hw/xen/xen-mapcache.c | 176 +-
 system/physmem.c  |  11 ++-
 2 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index 179b7e95b2..2e4c9b4947 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -9,6 +9,8 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
 #include "qemu/units.h"
 #include "qemu/error-report.h"
 
@@ -23,6 +25,8 @@
 #include "sysemu/xen-mapcache.h"
 #include "trace.h"
 
+#include 
+#include 
 
 #if HOST_LONG_BITS == 32
 #  define MCACHE_BUCKET_SHIFT 16
@@ -377,7 +381,7 @@ uint8_t *xen_map_cache(hwaddr phys_addr, hwaddr size,
 return p;
 }
 
-ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
+static ram_addr_t xen_ram_addr_from_mapcache_try(void *ptr)
 {
 MapCacheEntry *entry = NULL;
 MapCacheRev *reventry;
@@ -588,10 +592,179 @@ uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
 return p;
 }
 
+struct XENMappedGrantRegion {
+void *addr;
+unsigned int pages;
+unsigned int refs;
+unsigned int prot;
+uint32_t idx;
+QLIST_ENTRY(XENMappedGrantRegion) list;
+};
+
+static xengnttab_handle *xen_region_gnttabdev;
+static QLIST_HEAD(GrantRegionList, XENMappedGrantRegion) xen_grant_mappings =
+QLIST_HEAD_INITIALIZER(xen_grant_mappings);
+static QemuMutex xen_map_mutex;
+
+static void *xen_map_grant_dyn(MemoryRegion **mr, hwaddr addr, hwaddr *plen,
+   bool is_write, MemTxAttrs attrs)
+{
+unsigned int page_off = addr & (XC_PAGE_SIZE - 1);
+unsigned int i;
+unsigned int total_grants = 0;
+unsigned int nrefs = (page_off + *plen + XC_PAGE_SIZE - 1) >> 
XC_PAGE_SHIFT;
+uint32_t ref = (addr - XEN_GRANT_ADDR_OFF) >> XC_PAGE_SHIFT;
+uint32_t *refs = NULL;
+unsigned int prot = PROT_READ;
+struct XENMappedGrantRegion *mgr = NULL;
+
+if (is_write) {
+prot |= PROT_WRITE;
+}
+
+qemu_mutex_lock(&xen_map_mutex);
+
+QLIST_FOREACH(mgr, &xen_grant_mappings, list) {
+if (mgr->idx == ref &&
+mgr->pages == nrefs &&
+(mgr->prot & prot) == prot) {
+break;
+}
+
+total_grants += mgr->pages;
+}
+
+if (!mgr) {
+if (nrefs + total_grants >= XEN_MAX_VIRTIO_GRANTS) {
+qemu_mutex_unlock(&xen_map_mutex);
+return NULL;
+}
+
+mgr = g_new(struct XENMappedGrantRegion, 1);
+
+if (nrefs == 1) {
+refs = &ref;
+} else {
+refs = g_new(uint32_t, nrefs);
+for (i = 0; i < nrefs; i++) {
+refs[i] = ref + i;
+}
+}
+mgr->addr = xengnttab_map_domain_grant_refs(xen_region_gnttabdev, 
nrefs,
+xen_domid, refs, prot);
+if (mgr->addr) {
+mgr->pages = nrefs;
+mgr->refs = 1;
+mgr->prot = prot;
+mgr->idx = ref;
+
+QLIST_INSERT_HEAD(&xen_grant_mappings, mgr, list);
+} else {
+g_free(mgr);
+mgr = NULL;
+}
+} else {
+mgr->refs++;
+}
+
+qemu_mutex_unlock(&xen_map_mutex);
+
+if (nrefs > 1) {
+g_free(refs);
+}
+
+return mgr ? mgr->addr + page_off : NULL;
+}
+
+static void xen_unmap_grant_dyn(MemoryRegion *mr, void *buffer, ram_addr_t 
addr,
+hwaddr len, bool is_write, hwaddr access_len)
+{
+unsigned int page_off = (unsigned long)buffer & (XC_PAGE_SIZE - 1);
+unsigned int nrefs = (page_off + len + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT;
+unsigned int prot = PROT_READ;
+struct XENMappedGrantRegion *mgr = NULL;
+
+if (is_write) {
+prot |= PROT_WRITE;
+}
+
+qemu_mutex_lock(&xen_map_mutex);
+
+QLIST_FOREACH(mgr, &xen_grant_mappings, list) {
+if (mgr->addr == buffer - page_off &&
+mgr->pages == nrefs &&
+(mgr->prot & prot) == prot) {
+break;
+}
+}
+if (mgr) {
+mgr->refs--;
+if (!mgr->refs) {
+xengnttab_unmap(xen_region_gnttabdev, mgr->addr, nrefs);
+
+QLIST_REMOVE(mgr, list);
+g_free(mgr);
+}
+} else {
+error_report("xen_unmap_grant_dyn() trying to unmap unknown buffer");
+}
+
+qemu_mutex_unlock(&xen_map_mutex);
+}
+
+static ram_addr_t xen_ram_addr_from_grant_cache(void *ptr)
+{
+unsigned int page_off = (unsigned long)ptr & (XC_PAGE_SIZE - 1);
+struct XENMappedGrantRegion *mgr = NULL;
+ram_addr_t raddr = RAM_ADDR_INVALID;
+
+qemu_mutex_lock(&xen_map_mutex);
+
+QLIST_FOREACH(mgr, &xen_grant_mappings, list) {
+if (mgr->addr == ptr - page_off) {
+break;
+}
+}
+
+if (mgr) {
+raddr =

[QEMU][PATCH v3 5/7] memory: add MemoryRegion map and unmap callbacks

2024-02-27 Thread Vikram Garhwal

From: Juergen Gross 

In order to support mapping and unmapping guest memory dynamically to
and from qemu during address_space_[un]map() operations add the map()
and unmap() callbacks to MemoryRegionOps.

Those will be used e.g. for Xen grant mappings when performing guest
I/Os.

Signed-off-by: Juergen Gross 
Signed-off-by: Vikram Garhwal 
---
 include/exec/memory.h | 21 ++
 system/physmem.c  | 50 +--
 2 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 8626a355b3..9f7dfe59c7 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -282,6 +282,27 @@ struct MemoryRegionOps {
 unsigned size,
 MemTxAttrs attrs);
 
+/*
+ * Dynamically create mapping. @addr is the guest address to map; @plen
+ * is the pointer to the usable length of the buffer.
+ * @mr contents can be changed in case a new memory region is created for
+ * the mapping.
+ * Returns the buffer address for accessing the data.
+ */
+void *(*map)(MemoryRegion **mr,
+ hwaddr addr,
+ hwaddr *plen,
+ bool is_write,
+ MemTxAttrs attrs);
+
+/* Unmap an area obtained via map() before. */
+void (*unmap)(MemoryRegion *mr,
+  void *buffer,
+  ram_addr_t addr,
+  hwaddr len,
+  bool is_write,
+  hwaddr access_len);
+
 enum device_endian endianness;
 /* Guest-visible constraints: */
 struct {
diff --git a/system/physmem.c b/system/physmem.c
index 949dcb20ba..d989e9fc1f 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as,
 hwaddr len = *plen;
 hwaddr l, xlat;
 MemoryRegion *mr;
+void *ptr = NULL;
 FlatView *fv;
 
 if (len == 0) {
@@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as,
 return bounce.buffer;
 }
 
-
 memory_region_ref(mr);
+
+if (mr->ops && mr->ops->map) {
+ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
+}
+
 *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
 l, is_write, attrs);
 fuzz_dma_read_cb(addr, *plen, mr);
-return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+if (ptr == NULL) {
+ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+}
+
+return ptr;
 }
 
 /* Unmaps a memory region previously mapped by address_space_map().
@@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void 
*buffer, hwaddr len,
 
 mr = memory_region_from_host(buffer, &addr1);
 assert(mr != NULL);
-if (is_write) {
-invalidate_and_set_dirty(mr, addr1, access_len);
-}
-if (xen_enabled()) {
-xen_invalidate_map_cache_entry(buffer);
+
+if (mr->ops && mr->ops->unmap) {
+mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len);
+} else {
+if (is_write) {
+invalidate_and_set_dirty(mr, addr1, access_len);
+}
+if (xen_enabled()) {
+xen_invalidate_map_cache_entry(buffer);
+}
 }
 memory_region_unref(mr);
 return;
@@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache 
*cache,
  * doing this if we found actual RAM, which behaves the same
  * regardless of attributes; so UNSPECIFIED is fine.
  */
+if (mr->ops && mr->ops->map) {
+cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
+  MEMTXATTRS_UNSPECIFIED);
+}
+
 l = flatview_extend_translation(cache->fv, addr, len, mr,
 cache->xlat, l, is_write,
 MEMTXATTRS_UNSPECIFIED);
-cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
+if (!cache->ptr) {
+cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l,
+ true);
+}
 } else {
 cache->ptr = NULL;
 }
@@ -3297,14 +3319,20 @@ void address_space_cache_invalidate(MemoryRegionCache 
*cache,
 
 void address_space_cache_destroy(MemoryRegionCache *cache)
 {
-if (!cache->mrs.mr) {
+MemoryRegion *mr = cache->mrs.mr;
+
+if (!mr) {
 return;
 }
 
-if (xen_enabled()) {
+if (mr->ops && mr->ops->unmap) {
+mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
+   cache->is_write, cache->len);
+} else if (xen_enabled()) {
 xen_invalidate_map_cache_entry(cache->ptr);
 }
-memory_region_unref(cache->mrs.mr);
+
+memory_region_unref(mr);
 flatview_unref(cache->fv);

[RFC PATCH 1/5] target/arm: Add requester ID to memattrs

2024-02-27 Thread Joe Komlodi

I've seen a few different instances where a CPU or a memory region is
behind some sort of IOMMU, and the IOMMU translates (or denies) accesses
based on the requester ID of the CPU.

This patch only does it on ARM CPUs, because I did not see CPU-agnostic
code that added CPU attributes when creating TLBs. Similarly, we add the
requester ID during PTW, while populating the rest of the memory
attributes.

We add the requester ID during GPC and descriptor grabbing as well as
PTWs.

Signed-off-by: Joe Komlodi 
---
 target/arm/cpu.c | 4 
 target/arm/cpu.h | 6 ++
 target/arm/ptw.c | 5 +
 3 files changed, 15 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 5fa86bc8d5..9cfbba10c2 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2402,6 +2402,9 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
 }
 }
 
+/* For MemTxAttrs. */
+env->requester_id = cpu->requester_id;
+
 qemu_init_vcpu(cs);
 cpu_reset(cs);
 
@@ -2439,6 +2442,7 @@ static Property arm_cpu_properties[] = {
 mp_affinity, ARM64_AFFINITY_INVALID),
 DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
 DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1),
+DEFINE_PROP_UINT16("requester-id", ARMCPU, requester_id, 0),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 63f31e0d98..5fc572e077 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -774,6 +774,9 @@ typedef struct CPUArchState {
 /* Linux syscall tagged address support */
 bool tagged_addr_enable;
 #endif
+
+/* For MemTxAttrs. */
+uint16_t requester_id;
 } CPUARMState;
 
 static inline void set_feature(CPUARMState *env, int feature)
@@ -1091,6 +1094,9 @@ struct ArchCPU {
 
 /* Generic timer counter frequency, in Hz */
 uint64_t gt_cntfrq_hz;
+
+/* Requester ID, used in MemTxAttrs. */
+uint16_t requester_id;
 };
 
 typedef struct ARMCPUInfo {
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 5eb3577bcd..148af3a000 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -287,6 +287,7 @@ static bool granule_protection_check(CPUARMState *env, 
uint64_t paddress,
 MemTxAttrs attrs = {
 .secure = true,
 .space = ARMSS_Root,
+.requester_id = env->requester_id,
 };
 ARMCPU *cpu = env_archcpu(env);
 uint64_t gpccr = env->cp15.gpccr_el3;
@@ -638,6 +639,7 @@ static uint32_t arm_ldl_ptw(CPUARMState *env, S1Translate 
*ptw,
 MemTxAttrs attrs = {
 .space = ptw->out_space,
 .secure = arm_space_is_secure(ptw->out_space),
+.requester_id = env->requester_id,
 };
 AddressSpace *as = arm_addressspace(cs, attrs);
 MemTxResult result = MEMTX_OK;
@@ -684,6 +686,7 @@ static uint64_t arm_ldq_ptw(CPUARMState *env, S1Translate 
*ptw,
 MemTxAttrs attrs = {
 .space = ptw->out_space,
 .secure = arm_space_is_secure(ptw->out_space),
+.requester_id = env->requester_id,
 };
 AddressSpace *as = arm_addressspace(cs, attrs);
 MemTxResult result = MEMTX_OK;
@@ -3306,6 +3309,8 @@ static bool get_phys_addr_nogpc(CPUARMState *env, 
S1Translate *ptw,
 result->f.attrs.space = ptw->in_space;
 result->f.attrs.secure = arm_space_is_secure(ptw->in_space);
 
+result->f.attrs.requester_id = env->requester_id;
+
 switch (mmu_idx) {
 case ARMMMUIdx_Phys_S:
 case ARMMMUIdx_Phys_NS:
-- 
2.44.0.rc0.258.g7320e95886-goog

[RFC PATCH 5/5] hw/pci: Add user-defined memattrs

2024-02-27 Thread Joe Komlodi

This adds user-defined bits, which users can set and use on transactions
that involve memory attributes.

We add it in the MSI function, since the attributes are initialized in
that function.
We do not add it in pci_dma_rw because the attributes are passed in.
Some users might pass in MEMTXATTRS_UNSPECIFIED, and we should respect
that instead of injecting user-defined attributes in the function.

Signed-off-by: Joe Komlodi 
---
 hw/pci/pci.c| 3 +++
 include/hw/pci/pci_device.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 6496d027ca..b0bb682f15 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@ static Property pci_props[] = {
 QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
 DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
 QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+DEFINE_PROP_UINT8("memattr-user-defined", PCIDevice, memattr_user_defined,
+  0),
 DEFINE_PROP_END_OF_LIST()
 };
 
@@ -361,6 +363,7 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
 return;
 }
 attrs.requester_id = pci_requester_id(dev);
+attrs.user_defined = dev->memattr_user_defined;
 address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
  attrs, NULL);
 }
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..99be6d72b1 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -84,6 +84,7 @@ struct PCIDevice {
  * conventional PCI root complex, this field is meaningless.
  */
 PCIReqIDCache requester_id_cache;
+uint8_t memattr_user_defined;
 char name[64];
 PCIIORegion io_regions[PCI_NUM_REGIONS];
 AddressSpace bus_master_as;
-- 
2.44.0.rc0.258.g7320e95886-goog

[RFC PATCH 0/5] memattrs: target/arm: add user-defined and requester ID memattrs

2024-02-27 Thread Joe Komlodi

Hi all,

This adds requester IDs to ARM CPUs and adds a "user-defined" memory
attribute.

The requester ID on ARM CPUs is there because I've seen some cases where
there's an IOMMU between a CPU and memory that uses the CPU's requester
ID to look up how it should translate, such as an SMMU TBU or some other
IOMMU-like device.
For a specific downstream example I've seen, Xilinx overrides CPU
attributes with ones passed in by an object property in order to have
their IOMMUs work:
https://github.com/Xilinx/qemu/blob/23b643ba1683a47ef49447a45643fe2172d6f8ca/accel/tcg/cputlb.c#L1127.
The object property with the memory attributes is declared here, for
reference: 
https://github.com/Xilinx/qemu/blob/23b643ba1683a47ef49447a45643fe2172d6f8ca/target/arm/cpu.c#L1310.

The user-defined attribute represents optional user signals that are a
part of AMBA-AXI. As the name suggests, these are defined
per-implementation and devices that receive these have their own
interpretation of what the user-defined attribute means.

We add them in CPUs and PCI transactions, because some of their
attributes are set in functions in ways that are not user-facing. DMAs
or other devices that set attributes (using address_space_rw or some
other means), can add them on a per-device basis.

RFC because it's possible we might want this implemented in some other
way, and it touches some pretty frequently used code that I'm somewhat
familiar with, but not 100% familiar with.

Thanks,
Joe

Joe Komlodi (5):
  target/arm: Add requester ID to memattrs
  memattrs: Fix target_tlb_bit whitespace
  memattrs: Add user-defined attribute
  target/arm: Add user-defined memattrs
  hw/pci: Add user-defined memattrs

 hw/pci/pci.c| 3 +++
 include/exec/memattrs.h | 8 +---
 include/hw/pci/pci_device.h | 1 +
 target/arm/cpu.c| 6 ++
 target/arm/cpu.h| 8 
 target/arm/ptw.c| 9 +
 6 files changed, 32 insertions(+), 3 deletions(-)

-- 
2.44.0.rc0.258.g7320e95886-goog

[RFC PATCH 2/5] memattrs: Fix target_tlb_bit whitespace

2024-02-27 Thread Joe Komlodi

checkpatch.pl doesn't like these spaces around the colon, so we may as
well fix it up.

No functional change.

Signed-off-by: Joe Komlodi 
---
 include/exec/memattrs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/exec/memattrs.h b/include/exec/memattrs.h
index d04170aa27..942b721be8 100644
--- a/include/exec/memattrs.h
+++ b/include/exec/memattrs.h
@@ -61,9 +61,9 @@ typedef struct MemTxAttrs {
  * and has unused bits.  These fields will be read by target-specific
  * helpers using env->iotlb[mmu_idx][tlb_index()].attrs.target_tlb_bitN.
  */
-unsigned int target_tlb_bit0 : 1;
-unsigned int target_tlb_bit1 : 1;
-unsigned int target_tlb_bit2 : 1;
+unsigned int target_tlb_bit0:1;
+unsigned int target_tlb_bit1:1;
+unsigned int target_tlb_bit2:1;
 } MemTxAttrs;
 
 /* Bus masters which don't specify any attributes will get this,
-- 
2.44.0.rc0.258.g7320e95886-goog

[RFC PATCH 3/5] memattrs: Add user-defined attribute

2024-02-27 Thread Joe Komlodi

These are used to represent implementation-specific data.
These are based off of AMBA-AXI user signals, but can be used in any
implementation.

The length of 4-bits is arbitrary.

Signed-off-by: Joe Komlodi 
---
 include/exec/memattrs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/exec/memattrs.h b/include/exec/memattrs.h
index 942b721be8..a38645f881 100644
--- a/include/exec/memattrs.h
+++ b/include/exec/memattrs.h
@@ -64,6 +64,8 @@ typedef struct MemTxAttrs {
 unsigned int target_tlb_bit0:1;
 unsigned int target_tlb_bit1:1;
 unsigned int target_tlb_bit2:1;
+/* User-defined bits represent data that is implementation defined. */
+unsigned int user_defined:4;
 } MemTxAttrs;
 
 /* Bus masters which don't specify any attributes will get this,
-- 
2.44.0.rc0.258.g7320e95886-goog

[RFC PATCH 4/5] target/arm: Add user-defined memattrs

2024-02-27 Thread Joe Komlodi

During transactions, these get added to memory attributes at the same
time other attributes are added.

Similar to the requester ID, these are added on PTWs, GPCs, and
descriptor grabbing as well.

Signed-off-by: Joe Komlodi 
---
 target/arm/cpu.c | 2 ++
 target/arm/cpu.h | 2 ++
 target/arm/ptw.c | 4 
 3 files changed, 8 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 9cfbba10c2..dcd2c16c2e 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2404,6 +2404,7 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
 
 /* For MemTxAttrs. */
 env->requester_id = cpu->requester_id;
+env->memattr_user_defined = cpu->memattr_user_defined;
 
 qemu_init_vcpu(cs);
 cpu_reset(cs);
@@ -2443,6 +2444,7 @@ static Property arm_cpu_properties[] = {
 DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
 DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1),
 DEFINE_PROP_UINT16("requester-id", ARMCPU, requester_id, 0),
+DEFINE_PROP_UINT8("memattr-user-defined", ARMCPU, memattr_user_defined, 0),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 5fc572e077..499a5b25c7 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -777,6 +777,7 @@ typedef struct CPUArchState {
 
 /* For MemTxAttrs. */
 uint16_t requester_id;
+uint8_t memattr_user_defined;
 } CPUARMState;
 
 static inline void set_feature(CPUARMState *env, int feature)
@@ -1097,6 +1098,7 @@ struct ArchCPU {
 
 /* Requester ID, used in MemTxAttrs. */
 uint16_t requester_id;
+uint8_t memattr_user_defined;
 };
 
 typedef struct ARMCPUInfo {
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 148af3a000..b2af3d9052 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -288,6 +288,7 @@ static bool granule_protection_check(CPUARMState *env, 
uint64_t paddress,
 .secure = true,
 .space = ARMSS_Root,
 .requester_id = env->requester_id,
+.user_defined = env->memattr_user_defined,
 };
 ARMCPU *cpu = env_archcpu(env);
 uint64_t gpccr = env->cp15.gpccr_el3;
@@ -640,6 +641,7 @@ static uint32_t arm_ldl_ptw(CPUARMState *env, S1Translate 
*ptw,
 .space = ptw->out_space,
 .secure = arm_space_is_secure(ptw->out_space),
 .requester_id = env->requester_id,
+.user_defined = env->memattr_user_defined,
 };
 AddressSpace *as = arm_addressspace(cs, attrs);
 MemTxResult result = MEMTX_OK;
@@ -687,6 +689,7 @@ static uint64_t arm_ldq_ptw(CPUARMState *env, S1Translate 
*ptw,
 .space = ptw->out_space,
 .secure = arm_space_is_secure(ptw->out_space),
 .requester_id = env->requester_id,
+.user_defined = env->memattr_user_defined,
 };
 AddressSpace *as = arm_addressspace(cs, attrs);
 MemTxResult result = MEMTX_OK;
@@ -3310,6 +3313,7 @@ static bool get_phys_addr_nogpc(CPUARMState *env, 
S1Translate *ptw,
 result->f.attrs.secure = arm_space_is_secure(ptw->in_space);
 
 result->f.attrs.requester_id = env->requester_id;
+result->f.attrs.user_defined = env->memattr_user_defined;
 
 switch (mmu_idx) {
 case ARMMMUIdx_Phys_S:
-- 
2.44.0.rc0.258.g7320e95886-goog

Re: [PATCH 3/5] hw/isa: Embed TYPE_PORT92 in south bridges used in PC machines

2024-02-27 Thread BALATON Zoltan


On Tue, 27 Feb 2024, Bernhard Beschow wrote:

Am 21. Februar 2024 11:53:21 UTC schrieb Mark Cave-Ayland 
:

On 18/02/2024 13:16, Bernhard Beschow wrote:

Port 92 is an integral part of the PIIX and ICH south bridges, so instantiate it
there. The isapc machine now needs to instantiate it explicitly, analogous to
the RTC.

Note that due to migration compatibility, port92 is optional in the south
bridges. It is always instantiated in the isapc machine for simplicity.

Signed-off-by: Bernhard Beschow 
---
  include/hw/i386/pc.h  |  2 +-
  include/hw/southbridge/ich9.h |  4 
  include/hw/southbridge/piix.h |  3 +++
  hw/i386/pc.c  | 18 --
  hw/i386/pc_piix.c |  9 +++--
  hw/i386/pc_q35.c  |  8 +---
  hw/isa/lpc_ich9.c |  9 +
  hw/isa/piix.c |  9 +
  hw/isa/Kconfig|  2 ++
  9 files changed, 52 insertions(+), 12 deletions(-)


I had a look at this (and did a bit of revision around 8042 and A20), 
and I am starting to wonder if the PORT92 device isn't something that 
belongs to the southbridge, but more specifically to the superio chip?


If there is agreement to model real hardware in QEMU, then I think that


I think there's no such agreement and QEMU is more lax about it both for 
historical reasons and to simplify machine models. Indeed, QEMU sometimes 
models non-existing machines (e.g. the mac99 or virt boards) that don't 
correspond to real hardware but allow guest OSes to boot. Even when 
modelling real hardware it's often modelled just enough for guests to work 
and unused details are omitted for simplicity. It is recommended to follow 
what real hardware does when modelling real hardware but not always 
required. Although it might help both with verifying a device model and to 
compose machines with these models to try to follow the real hardware.


port 92 belongs into any device model where the hardware has one. All 
our PC-like southbridges (PIIX, ICH, VIA) have port 92. Many FDC37 
including the FDC37M81x as used in the Malta board have one, too -- 
where it must first be enabled.


So port92 is not real hardware but a QEMU abstraction or model of some 
functionality found in some machines. Real chips probably implement this 
in different ways so we could either model this in these chips 
independently the same way as real hardware does or use the abstracted 
model anywhere in our machine model. Since this does not exist in real 
hardware as this abstract model it also does not belong anywhere so we are 
free to put it where it's most convenient or simple to do.


A couple of thoughts as to why I came to this conclusion: firstly the 
superio chip is generally considered to be a single integrated 
implementation of legacy IO devices, so this feels like a natural home 
for the PORT92 device.


Secondly the value of the "has-port92" property is controlled by 
pcms->i8042_enabled, and this value is already passed into functions 
such as pc_superio_init() for example.


Right. There, it also controls the presence of port 92. If we move port 
92 into the southbridges, we have to respect this command line switch 
there to preserve backward compatibility.


I wonder what `-M i8042` is supposed to do. If it is for modeling a 
stripped-down x86 system, why not use the microvm instead? How is it 
possible to omit an essential piece of hardware needed to boot x86 
systems? Don't we need at least either one (i8042 or port 92)?


Try git log -p 4ccd5fe22fe (found it via git blame and see what added that 
property).


I think this would also help reduce the changes required for the 
individual machines, however the devil is always in the details 
particularly when migration is involved.


As stated above, this series is more about modeling real hardware, in 
the hope that this will lend itself for configuration-driven machine 
creation. It is also about identifying obstacles towards this goal. Does 
it make sense to deprecate some machine-specific options such as i8042?


Only if you want to break downstream users of those options but maybe they 
won't be happy about that.


Regards,
BALATON Zoltan

Re: [PATCH v8 2/2] ppc: spapr: Enable 2nd DAWR on Power10 pSeries machine

2024-02-27 Thread David Gibson

On Tue, Feb 27, 2024 at 10:21:23PM +1000, Nicholas Piggin wrote:
> On Fri Feb 2, 2024 at 12:46 AM AEST, Shivaprasad G Bhat wrote:
> > As per the PAPR, bit 0 of byte 64 in pa-features property
> > indicates availability of 2nd DAWR registers. i.e. If this bit is set, 2nd
> > DAWR is present, otherwise not. Use KVM_CAP_PPC_DAWR1 capability to find
> > whether kvm supports 2nd DAWR or not. If it's supported, allow user to set
> > the pa-feature bit in guest DT using cap-dawr1 machine capability.
> >
> > Signed-off-by: Ravi Bangoria 
> > Signed-off-by: Shivaprasad G Bhat 
> > ---
> >  hw/ppc/spapr.c |7 ++-
> >  hw/ppc/spapr_caps.c|   36 
> >  hw/ppc/spapr_hcall.c   |   25 -
> >  include/hw/ppc/spapr.h |6 +-
> >  target/ppc/kvm.c   |   12 
> >  target/ppc/kvm_ppc.h   |   12 
> >  6 files changed, 87 insertions(+), 11 deletions(-)
> >
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index e8dabc8614..91a97d72e7 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -262,7 +262,7 @@ static void spapr_dt_pa_features(SpaprMachineState 
> > *spapr,
> >  0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
> >  /* 54: DecFP, 56: DecI, 58: SHA */
> >  0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
> > -/* 60: NM atomic, 62: RNG */
> > +/* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
> >  0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
> >  };
> >  uint8_t *pa_features = NULL;
> > @@ -303,6 +303,9 @@ static void spapr_dt_pa_features(SpaprMachineState 
> > *spapr,
> >   * in pa-features. So hide it from them. */
> >  pa_features[40 + 2] &= ~0x80; /* Radix MMU */
> >  }
> > +if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
> > +pa_features[66] |= 0x80;
> > +}
> >  
> >  _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, 
> > pa_size)));
> >  }
> > @@ -2138,6 +2141,7 @@ static const VMStateDescription vmstate_spapr = {
> >  &vmstate_spapr_cap_fwnmi,
> >  &vmstate_spapr_fwnmi,
> >  &vmstate_spapr_cap_rpt_invalidate,
> > +&vmstate_spapr_cap_dawr1,
> >  NULL
> >  }
> >  };
> > @@ -4717,6 +4721,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
> > void *data)
> >  smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
> >  smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
> >  smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
> > +smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_OFF;
> >  
> >  /*
> >   * This cap specifies whether the AIL 3 mode for
> > diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> > index e889244e52..677f17cea6 100644
> > --- a/hw/ppc/spapr_caps.c
> > +++ b/hw/ppc/spapr_caps.c
> > @@ -655,6 +655,32 @@ static void cap_ail_mode_3_apply(SpaprMachineState 
> > *spapr,
> >  }
> >  }
> >  
> > +static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
> > +   Error **errp)
> > +{
> > +ERRP_GUARD();
> > +
> > +if (!val) {
> > +return; /* Disable by default */
> > +}
> > +
> > +if (!ppc_type_check_compat(MACHINE(spapr)->cpu_type,
> > +   CPU_POWERPC_LOGICAL_3_10, 0,
> > +   spapr->max_compat_pvr)) {
> > +warn_report("DAWR1 supported only on POWER10 and later CPUs");
> > +}
> 
> Should this be an error?

Yes, it should.  If you can't supply the cap requested, you *must*
fail to start.  Near enough is not good enough when it comes to the
guest visible properties of the virtual machine, or you'll end up with
no end of migration headaches.

> Should the dawr1 cap be enabled by default for POWER10 machines?
> 
> > +
> > +if (kvm_enabled()) {
> > +if (!kvmppc_has_cap_dawr1()) {
> > +error_setg(errp, "DAWR1 not supported by KVM.");
> > +error_append_hint(errp, "Try appending -machine 
> > cap-dawr1=off");
> > +} else if (kvmppc_set_cap_dawr1(val) < 0) {
> > +error_setg(errp, "Error enabling cap-dawr1 with KVM.");
> > +error_append_hint(errp, "Try appending -machine 
> > cap-dawr1=off");
> > +}
> > +}
> > +}
> > +
> >  SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
> >  [SPAPR_CAP_HTM] = {
> >  .name = "htm",
> > @@ -781,6 +807,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
> >  .type = "bool",
> >  .apply = cap_ail_mode_3_apply,
> >  },
> > +[SPAPR_CAP_DAWR1] = {
> > +.name = "dawr1",
> > +.description = "Allow 2nd Data Address Watchpoint Register 
> > (DAWR1)",
> > +.index = SPAPR_CAP_DAWR1,
> > +.get = spapr_cap_get_bool,
> > +.set = spapr_cap_set_bool,
> > +.type = "bool",
> > +.apply = cap_dawr1_apply,
> > +},
> >  };
> >  
> >  static SpaprCapabilities

Re: [PATCH 08/14] linux-user/elfload: Lock cpu list and mmap during elf_core_dump

2024-02-27 Thread Alex Bennée

Richard Henderson  writes:

> Do not allow changes to the set of cpus and memory regions
> while we are dumping core.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Re: [PATCH 3/5] hw/isa: Embed TYPE_PORT92 in south bridges used in PC machines

2024-02-27 Thread Bernhard Beschow




Am 21. Februar 2024 11:53:21 UTC schrieb Mark Cave-Ayland 
:
>On 18/02/2024 13:16, Bernhard Beschow wrote:
>
>> Port 92 is an integral part of the PIIX and ICH south bridges, so 
>> instantiate it
>> there. The isapc machine now needs to instantiate it explicitly, analogous 
>> to
>> the RTC.
>> 
>> Note that due to migration compatibility, port92 is optional in the south
>> bridges. It is always instantiated in the isapc machine for simplicity.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   include/hw/i386/pc.h  |  2 +-
>>   include/hw/southbridge/ich9.h |  4 
>>   include/hw/southbridge/piix.h |  3 +++
>>   hw/i386/pc.c  | 18 --
>>   hw/i386/pc_piix.c |  9 +++--
>>   hw/i386/pc_q35.c  |  8 +---
>>   hw/isa/lpc_ich9.c |  9 +
>>   hw/isa/piix.c |  9 +
>>   hw/isa/Kconfig|  2 ++
>>   9 files changed, 52 insertions(+), 12 deletions(-)
>> 
>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>> index b2987209b1..a9ff1f5ab3 100644
>> --- a/include/hw/i386/pc.h
>> +++ b/include/hw/i386/pc.h
>> @@ -178,7 +178,7 @@ uint64_t pc_pci_hole64_start(void);
>>   DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
>>   void pc_basic_device_init(struct PCMachineState *pcms,
>> ISABus *isa_bus, qemu_irq *gsi,
>> -  ISADevice *rtc_state,
>> +  ISADevice *rtc_state, ISADevice *port92,
>> bool create_fdctrl,
>> uint32_t hpet_irqs);
>>   void pc_cmos_init(PCMachineState *pcms,
>> diff --git a/include/hw/southbridge/ich9.h b/include/hw/southbridge/ich9.h
>> index fd01649d04..d70a94f5e7 100644
>> --- a/include/hw/southbridge/ich9.h
>> +++ b/include/hw/southbridge/ich9.h
>> @@ -3,6 +3,7 @@
>> #include "hw/isa/apm.h"
>>   #include "hw/acpi/ich9.h"
>> +#include "hw/isa/port92.h"
>>   #include "hw/intc/ioapic.h"
>>   #include "hw/pci/pci.h"
>>   #include "hw/pci/pci_device.h"
>> @@ -32,6 +33,7 @@ struct ICH9LPCState {
>>   uint8_t irr[PCI_SLOT_MAX][PCI_NUM_PINS];
>> MC146818RtcState rtc;
>> +Port92State port92;
>>   APMState apm;
>>   ICH9LPCPMRegs pm;
>>   uint32_t sci_level; /* track sci level */
>> @@ -54,6 +56,8 @@ struct ICH9LPCState {
>>   uint8_t rst_cnt;
>>   MemoryRegion rst_cnt_mem;
>>   +bool has_port92;
>> +
>>   /* SMI feature negotiation via fw_cfg */
>>   uint64_t smi_host_features;   /* guest-invisible, host endian */
>>   uint8_t smi_host_features_le[8];  /* guest-visible, read-only, little
>> diff --git a/include/hw/southbridge/piix.h b/include/hw/southbridge/piix.h
>> index 86709ba2e4..35058529d1 100644
>> --- a/include/hw/southbridge/piix.h
>> +++ b/include/hw/southbridge/piix.h
>> @@ -15,6 +15,7 @@
>>   #include "hw/pci/pci_device.h"
>>   #include "hw/acpi/piix4.h"
>>   #include "hw/ide/pci.h"
>> +#include "hw/isa/port92.h"
>>   #include "hw/rtc/mc146818rtc.h"
>>   #include "hw/usb/hcd-uhci.h"
>>   @@ -56,6 +57,7 @@ struct PIIXState {
>>   int32_t pci_irq_levels_vmstate[PIIX_NUM_PIRQS];
>> MC146818RtcState rtc;
>> +Port92State port92;
>>   PCIIDEState ide;
>>   UHCIState uhci;
>>   PIIX4PMState pm;
>> @@ -71,6 +73,7 @@ struct PIIXState {
>>   bool has_acpi;
>>   bool has_pic;
>>   bool has_pit;
>> +bool has_port92;
>>   bool has_usb;
>>   bool smm_enabled;
>>   };
>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index 0b11d4576e..8b601ea6cf 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -1160,7 +1160,7 @@ static void pc_superio_init(ISABus *isa_bus, bool 
>> create_fdctrl,
>>   int i;
>>   DriveInfo *fd[MAX_FD];
>>   qemu_irq *a20_line;
>> -ISADevice *fdc, *i8042, *port92, *vmmouse;
>> +ISADevice *fdc, *i8042, *vmmouse;
>> serial_hds_isa_init(isa_bus, 0, MAX_ISA_SERIAL_PORTS);
>>   parallel_hds_isa_init(isa_bus, MAX_PARALLEL_PORTS);
>> @@ -1193,18 +1193,15 @@ static void pc_superio_init(ISABus *isa_bus, bool 
>> create_fdctrl,
>>&error_abort);
>>   isa_realize_and_unref(vmmouse, isa_bus, &error_fatal);
>>   }
>> -port92 = isa_create_simple(isa_bus, TYPE_PORT92);
>>   -a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2);
>> +a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 1);
>>   i8042_setup_a20_line(i8042, a20_line[0]);
>> -qdev_connect_gpio_out_named(DEVICE(port92),
>> -PORT92_A20_LINE, 0, a20_line[1]);
>>   g_free(a20_line);
>>   }
>> void pc_basic_device_init(struct PCMachineState *pcms,
>> ISABus *isa_bus, qemu_irq *gsi,
>> -  ISADevice *rtc_state,
>> +  ISADevice *rtc_state, ISADevice *port92,
>> bool create_fdctrl,
>>

[PATCH v2 08/21] memory: Add Error** argument to .log_global*() handlers

2024-02-27 Thread Cédric Le Goater

Modify all log_global*() handlers to take an Error** parameter and
return a bool. A new MEMORY_LISTENER_CALL_LOG_GLOBAL macro looping on
the listeners is introduced to handle a possible error, which
would interrupt the loop if necessary.

Note the change in memory_global_dirty_log_start() behavior: it
will return as soon as an error is detected.

Cc: Stefano Stabellini 
Cc: Anthony Perard 
Cc: Paul Durrant 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: David Hildenbrand 
Signed-off-by: Cédric Le Goater 
---
 include/exec/memory.h | 15 ++--
 hw/i386/xen/xen-hvm.c |  6 ++--
 hw/vfio/common.c  |  8 +++--
 hw/virtio/vhost.c |  6 ++--
 system/memory.c   | 83 +--
 system/physmem.c  |  5 +--
 6 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 
8626a355b310ed7b1a1db7978ba4b394032c2f15..4bc146c5ebdd377cd14a4e462f32cc945db5a0a8
 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -998,8 +998,11 @@ struct MemoryListener {
  * active at that time.
  *
  * @listener: The #MemoryListener.
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
  */
-void (*log_global_start)(MemoryListener *listener);
+bool (*log_global_start)(MemoryListener *listener, Error **errp);
 
 /**
  * @log_global_stop:
@@ -1009,8 +1012,11 @@ struct MemoryListener {
  * the address space.
  *
  * @listener: The #MemoryListener.
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
  */
-void (*log_global_stop)(MemoryListener *listener);
+bool (*log_global_stop)(MemoryListener *listener, Error **errp);
 
 /**
  * @log_global_after_sync:
@@ -1019,8 +1025,11 @@ struct MemoryListener {
  * for any #MemoryRegionSection.
  *
  * @listener: The #MemoryListener.
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
  */
-void (*log_global_after_sync)(MemoryListener *listener);
+bool (*log_global_after_sync)(MemoryListener *listener, Error **errp);
 
 /**
  * @eventfd_add:
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index 
f42621e6742552035122ea58092c91c3458338ff..925a207b494b4eed52d5f360b554f18ac8a9806d
 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -446,16 +446,18 @@ static void xen_log_sync(MemoryListener *listener, 
MemoryRegionSection *section)
   int128_get64(section->size));
 }
 
-static void xen_log_global_start(MemoryListener *listener)
+static bool xen_log_global_start(MemoryListener *listener, Error **errp)
 {
 if (xen_enabled()) {
 xen_in_migration = true;
 }
+return true;
 }
 
-static void xen_log_global_stop(MemoryListener *listener)
+static bool xen_log_global_stop(MemoryListener *listener, Error **errp)
 {
 xen_in_migration = false;
+return true;
 }
 
 static const MemoryListener xen_memory_listener = {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
059bfdc07a85e2eb908df828c1f42104d683e911..8bba95ba6a2010b78cae54c6905857686bbb6309
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1075,7 +1075,8 @@ out:
 return ret;
 }
 
-static void vfio_listener_log_global_start(MemoryListener *listener)
+static bool vfio_listener_log_global_start(MemoryListener *listener,
+   Error **errp)
 {
 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  listener);
@@ -1092,9 +1093,11 @@ static void 
vfio_listener_log_global_start(MemoryListener *listener)
  ret, strerror(-ret));
 vfio_set_migration_error(ret);
 }
+return !!ret;
 }
 
-static void vfio_listener_log_global_stop(MemoryListener *listener)
+static bool vfio_listener_log_global_stop(MemoryListener *listener,
+  Error **errp)
 {
 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
  listener);
@@ -1111,6 +1114,7 @@ static void vfio_listener_log_global_stop(MemoryListener 
*listener)
  ret, strerror(-ret));
 vfio_set_migration_error(ret);
 }
+return !!ret;
 }
 
 static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 
2c9ac794680ea9b65eba6cc22e70cf141e90aa73..7a555f941934991a72a2817e5505fe0ce6d6fc64
 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1044,7 +1044,7 @@ check_dev_state:
 return r;
 }
 
-static void vhost_log_global_start(MemoryListener *listener)
+static bool

[PATCH v2 06/21] migration: Add Error** argument to .save_setup() handler

2024-02-27 Thread Cédric Le Goater

The purpose is to record a potential error in the migration stream if
qemu_savevm_state_setup() fails. Most of the current .save_setup()
handlers can be modified to use the Error argument instead of managing
their own and calling locally error_report(). The following patches
will introduce such changes for VFIO first.

Cc: Nicholas Piggin 
Cc: Harsh Prateek Bora 
Cc: Halil Pasic 
Cc: Thomas Huth 
Cc: Eric Blake 
Cc: Vladimir Sementsov-Ogievskiy 
Cc: John Snow 
Cc: Stefan Hajnoczi 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
---

Changes in v2: 

 - dropped qemu_file_set_error_obj(f, ret, local_err); 

include/migration/register.h   | 3 ++-
 hw/ppc/spapr.c | 2 +-
 hw/s390x/s390-stattrib.c   | 2 +-
 hw/vfio/migration.c| 2 +-
 migration/block-dirty-bitmap.c | 2 +-
 migration/block.c  | 2 +-
 migration/ram.c| 3 ++-
 migration/savevm.c | 2 +-
 8 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index 
2cc71ec45f65bf2884c9e7a823d2968752f15c20..96eae9dba2970552c379c732393e3ab6ef578a58
 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -60,10 +60,11 @@ typedef struct SaveVMHandlers {
  *
  * @f: QEMUFile where to send the data
  * @opaque: data pointer passed to register_savevm_live()
+ * @errp: pointer to Error*, to store an error if it happens.
  *
  * Returns zero to indicate success and negative for error
  */
-int (*save_setup)(QEMUFile *f, void *opaque);
+int (*save_setup)(QEMUFile *f, void *opaque, Error **errp);
 
 /**
  * @save_cleanup
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 
55263f0815ed7671b32ea20b394ae71c82e616cb..045c024ffa76eacfc496bd486cb6cafbee2df73e
 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2142,7 +2142,7 @@ static const VMStateDescription vmstate_spapr = {
 }
 };
 
-static int htab_save_setup(QEMUFile *f, void *opaque)
+static int htab_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 SpaprMachineState *spapr = opaque;
 
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 
c483b62a9b5f71772639fc180bdad15ecb6711cb..c934df424a555d83d2198f5ddfc0cbe0ea98e9ec
 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -166,7 +166,7 @@ static int cmma_load(QEMUFile *f, void *opaque, int 
version_id)
 return ret;
 }
 
-static int cmma_save_setup(QEMUFile *f, void *opaque)
+static int cmma_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 S390StAttribState *sas = S390_STATTRIB(opaque);
 S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
70e6b1a709f9b67e4c9eb41033d76347275cac42..8bcb4bc73cd5ba5338e3ffa4d907d0e6bfbb9485
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -378,7 +378,7 @@ static int vfio_save_prepare(void *opaque, Error **errp)
 return 0;
 }
 
-static int vfio_save_setup(QEMUFile *f, void *opaque)
+static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 VFIODevice *vbasedev = opaque;
 VFIOMigration *migration = vbasedev->migration;
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 
2708abf3d762de774ed294d3fdb8e56690d2974c..16f84e6c57c2403a8c2d6319f4e7b6360dade28c
 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -1213,7 +1213,7 @@ fail:
 return ret;
 }
 
-static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
+static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 DBMSaveState *s = &((DBMState *)opaque)->save;
 SaveBitmapState *dbms = NULL;
diff --git a/migration/block.c b/migration/block.c
index 
8c6ebafacc1ffe930d1d4f19d968817b14852c69..df15319ceab66201b043f15eac1b0a7d6522b60c
 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -708,7 +708,7 @@ static void block_migration_cleanup(void *opaque)
 blk_mig_unlock();
 }
 
-static int block_save_setup(QEMUFile *f, void *opaque)
+static int block_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 int ret;
 
diff --git a/migration/ram.c b/migration/ram.c
index 
4649a8120492a03d331d660622e1a0a51adb0a96..745482899e18c86b73261b683c1bec04039a76d2
 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2930,8 +2930,9 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
  *
  * @f: QEMUFile where to send the data
  * @opaque: RAMState pointer
+ * @errp: pointer to Error*, to store an error if it happens.
  */
-static int ram_save_setup(QEMUFile *f, void *opaque)
+static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 RAMState **rsp = opaque;
 RAMBlock *block;
diff --git a/migration/savevm.c b/migration/savevm.c
index 
bc168371a31acf85f29f2c284be181250db45df4..b5b3b51bad94dc4c04ae22cd687ba111299339aa
 100644
--- a/migration/savevm.c
+++

[PATCH v2 04/21] migration: Do not call PRECOPY_NOTIFY_SETUP notifiers in case of error

2024-02-27 Thread Cédric Le Goater

When commit bd2270608fa0 ("migration/ram.c: add a notifier chain for
precopy") added PRECOPY_NOTIFY_SETUP notifiers at the end of
qemu_savevm_state_setup(), it didn't take into account a possible
error in the loop calling vmstate_save() or .save_setup() handlers.

Check ret value before calling the notifiers.

Signed-off-by: Cédric Le Goater 
---
 migration/savevm.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 
d612c8a9020b204d5d078d5df85f0e6449c27645..51876f2ef674bb76c7e7ef96e1119a083883deac
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1316,7 +1316,7 @@ void qemu_savevm_state_setup(QEMUFile *f)
 MigrationState *ms = migrate_get_current();
 SaveStateEntry *se;
 Error *local_err = NULL;
-int ret;
+int ret = 0;
 
 json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
 json_writer_start_array(ms->vmdesc, "devices");
@@ -1350,6 +1350,10 @@ void qemu_savevm_state_setup(QEMUFile *f)
 }
 }
 
+if (ret) {
+return;
+}
+
 if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
 error_report_err(local_err);
 }
-- 
2.43.2

[PATCH v2 16/21] vfio: Add Error** argument to .vfio_save_config() handler

2024-02-27 Thread Cédric Le Goater

Use vmstate_save_state_with_err() to improve error reporting in the
callers and store a reported error under the migration stream. Add
documentation while at it.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h | 25 -
 hw/vfio/migration.c   | 18 --
 hw/vfio/pci.c |  5 +++--
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 
9b7ef7d02b5a0ad5266bcc4d06cd6874178978e4..6d9dee626afc491645d2c2398f3e3210961f67e9
 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -133,7 +133,30 @@ struct VFIODeviceOps {
 int (*vfio_hot_reset_multi)(VFIODevice *vdev);
 void (*vfio_eoi)(VFIODevice *vdev);
 Object *(*vfio_get_object)(VFIODevice *vdev);
-void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
+
+/**
+ * @vfio_save_config
+ *
+ * Save device config state
+ *
+ * @vdev: #VFIODevice for which to save the config
+ * @f: #QEMUFile where to send the data
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
+ */
+int (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f, Error **errp);
+
+/**
+ * @vfio_load_config
+ *
+ * Load device config state
+ *
+ * @vdev: #VFIODevice for which to load the config
+ * @f: #QEMUFile where to get the data
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
 };
 
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
8bdc68c66516710c52443135284262580825e0b8..228e8854594f3714b7c6f4fcfc5468d6b56337cb
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -190,14 +190,19 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice 
*vbasedev,
 return ret;
 }
 
-static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
+static int vfio_save_device_config_state(QEMUFile *f, void *opaque,
+ Error **errp)
 {
 VFIODevice *vbasedev = opaque;
+int ret;
 
 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);
 
 if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
-vbasedev->ops->vfio_save_config(vbasedev, f);
+ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
+if (ret) {
+return ret;
+}
 }
 
 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
@@ -581,13 +586,14 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
*opaque)
 static void vfio_save_state(QEMUFile *f, void *opaque)
 {
 VFIODevice *vbasedev = opaque;
+Error *local_err = NULL;
 int ret;
 
-ret = vfio_save_device_config_state(f, opaque);
+ret = vfio_save_device_config_state(f, opaque, &local_err);
 if (ret) {
-error_report("%s: Failed to save device config space",
- vbasedev->name);
-qemu_file_set_error(f, ret);
+error_prepend(&local_err, "%s: Failed to save device config space",
+  vbasedev->name);
+qemu_file_set_error_obj(f, ret, local_err);
 }
 }
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 
4fa387f0430d62ca2ba1b5ae5b7037f8f06b33f9..99d86e1d40ef25133fc76ad6e58294b07bd20843
 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2585,11 +2585,12 @@ const VMStateDescription vmstate_vfio_pci_config = {
 }
 };
 
-static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error 
**errp)
 {
 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 
-vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL);
+return vmstate_save_state_with_err(f, &vmstate_vfio_pci_config, vdev, NULL,
+   errp);
 }
 
 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
-- 
2.43.2

[PATCH v2 11/21] migration: Fix migration termination

2024-02-27 Thread Cédric Le Goater

Handle migration termination when in SETUP state. This can happen if
qemu_savevm_state_setup() fails.

Signed-off-by: Cédric Le Goater 
---
 migration/migration.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 
c1a62b696f62c0d5aca0505e58bc4dc0ff561fde..63294417ff9cae868ad8a167094a795fc30e4da0
 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3161,6 +3161,8 @@ static void migration_iteration_finish(MigrationState *s)
 }
 }
 break;
+case MIGRATION_STATUS_SETUP:
+break;
 
 default:
 /* Should not reach here, but if so, forgive the VM. */
@@ -3192,6 +3194,8 @@ static void bg_migration_iteration_finish(MigrationState 
*s)
 case MIGRATION_STATUS_CANCELLED:
 case MIGRATION_STATUS_CANCELLING:
 break;
+case MIGRATION_STATUS_SETUP:
+break;
 
 default:
 /* Should not reach here, but if so, forgive the VM. */
-- 
2.43.2

[PATCH v2 12/21] vfio: Add Error** argument to .set_dirty_page_tracking() handler

2024-02-27 Thread Cédric Le Goater

We will use the Error object to improve error reporting in the
.log_global*() handlers of VFIO. Add documentation while at it.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-container-base.h | 18 --
 hw/vfio/common.c  |  4 ++--
 hw/vfio/container-base.c  |  4 ++--
 hw/vfio/container.c   |  6 +++---
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 
b2813b0c117985425c842d91f011bb895955d738..dec2023eceb6c7d62b0ee35008cc58f8e695e190
 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -81,7 +81,7 @@ int vfio_container_add_section_window(VFIOContainerBase 
*bcontainer,
 void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
-   bool start);
+   bool start, Error **errp);
 int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
   VFIOBitmap *vbmap,
   hwaddr iova, hwaddr size);
@@ -120,9 +120,23 @@ struct VFIOIOMMUClass {
 int (*attach_device)(const char *name, VFIODevice *vbasedev,
  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
+
 /* migration feature */
+
+/**
+ * @set_dirty_page_tracking
+ *
+ * Start or stop dirty pages tracking on VFIO container
+ *
+ * @bcontainer: #VFIOContainerBase on which to de/activate dirty
+ *  pages tracking
+ * @start: indicates whether to start or stop dirty pages tracking
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
-   bool start);
+   bool start, Error **errp);
 int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer,
   VFIOBitmap *vbmap,
   hwaddr iova, hwaddr size);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
8bba95ba6a2010b78cae54c6905857686bbb6309..560f4bc38499f7f4a3bc84ef7e4184fd6dc89935
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1085,7 +1085,7 @@ static bool vfio_listener_log_global_start(MemoryListener 
*listener,
 if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
 ret = vfio_devices_dma_logging_start(bcontainer);
 } else {
-ret = vfio_container_set_dirty_page_tracking(bcontainer, true);
+ret = vfio_container_set_dirty_page_tracking(bcontainer, true, NULL);
 }
 
 if (ret) {
@@ -1106,7 +1106,7 @@ static bool vfio_listener_log_global_stop(MemoryListener 
*listener,
 if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
 vfio_devices_dma_logging_stop(bcontainer);
 } else {
-ret = vfio_container_set_dirty_page_tracking(bcontainer, false);
+ret = vfio_container_set_dirty_page_tracking(bcontainer, false, NULL);
 }
 
 if (ret) {
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 
913ae49077c4f09b7b27517c1231cfbe4befb7fb..7c0764121d24b02b6c4e66e368d7dff78a6d65aa
 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -53,14 +53,14 @@ void vfio_container_del_section_window(VFIOContainerBase 
*bcontainer,
 }
 
 int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
-   bool start)
+   bool start, Error **errp)
 {
 if (!bcontainer->dirty_pages_supported) {
 return 0;
 }
 
 g_assert(bcontainer->ops->set_dirty_page_tracking);
-return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
+return bcontainer->ops->set_dirty_page_tracking(bcontainer, start, errp);
 }
 
 int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 
bd25b9fbad2e717e63c2ab0e331186e5f63cef49..f772ac79b9c413c86d7e60f6dc4e6699852d5aac
 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -210,7 +210,7 @@ static int vfio_legacy_dma_map(const VFIOContainerBase 
*bcontainer, hwaddr iova,
 
 static int
 vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
-bool start)
+bool start, Error **errp)
 {
 const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
   bcontainer);
@@ -228,8 +228,8 @@

[PATCH v2 21/21] vfio: Extend vfio_set_migration_error() with Error* argument

2024-02-27 Thread Cédric Le Goater

vfio_set_migration_error() sets the 'return' error on the migration
stream if a migration is in progress. To improve error reporting, add
a new Error* argument to also set the Error object on the migration
stream, if a migration is in progress.

Signed-off-by: Cédric Le Goater 
---
 hw/vfio/common.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
8fbf04e55d1b304bc80fdd9ef6f5f5089acd3360..5e6353ae468c885af0fa169b671902a518df4c75
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -148,16 +148,18 @@ bool vfio_viommu_preset(VFIODevice *vbasedev)
 return vbasedev->bcontainer->space->as != &address_space_memory;
 }
 
-static void vfio_set_migration_error(int err)
+static void vfio_set_migration_error(int ret, Error *err)
 {
 MigrationState *ms = migrate_get_current();
 
 if (migration_is_setup_or_active(ms->state)) {
 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
 if (ms->to_dst_file) {
-qemu_file_set_error(ms->to_dst_file, err);
+qemu_file_set_error_obj(ms->to_dst_file, ret, err);
 }
 }
+} else {
+error_report_err(err);
 }
 }
 
@@ -304,9 +306,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 iova, iova + iotlb->addr_mask);
 
 if (iotlb->target_as != &address_space_memory) {
-error_report("Wrong target AS \"%s\", only system memory is allowed",
- iotlb->target_as->name ? iotlb->target_as->name : "none");
-vfio_set_migration_error(-EINVAL);
+error_setg(&local_err,
+   "Wrong target AS \"%s\", only system memory is allowed",
+   iotlb->target_as->name ? iotlb->target_as->name : "none");
+vfio_set_migration_error(-EINVAL, local_err);
 return;
 }
 
@@ -339,11 +342,12 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 ret = vfio_container_dma_unmap(bcontainer, iova,
iotlb->addr_mask + 1, iotlb);
 if (ret) {
-error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%s)",
- bcontainer, iova,
- iotlb->addr_mask + 1, ret, strerror(-ret));
-vfio_set_migration_error(ret);
+error_setg(&local_err,
+   "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+   "0x%"HWADDR_PRIx") = %d (%s)",
+   bcontainer, iova,
+   iotlb->addr_mask + 1, ret, strerror(-ret));
+vfio_set_migration_error(ret, local_err);
 }
 }
 out:
@@ -1239,14 +1243,14 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier 
*n, IOMMUTLBEntry *iotlb)
 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
 
 if (iotlb->target_as != &address_space_memory) {
-error_report("Wrong target AS \"%s\", only system memory is allowed",
- iotlb->target_as->name ? iotlb->target_as->name : "none");
+error_setg(&local_err,
+   "Wrong target AS \"%s\", only system memory is allowed",
+   iotlb->target_as->name ? iotlb->target_as->name : "none");
 goto out;
 }
 
 rcu_read_lock();
 if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
-error_report_err(local_err);
 goto out_lock;
 }
 
@@ -1257,7 +1261,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
   "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
   "0x%"HWADDR_PRIx") failed :", bcontainer, iova,
   iotlb->addr_mask + 1);
-error_report_err(local_err);
 }
 
 out_lock:
@@ -1265,7 +1268,7 @@ out_lock:
 
 out:
 if (ret) {
-vfio_set_migration_error(ret);
+vfio_set_migration_error(ret, local_err);
 }
 }
 
@@ -1385,8 +1388,7 @@ static void vfio_listener_log_sync(MemoryListener 
*listener,
 if (vfio_devices_all_dirty_tracking(bcontainer)) {
 ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
 if (ret) {
-error_report_err(local_err);
-vfio_set_migration_error(ret);
+vfio_set_migration_error(ret, local_err);
 }
 }
 }
-- 
2.43.2

[PATCH v2 19/21] vfio: Add Error** argument to .get_dirty_bitmap() handler

2024-02-27 Thread Cédric Le Goater

Let the callers do the error reporting. Add documentation while at it.

Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  4 +-
 include/hw/vfio/vfio-container-base.h | 17 +++-
 hw/vfio/common.c  | 59 ++-
 hw/vfio/container-base.c  |  5 ++-
 hw/vfio/container.c   | 13 +++---
 5 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 
6d9dee626afc491645d2c2398f3e3210961f67e9..83ffad89f5cf434452332fe29fb752d9ec71b2f0
 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -271,9 +271,9 @@ bool
 vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer);
 int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
 VFIOBitmap *vbmap, hwaddr iova,
-hwaddr size);
+hwaddr size, Error **errp);
 int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
-  uint64_t size, ram_addr_t ram_addr);
+  uint64_t size, ram_addr_t ram_addr, Error **errp);
 
 /* Returns 0 on success, or a negative errno. */
 int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 
dec2023eceb6c7d62b0ee35008cc58f8e695e190..3ee713014cb414f18b34092641a17717983b5559
 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -84,7 +84,7 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase 
*bcontainer,
bool start, Error **errp);
 int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
   VFIOBitmap *vbmap,
-  hwaddr iova, hwaddr size);
+  hwaddr iova, hwaddr size, Error **errp);
 
 void vfio_container_init(VFIOContainerBase *bcontainer,
  VFIOAddressSpace *space,
@@ -137,9 +137,22 @@ struct VFIOIOMMUClass {
  */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
bool start, Error **errp);
+/**
+ * @query_dirty_bitmap
+ *
+ * Get list of dirty pages from container
+ *
+ * @bcontainer: #VFIOContainerBase from which to get dirty pages
+ * @vbmap: #VFIOBitmap internal bitmap structure
+ * @iova: iova base address
+ * @size: size of iova range
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer,
   VFIOBitmap *vbmap,
-  hwaddr iova, hwaddr size);
+  hwaddr iova, hwaddr size, Error **errp);
 /* PCI specific */
 int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
43f37447e3692ffa97788b02f83b81b44aaf301a..8fbf04e55d1b304bc80fdd9ef6f5f5089acd3360
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1154,7 +1154,7 @@ static int vfio_device_dma_logging_report(VFIODevice 
*vbasedev, hwaddr iova,
 
 int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
 VFIOBitmap *vbmap, hwaddr iova,
-hwaddr size)
+hwaddr size, Error **errp)
 {
 VFIODevice *vbasedev;
 int ret;
@@ -1163,10 +1163,10 @@ int vfio_devices_query_dirty_bitmap(const 
VFIOContainerBase *bcontainer,
 ret = vfio_device_dma_logging_report(vbasedev, iova, size,
  vbmap->bitmap);
 if (ret) {
-error_report("%s: Failed to get DMA logging report, iova: "
- "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
- ", err: %d (%s)",
- vbasedev->name, iova, size, ret, strerror(-ret));
+error_setg(errp, "%s: Failed to get DMA logging report, iova: "
+   "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
+   ", err: %d (%s)",
+   vbasedev->name, iova, size, ret, strerror(-ret));
 
 return ret;
 }
@@ -1176,7 +1176,7 @@ int vfio_devices_query_dirty_bitmap(const 
VFIOContainerBase *bcontainer,
 }
 
 int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
-  uint64_t size, ram_addr_t ram_addr)
+  uint64_t size, ram_addr_t ram_addr, Error **errp)
 {
 bool all_device_dirty_tracking =

[PATCH v2 05/21] migration: Add Error** argument to qemu_savevm_state_setup()

2024-02-27 Thread Cédric Le Goater

This prepares ground for the changes coming next which add an Error**
argument to the .save_setup() handler. Callers of qemu_savevm_state_setup()
now handle the error and fail earlier. This is a functional change
that should be examined closely.

Signed-off-by: Cédric Le Goater 
---
 migration/savevm.h|  2 +-
 migration/migration.c | 20 ++--
 migration/savevm.c| 14 +++---
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/migration/savevm.h b/migration/savevm.h
index 
74669733dd63a080b765866c703234a5c4939223..9ec96a995c93a42aad621595f0ed58596c532328
 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -32,7 +32,7 @@
 bool qemu_savevm_state_blocked(Error **errp);
 void qemu_savevm_non_migratable_list(strList **reasons);
 int qemu_savevm_state_prepare(Error **errp);
-void qemu_savevm_state_setup(QEMUFile *f);
+int qemu_savevm_state_setup(QEMUFile *f, Error **errp);
 bool qemu_savevm_state_guest_unplug_pending(void);
 int qemu_savevm_state_resume_prepare(MigrationState *s);
 void qemu_savevm_state_header(QEMUFile *f);
diff --git a/migration/migration.c b/migration/migration.c
index 
5316bbe6704742e604ae55dc7b47a4e11e73c2a4..c1a62b696f62c0d5aca0505e58bc4dc0ff561fde
 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3314,6 +3314,8 @@ static void *migration_thread(void *opaque)
 int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 MigThrError thr_error;
 bool urgent = false;
+Error *local_err = NULL;
+int ret;
 
 thread = migration_threads_add("live_migration", qemu_get_thread_id());
 
@@ -3357,9 +3359,15 @@ static void *migration_thread(void *opaque)
 }
 
 bql_lock();
-qemu_savevm_state_setup(s->to_dst_file);
+ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
 bql_unlock();
 
+if (ret) {
+migrate_set_error(s, local_err);
+error_free(local_err);
+goto out;
+ }
+
 qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_ACTIVE);
 
@@ -3436,6 +3444,8 @@ static void *bg_migration_thread(void *opaque)
 MigThrError thr_error;
 QEMUFile *fb;
 bool early_fail = true;
+Error *local_err = NULL;
+int ret;
 
 rcu_register_thread();
 object_ref(OBJECT(s));
@@ -3469,9 +3479,15 @@ static void *bg_migration_thread(void *opaque)
 
 bql_lock();
 qemu_savevm_state_header(s->to_dst_file);
-qemu_savevm_state_setup(s->to_dst_file);
+ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
 bql_unlock();
 
+if (ret) {
+migrate_set_error(s, local_err);
+error_free(local_err);
+goto fail;
+}
+
 qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_ACTIVE);
 
diff --git a/migration/savevm.c b/migration/savevm.c
index 
51876f2ef674bb76c7e7ef96e1119a083883deac..bc168371a31acf85f29f2c284be181250db45df4
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1311,11 +1311,10 @@ int qemu_savevm_state_prepare(Error **errp)
 return 0;
 }
 
-void qemu_savevm_state_setup(QEMUFile *f)
+int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
 {
 MigrationState *ms = migrate_get_current();
 SaveStateEntry *se;
-Error *local_err = NULL;
 int ret = 0;
 
 json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
@@ -1351,12 +1350,10 @@ void qemu_savevm_state_setup(QEMUFile *f)
 }
 
 if (ret) {
-return;
+return ret;
 }
 
-if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
-error_report_err(local_err);
-}
+return precopy_notify(PRECOPY_NOTIFY_SETUP, errp);
 }
 
 int qemu_savevm_state_resume_prepare(MigrationState *s)
@@ -1725,7 +1722,10 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 ms->to_dst_file = f;
 
 qemu_savevm_state_header(f);
-qemu_savevm_state_setup(f);
+ret = qemu_savevm_state_setup(f, errp);
+if (ret) {
+return ret;
+}
 
 while (qemu_file_get_error(f) == 0) {
 if (qemu_savevm_state_iterate(f, false) > 0) {
-- 
2.43.2

[PATCH v2 20/21] vfio: Also trace event failures in vfio_save_complete_precopy()

2024-02-27 Thread Cédric Le Goater

vfio_save_complete_precopy() currently returns before doing the trace
event. Change that.

Signed-off-by: Cédric Le Goater 
---
 hw/vfio/migration.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
228e8854594f3714b7c6f4fcfc5468d6b56337cb..f3b500dd1cab944722ccbc41575b15046c2420c9
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -574,9 +574,6 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
*opaque)
 
 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 ret = qemu_file_get_error(f);
-if (ret) {
-return ret;
-}
 
 trace_vfio_save_complete_precopy(vbasedev->name, ret);
 
-- 
2.43.2

[PATCH v2 03/21] migration: Add documentation for SaveVMHandlers

2024-02-27 Thread Cédric Le Goater

The SaveVMHandlers structure is still in use for complex subsystems
and devices. Document the handlers since we are going to modify a few
later.

Signed-off-by: Cédric Le Goater 
---
 include/migration/register.h | 257 +++
 1 file changed, 231 insertions(+), 26 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index 
2e6a7d766e62f64940086b7b511249c9ff21fa62..2cc71ec45f65bf2884c9e7a823d2968752f15c20
 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -16,30 +16,129 @@
 
 #include "hw/vmstate-if.h"
 
+/**
+ * struct SaveVMHandlers: handler structure to finely control
+ * migration of complex subsystems and devices, such as RAM, block and
+ * VFIO.
+ */
 typedef struct SaveVMHandlers {
-/* This runs inside the BQL.  */
+
+/* The following handlers run inside the BQL. */
+
+/**
+ * @save_state
+ *
+ * Saves state section on the source using the latest state format
+ * version.
+ *
+ * Legacy method. Should be deprecated when all users are ported
+ * to VMState.
+ *
+ * @f: QEMUFile where to send the data
+ * @opaque: data pointer passed to register_savevm_live()
+ */
 void (*save_state)(QEMUFile *f, void *opaque);
 
-/*
- * save_prepare is called early, even before migration starts, and can be
- * used to perform early checks.
+/**
+ * @save_prepare
+ *
+ * Called early, even before migration starts, and can be used to
+ * perform early checks.
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
  */
 int (*save_prepare)(void *opaque, Error **errp);
+
+/**
+ * @save_setup
+ *
+ * Initializes the data structures on the source and transmits
+ * first section containing information on the device
+ *
+ * @f: QEMUFile where to send the data
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*save_setup)(QEMUFile *f, void *opaque);
+
+/**
+ * @save_cleanup
+ *
+ * Performs save related cleanup
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns zero to indicate success and negative for error
+ */
 void (*save_cleanup)(void *opaque);
+
+/**
+ * @save_live_complete_postcopy
+ *
+ * Called at the end of postcopy for all postcopyable devices.
+ *
+ * @f: QEMUFile where to send the data
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
+
+/**
+ * @save_live_complete_precopy
+ *
+ * Transmits the last section for the device containing any
+ * remaining data.
+ *
+ * @f: QEMUFile where to send the data
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns zero to indicate success and negative for error
+ */
 int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
 
 /* This runs both outside and inside the BQL.  */
+
+/**
+ * @is_active
+ *
+ * Will skip a state section if not active
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns true if state section is active else false
+ */
 bool (*is_active)(void *opaque);
+
+/**
+ * @has_postcopy
+ *
+ * checks if a device supports postcopy
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns true for postcopy support else false
+ */
 bool (*has_postcopy)(void *opaque);
 
-/* is_active_iterate
- * If it is not NULL then qemu_savevm_state_iterate will skip iteration if
- * it returns false. For example, it is needed for only-postcopy-states,
- * which needs to be handled by qemu_savevm_state_setup and
- * qemu_savevm_state_pending, but do not need iterations until not in
- * postcopy stage.
+/**
+ * @is_active_iterate
+ *
+ * As #SaveVMHandlers.is_active(), will skip an inactive state
+ * section in qemu_savevm_state_iterate.
+ *
+ * For example, it is needed for only-postcopy-states, which needs
+ * to be handled by qemu_savevm_state_setup() and
+ * qemu_savevm_state_pending(), but do not need iterations until
+ * not in postcopy stage.
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ *
+ * Returns true if state section is active else false
  */
 bool (*is_active_iterate)(void *opaque);
 
@@ -48,44 +147,150 @@ typedef struct SaveVMHandlers {
  * use data that is local to the migration thread or protected
  * by other locks.
  */

[PATCH v2 10/21] migration: Modify ram_init_bitmaps() to report dirty tracking errors

2024-02-27 Thread Cédric Le Goater

The .save_setup() handler now has an Error** argument that we can use
to propagate errors reported by the .log_global_start() handler. Do
that for the RAM. The caller qemu_savevm_state_setup() will store the
error under the migration stream for later detection in the migration
sequence.

Signed-off-by: Cédric Le Goater 
---
 migration/ram.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 
9fb1875aad73b2fa009199bdfa8960339df7287d..23f4df4779309bbbe164c56c1436b60d65749860
 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2802,9 +2802,8 @@ static void 
migration_bitmap_clear_discarded_pages(RAMState *rs)
 }
 }
 
-static void ram_init_bitmaps(RAMState *rs)
+static bool ram_init_bitmaps(RAMState *rs, Error **errp)
 {
-Error *local_err = NULL;
 bool ret = true;
 
 qemu_mutex_lock_ramlist();
@@ -2813,10 +2812,8 @@ static void ram_init_bitmaps(RAMState *rs)
 ram_list_init_bitmaps();
 /* We don't use dirty log with background snapshots */
 if (!migrate_background_snapshot()) {
-ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION,
-&local_err);
+ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp);
 if (!ret) {
-error_report_err(local_err);
 goto out_unlock;
 }
 migration_bitmap_sync_precopy(rs, false);
@@ -2826,7 +2823,7 @@ out_unlock:
 qemu_mutex_unlock_ramlist();
 
 if (!ret) {
-return;
+return false;
 }
 
 /*
@@ -2834,9 +2831,10 @@ out_unlock:
  * containing all 1s to exclude any discarded pages from migration.
  */
 migration_bitmap_clear_discarded_pages(rs);
+return true;
 }
 
-static int ram_init_all(RAMState **rsp)
+static int ram_init_all(RAMState **rsp, Error **errp)
 {
 if (ram_state_init(rsp)) {
 return -1;
@@ -2847,7 +2845,9 @@ static int ram_init_all(RAMState **rsp)
 return -1;
 }
 
-ram_init_bitmaps(*rsp);
+if (!ram_init_bitmaps(*rsp, errp)) {
+return -1;
+}
 
 return 0;
 }
@@ -2961,7 +2961,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque, 
Error **errp)
 
 /* migration has already setup the bitmap, reuse it. */
 if (!migration_in_colo_state()) {
-if (ram_init_all(rsp) != 0) {
+if (ram_init_all(rsp, errp) != 0) {
 compress_threads_save_cleanup();
 return -1;
 }
-- 
2.43.2

[PATCH v2 00/21] migration: Improve error reporting

2024-02-27 Thread Cédric Le Goater

Hello,

The motivation behind these changes is to improve error reporting to
the upper management layer (libvirt) with a more detailed error, this
to let it decide, depending on the reported error, whether to try
migration again later. It would be useful in cases where migration
fails due to lack of HW resources on the host. For instance, some
adapters can only initiate a limited number of simultaneous dirty
tracking requests and this imposes a limit on the number of VMs
that can be migrated simultaneously.

We are not quite ready for such a mechanism but what we can do first is
to cleanup the error reporting in the early save_setup sequence. This
is what the following changes propose, by adding an Error** argument to
various handlers and propagating it to the core migration subsystem.
 
Thanks,

C.

Changes in v2:

- Removed v1 patches addressing the return-path thread termination as
  they are now superseded by :  
  https://lore.kernel.org/qemu-devel/20240226203122.22894-1-faro...@suse.de/
- Documentation updates of handlers
- Removed call to PRECOPY_NOTIFY_SETUP notifiers in case of errors
- Modified routines taking an Error** argument to return a bool when
  possible and made adjustments in callers.
- new MEMORY_LISTENER_CALL_LOG_GLOBAL macro for .log_global*()
  handlers
- Handled SETUP state when migration terminates
- Modified memory_get_xlat_addr() to take an Error** argument
- Various refinements on error handling

Cédric Le Goater (21):
  migration: Report error when shutdown fails
  migration: Remove SaveStateHandler and LoadStateHandler typedefs
  migration: Add documentation for SaveVMHandlers
  migration: Do not call PRECOPY_NOTIFY_SETUP notifiers in case of error
  migration: Add Error** argument to qemu_savevm_state_setup()
  migration: Add Error** argument to .save_setup() handler
  migration: Add Error** argument to .load_setup() handler
  memory: Add Error** argument to .log_global*() handlers
  memory: Add Error** argument to the global_dirty_log routines
  migration: Modify ram_init_bitmaps() to report dirty tracking errors
  migration: Fix migration termination
  vfio: Add Error** argument to .set_dirty_page_tracking() handler
  vfio: Add Error** argument to vfio_devices_dma_logging_start()
  vfio: Add Error** argument to vfio_devices_dma_logging_stop()
  vfio: Use new Error** argument in vfio_save_setup()
  vfio: Add Error** argument to .vfio_save_config() handler
  vfio: Reverse test on vfio_get_dirty_bitmap()
  memory: Add Error** argument to memory_get_xlat_addr()
  vfio: Add Error** argument to .get_dirty_bitmap() handler
  vfio: Also trace event failures in vfio_save_complete_precopy()
  vfio: Extend vfio_set_migration_error() with Error* argument

 include/exec/memory.h |  40 +++-
 include/hw/vfio/vfio-common.h |  29 ++-
 include/hw/vfio/vfio-container-base.h |  35 +++-
 include/migration/register.h  | 267 +++---
 include/qemu/typedefs.h   |   2 -
 migration/savevm.h|   2 +-
 hw/i386/xen/xen-hvm.c |  10 +-
 hw/ppc/spapr.c|   2 +-
 hw/s390x/s390-stattrib.c  |   2 +-
 hw/vfio/common.c  | 160 +--
 hw/vfio/container-base.c  |   9 +-
 hw/vfio/container.c   |  19 +-
 hw/vfio/migration.c   |  89 ++---
 hw/vfio/pci.c |   5 +-
 hw/virtio/vhost-vdpa.c|   5 +-
 hw/virtio/vhost.c |   6 +-
 migration/block-dirty-bitmap.c|   2 +-
 migration/block.c |   2 +-
 migration/dirtyrate.c |  21 +-
 migration/migration.c |  24 ++-
 migration/qemu-file.c |   5 +-
 migration/ram.c   |  48 -
 migration/savevm.c|  28 +--
 system/memory.c   |  95 +++--
 system/physmem.c  |   5 +-
 25 files changed, 699 insertions(+), 213 deletions(-)

-- 
2.43.2

Re: [PATCH v5 3/4] hw: Set virtio-iommu aw-bits default value on pc_q35 and arm virt

2024-02-27 Thread Cédric Le Goater


Hello Eric,

On 2/15/24 09:42, Eric Auger wrote:

Currently the default input range can extend to 64 bits. On x86,
when the virtio-iommu protects vfio devices, the physical iommu
may support only 39 bits. Let's set the default to 39, as done
for the intel-iommu. On ARM we set 48b as a default (matching
SMMUv3 SMMU_IDR5.VAX == 0).

We use hw_compat_8_2 to handle the compatibility for machines
before 9.0 which used to have a virtio-iommu default input range
of 64 bits.

Of course if aw-bits is set from the command line, the default
is overridden.

Signed-off-by: Eric Auger 
Reviewed-by: Zhenzhong Duan 
Tested-by: Yanghang Liu


We need a property fixup for pseries also:

$ build/ppc64-softmmu/qemu-system-ppc64 -M pseries  -device 
virtio-iommu-pci,addr=04.0
qemu-system-ppc64: -device virtio-iommu-pci,addr=04.0: aw-bits must be within 
[32,64]


Thanks,

C.




---

v3 -> v4:
- update the qos test to relax the check on the max input IOVA

v2 -> v3:
- collected Zhenzhong's R-b
- use &error_abort instead of NULL error handle
   on object_property_get_uint() call (Cédric)
- use VTD_HOST_AW_39BIT (Cédric)

v1 -> v2:
- set aw-bits to 48b on ARM
- use hw_compat_8_2 to handle the compat for older machines
   which used 64b as a default
---
  hw/arm/virt.c   | 6 ++
  hw/core/machine.c   | 5 -
  hw/i386/pc.c| 6 ++
  hw/virtio/virtio-iommu.c| 2 +-
  tests/qtest/virtio-iommu-test.c | 2 +-
  5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 368c2a415a..0994f2a560 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2716,10 +2716,16 @@ static void 
virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
  virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), 
errp);
  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+uint8_t aw_bits = object_property_get_uint(OBJECT(dev),
+   "aw-bits", &error_abort);
  hwaddr db_start = 0, db_end = 0;
  QList *reserved_regions;
  char *resv_prop_str;
  
+if (!aw_bits) {

+qdev_prop_set_uint8(dev, "aw-bits", 48);
+}
+
  if (vms->iommu != VIRT_IOMMU_NONE) {
  error_setg(errp, "virt machine does not support multiple IOMMUs");
  return;
diff --git a/hw/core/machine.c b/hw/core/machine.c
index fb5afdcae4..70ac96954c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -30,9 +30,12 @@
  #include "exec/confidential-guest-support.h"
  #include "hw/virtio/virtio-pci.h"
  #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/virtio-iommu.h"
  #include "audio/audio.h"
  
-GlobalProperty hw_compat_8_2[] = {};

+GlobalProperty hw_compat_8_2[] = {
+{ TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
+};
  const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
  
  GlobalProperty hw_compat_8_1[] = {

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 196827531a..ee2d379c90 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1456,6 +1456,8 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler 
*hotplug_dev,
  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
  virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), 
errp);
  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+uint8_t aw_bits = object_property_get_uint(OBJECT(dev),
+   "aw-bits", &error_abort);
  /* Declare the APIC range as the reserved MSI region */
  char *resv_prop_str = g_strdup_printf("0xfee00000:0xfeefffff:%d",
VIRTIO_IOMMU_RESV_MEM_T_MSI);
@@ -1464,6 +1466,10 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler 
*hotplug_dev,
  qlist_append_str(reserved_regions, resv_prop_str);
  qdev_prop_set_array(dev, "reserved-regions", reserved_regions);
  
+if (!aw_bits) {

+qdev_prop_set_uint8(dev, "aw-bits", VTD_HOST_AW_39BIT);
+}
+
  g_free(resv_prop_str);
  }
  
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c

index 8b541de850..2ec5ef3cd1 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -1526,7 +1526,7 @@ static Property virtio_iommu_properties[] = {
  DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
   TYPE_PCI_BUS, PCIBus *),
  DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
-DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
+DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 0),
  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/tests/qtest/virtio-iommu-test.c b/tests/qtest/virtio-iommu-test.c

index 068e7a9e6c..0f36381acb 100644
--- a/tests/qtest/virtio-iommu-test.c
+++ b/tests/qtest/virtio-iommu-test.c
@@ -34,7 +34,7 @@

[PATCH v2 09/21] memory: Add Error** argument to the global_dirty_log routines

2024-02-27 Thread Cédric Le Goater

Now that the log_global*() handlers take an Error** parameter and
return a bool, do the same for memory_global_dirty_log_start() and
memory_global_dirty_log_stop(). The error is reported in the callers
for now and it will be propagated in the call stack in the next
changes.

Note a functional change in ram_init_bitmaps(): if the dirty
pages logger fails to start, there is no need to synchronize the dirty
pages bitmaps. colo_incoming_start_dirty_log() could be modified in a
similar way.

Cc: Stefano Stabellini 
Cc: Anthony Perard 
Cc: Paul Durrant 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: David Hildenbrand 
Cc: Hyman Huang 
Signed-off-by: Cédric Le Goater 
---
 include/exec/memory.h | 10 --
 hw/i386/xen/xen-hvm.c |  4 ++--
 migration/dirtyrate.c | 21 +
 migration/ram.c   | 34 ++
 system/memory.c   | 30 --
 5 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 
4bc146c5ebdd377cd14a4e462f32cc945db5a0a8..8b019465ab13ce85c03075c80865a0865ea1feed
 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2576,15 +2576,21 @@ void memory_listener_unregister(MemoryListener 
*listener);
  * memory_global_dirty_log_start: begin dirty logging for all regions
  *
  * @flags: purpose of starting dirty log, migration or dirty rate
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
  */
-void memory_global_dirty_log_start(unsigned int flags);
+bool memory_global_dirty_log_start(unsigned int flags, Error **errp);
 
 /**
  * memory_global_dirty_log_stop: end dirty logging for all regions
  *
  * @flags: purpose of stopping dirty log, migration or dirty rate
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
  */
-void memory_global_dirty_log_stop(unsigned int flags);
+bool memory_global_dirty_log_stop(unsigned int flags, Error **errp);
 
 void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled);
 
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index 
925a207b494b4eed52d5f360b554f18ac8a9806d..286269b47572d90e57df5ff44835bb5f8e16c7ad
 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -655,9 +655,9 @@ void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t 
length)
 void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
 {
 if (enable) {
-memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
+memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp);
 } else {
-memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
+memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION, errp);
 }
 }
 
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 
1d2e85746fb7b10eb7f149976970f9a92125af8a..34f6d803ff5f4e6ccf2e06aaaed65a336c4be469
 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -90,11 +90,17 @@ static int64_t do_calculate_dirtyrate(DirtyPageRecord 
dirty_pages,
 
 void global_dirty_log_change(unsigned int flag, bool start)
 {
+Error *local_err = NULL;
+bool ret;
+
 bql_lock();
 if (start) {
-memory_global_dirty_log_start(flag);
+ret = memory_global_dirty_log_start(flag, &local_err);
 } else {
-memory_global_dirty_log_stop(flag);
+ret = memory_global_dirty_log_stop(flag, &local_err);
+}
+if (!ret) {
+error_report_err(local_err);
 }
 bql_unlock();
 }
@@ -106,10 +112,14 @@ void global_dirty_log_change(unsigned int flag, bool 
start)
  */
 static void global_dirty_log_sync(unsigned int flag, bool one_shot)
 {
+Error *local_err = NULL;
+
 bql_lock();
 memory_global_dirty_log_sync(false);
 if (one_shot) {
-memory_global_dirty_log_stop(flag);
+if (!memory_global_dirty_log_stop(flag, &local_err)) {
+error_report_err(local_err);
+}
 }
 bql_unlock();
 }
@@ -608,9 +618,12 @@ static void calculate_dirtyrate_dirty_bitmap(struct 
DirtyRateConfig config)
 {
 int64_t start_time;
 DirtyPageRecord dirty_pages;
+Error *local_err = NULL;
 
 bql_lock();
-memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
+if (!memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE, &local_err)) {
+error_report_err(local_err);
+}
 
 /*
  * 1'round of log sync may return all 1 bits with
diff --git a/migration/ram.c b/migration/ram.c
index 
d648134133fc22cd91c7b2064198a90287ee733d..9fb1875aad73b2fa009199bdfa8960339df7287d
 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2391,6 +2391,7 @@ static void ram_save_cleanup(void *opaque)
 {
 RAMState **rsp = opaque;
 RAMBlock *block;
+Error *local_err = NULL;
 
 /* We don't use dirty log with background snapshots */
 if (!migrate_background_snapshot()) {
@@ -2403,7 +2404,10 @@ static

[PATCH v2 07/21] migration: Add Error** argument to .load_setup() handler

2024-02-27 Thread Cédric Le Goater

This will be useful to report errors at a higher level, mostly in VFIO
today.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Cédric Le Goater 
---
 include/migration/register.h |  3 ++-
 hw/vfio/migration.c  |  2 +-
 migration/ram.c  |  3 ++-
 migration/savevm.c   | 10 ++
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index 
96eae9dba2970552c379c732393e3ab6ef578a58..2cfc167f717de8e08c1ca8accdc3011c03eb1554
 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -231,10 +231,11 @@ typedef struct SaveVMHandlers {
  *
  * @f: QEMUFile where to receive the data
  * @opaque: data pointer passed to register_savevm_live()
+ * @errp: pointer to Error*, to store an error if it happens.
  *
  * Returns zero to indicate success and negative for error
  */
-int (*load_setup)(QEMUFile *f, void *opaque);
+int (*load_setup)(QEMUFile *f, void *opaque, Error **errp);
 
 /**
  * @load_cleanup
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
8bcb4bc73cd5ba5338e3ffa4d907d0e6bfbb9485..2dfbe671f6f45aa530c7341177bb532d8292cecd
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -580,7 +580,7 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
 }
 }
 
-static int vfio_load_setup(QEMUFile *f, void *opaque)
+static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 VFIODevice *vbasedev = opaque;
 
diff --git a/migration/ram.c b/migration/ram.c
index 
745482899e18c86b73261b683c1bec04039a76d2..d648134133fc22cd91c7b2064198a90287ee733d
 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3498,8 +3498,9 @@ void colo_release_ram_cache(void)
  *
  * @f: QEMUFile where to receive the data
  * @opaque: RAMState pointer
+ * @errp: pointer to Error*, to store an error if it happens.
  */
-static int ram_load_setup(QEMUFile *f, void *opaque)
+static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp)
 {
 xbzrle_load_setup();
 ramblock_recv_map_init();
diff --git a/migration/savevm.c b/migration/savevm.c
index 
b5b3b51bad94dc4c04ae22cd687ba111299339aa..a4ef41d3ff5b471a1cd4166c2dc5813e44ea3a5a
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2741,7 +2741,7 @@ static void 
qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
 trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
 }
 
-static int qemu_loadvm_state_setup(QEMUFile *f)
+static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp)
 {
 SaveStateEntry *se;
 int ret;
@@ -2757,10 +2757,11 @@ static int qemu_loadvm_state_setup(QEMUFile *f)
 }
 }
 
-ret = se->ops->load_setup(f, se->opaque);
+ret = se->ops->load_setup(f, se->opaque, errp);
 if (ret < 0) {
+error_prepend(errp, "Load state of device %s failed: ",
+  se->idstr);
 qemu_file_set_error(f, ret);
-error_report("Load state of device %s failed", se->idstr);
 return ret;
 }
 }
@@ -2941,7 +2942,8 @@ int qemu_loadvm_state(QEMUFile *f)
 return ret;
 }
 
-if (qemu_loadvm_state_setup(f) != 0) {
+if (qemu_loadvm_state_setup(f, &local_err) != 0) {
+error_report_err(local_err);
 return -EINVAL;
 }
 
-- 
2.43.2

[PATCH v2 01/21] migration: Report error when shutdown fails

2024-02-27 Thread Cédric Le Goater

This will help detect issues regarding I/O channels usage.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
---
 migration/qemu-file.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 
94231ff2955c80b3d0fab11a40510d34c334a826..b69e0c62e2fcf21d346a3687df7eebee23791fdc
 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -62,6 +62,8 @@ struct QEMUFile {
  */
 int qemu_file_shutdown(QEMUFile *f)
 {
+Error *err = NULL;
+
 /*
  * We must set qemufile error before the real shutdown(), otherwise
  * there can be a race window where we thought IO all went though
@@ -90,7 +92,8 @@ int qemu_file_shutdown(QEMUFile *f)
 return -ENOSYS;
 }
 
-if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL) < 0) {
+if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, &err) < 0) {
+error_report_err(err);
 return -EIO;
 }
 
-- 
2.43.2

[PATCH v2 18/21] memory: Add Error** argument to memory_get_xlat_addr()

2024-02-27 Thread Cédric Le Goater

Let the callers do the reporting. This will be useful in
vfio_iommu_map_dirty_notify().

Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: David Hildenbrand 
Signed-off-by: Cédric Le Goater 
---
 include/exec/memory.h  | 15 ++-
 hw/vfio/common.c   | 13 +
 hw/virtio/vhost-vdpa.c |  5 -
 system/memory.c| 10 +-
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 
8b019465ab13ce85c03075c80865a0865ea1feed..baca989023415b69be3b4b4e7a622f983182314b
 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -771,9 +771,22 @@ void 
ram_discard_manager_register_listener(RamDiscardManager *rdm,
 void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
  RamDiscardListener *rdl);
 
+/**
+ * memory_get_xlat_addr: Extract addresses from a TLB entry
+ *
+ * @iotlb: pointer to an #IOMMUTLBEntry
+ * @vaddr: virtual address
+ * @ram_addr: RAM address
+ * @read_only: indicates if writes are allowed
+ * @mr_has_discard_manager: indicates memory is controlled by a
+ *  RamDiscardManager
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: true on success, else false setting @errp with error.
+ */
 bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
   ram_addr_t *ram_addr, bool *read_only,
-  bool *mr_has_discard_manager);
+  bool *mr_has_discard_manager, Error **errp);
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
 typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
e51757e7d747c60b67deb966bb29b946a511b328..43f37447e3692ffa97788b02f83b81b44aaf301a
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -262,12 +262,13 @@ static bool 
vfio_listener_skipped_section(MemoryRegionSection *section)
 
 /* Called with rcu_read_lock held.  */
 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
-   ram_addr_t *ram_addr, bool *read_only)
+   ram_addr_t *ram_addr, bool *read_only,
+   Error **errp)
 {
 bool ret, mr_has_discard_manager;
 
 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
-   &mr_has_discard_manager);
+   &mr_has_discard_manager, errp);
 if (ret && mr_has_discard_manager) {
 /*
  * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
@@ -297,6 +298,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 hwaddr iova = iotlb->iova + giommu->iommu_offset;
 void *vaddr;
 int ret;
+Error *local_err = NULL;
 
 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
 iova, iova + iotlb->addr_mask);
@@ -313,7 +315,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
 bool read_only;
 
-if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
+if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
+error_report_err(local_err);
 goto out;
 }
 /*
@@ -1226,6 +1229,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 VFIOContainerBase *bcontainer = giommu->bcontainer;
 hwaddr iova = iotlb->iova + giommu->iommu_offset;
 ram_addr_t translated_addr;
+Error *local_err = NULL;
 int ret = -EINVAL;
 
 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
@@ -1237,7 +1241,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 }
 
 rcu_read_lock();
-if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
+if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
+error_report_err(local_err);
 goto out_lock;
 }
 
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 
ddae494ca8e8154ce03b88bc781fe9f1e639aceb..a6f06266cfc798b20b98001fa97ce771722175ec
 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -203,6 +203,7 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 void *vaddr;
 int ret;
 Int128 llend;
+Error *local_err = NULL;
 
 if (iotlb->target_as != &address_space_memory) {
 error_report("Wrong target AS \"%s\", only system memory is allowed",
@@ -222,7 +223,9 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
 bool read_only;
 
-if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL)) {
+if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
+  &local_err)) {
+

1 2 3 4 5 >

1 - 100 of 447 matches

Mail list logo