[PATCH RFC 0/1] tcg: add perfmap and jitdump

2022-10-11 Thread Ilya Leoshkevich
Hi,

This is a rework of jitdump and perfmap patches from Vanderson and
Alex:

- jitdump: https://wiki.qemu.org/Features/TCGCodeQuality

  v1: https://lists.gnu.org/archive/html/qemu-devel/2019-08/msg02676.html
  v2: 
https://lore.kernel.org/qemu-devel/20190830121903.17585-1-vanderson...@gmail.com/
  v9: https://www.mail-archive.com/qemu-devel@nongnu.org/msg650269.html

- perfmap

  rfc: https://lists.nongnu.org/archive/html/qemu-devel/2014-03/msg05386.html
  v2: https://lists.gnu.org/archive/html/qemu-devel/2014-07/msg02061.html
  v3: https://lists.gnu.org/archive/html/qemu-devel/2014-07/msg04289.html
  v4: https://lists.gnu.org/archive/html/qemu-devel/2015-08/msg00095.html

It addresses some of the feedback (among other things: symlink attacks,
Elf64_Ehdr on 32-bit hosts, flockfile, license headers, style) and adds
debuginfo support.

Best regards,
Ilya

Ilya Leoshkevich (1):
  tcg: add perfmap and jitdump

 accel/tcg/debuginfo.c | 108 +
 accel/tcg/debuginfo.h |  54 +++
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 333 ++
 accel/tcg/perf.h  |  28 
 accel/tcg/translate-all.c |   3 +
 docs/devel/tcg.rst|  20 +++
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 15 files changed, 610 insertions(+)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

-- 
2.37.2




[PATCH 1/1] tcg: add perfmap and jitdump

2022-10-11 Thread Ilya Leoshkevich
Add ability to dump /tmp/perf-<pid>.map and jit-<pid>.dump.
The first one allows the perf tool to map samples to each individual
translation block. The second one adds the ability to resolve symbol
names, line numbers and inspect JITed code.

Example of use:

perf record qemu-x86_64 -perfmap ./a.out
perf report

or

perf record -k 1 qemu-x86_64 -jitdump ./a.out
perf inject -j -i perf.data -o perf.data.jitted
perf report -i perf.data.jitted

Co-developed-by: Vanderson M. do Rosario 
Co-developed-by: Alex Bennée 
Signed-off-by: Ilya Leoshkevich 
---
 accel/tcg/debuginfo.c | 108 +
 accel/tcg/debuginfo.h |  54 +++
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 333 ++
 accel/tcg/perf.h  |  28 
 accel/tcg/translate-all.c |   3 +
 docs/devel/tcg.rst|  20 +++
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 15 files changed, 610 insertions(+)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

diff --git a/accel/tcg/debuginfo.c b/accel/tcg/debuginfo.c
new file mode 100644
index 00..904eb23103
--- /dev/null
+++ b/accel/tcg/debuginfo.c
@@ -0,0 +1,108 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include 
+
+#include "debuginfo.h"
+
+static QemuMutex lock;
+static Dwfl *dwfl;
+static const Dwfl_Callbacks dwfl_callbacks = {
+.find_elf = NULL,
+.find_debuginfo = dwfl_standard_find_debuginfo,
+.section_address = NULL,
+.debuginfo_path = NULL,
+};
+
+__attribute__((constructor))
+static void debuginfo_init(void)
+{
+qemu_mutex_init();
+}
+
+bool debuginfo_report_elf(const char *image_name, int image_fd,
+  target_ulong load_bias)
+{
+qemu_mutex_lock();
+
+if (dwfl == NULL) {
+dwfl = dwfl_begin(_callbacks);
+} else {
+dwfl_report_begin_add(dwfl);
+}
+
+if (dwfl == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+
+dwfl_report_elf(dwfl, image_name, image_name, image_fd, load_bias, true);
+dwfl_report_end(dwfl, NULL, NULL);
+qemu_mutex_unlock();
+return true;
+}
+
+bool debuginfo_get_symbol(target_ulong address,
+  const char **symbol, target_ulong *offset)
+{
+Dwfl_Module *dwfl_module;
+GElf_Off dwfl_offset;
+GElf_Sym dwfl_sym;
+
+qemu_mutex_lock();
+
+if (dwfl == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+
+dwfl_module = dwfl_addrmodule(dwfl, address);
+if (dwfl_module == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+
+*symbol = dwfl_module_addrinfo(dwfl_module, address, _offset,
+   _sym, NULL, NULL, NULL);
+if (*symbol == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+*offset = dwfl_offset;
+qemu_mutex_unlock();
+return true;
+}
+
+bool debuginfo_get_line(target_ulong address,
+const char **file, int *line)
+{
+Dwfl_Module *dwfl_module;
+Dwfl_Line *dwfl_line;
+
+qemu_mutex_lock();
+
+if (dwfl == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+
+dwfl_module = dwfl_addrmodule(dwfl, address);
+if (dwfl_module == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+
+dwfl_line = dwfl_module_getsrc(dwfl_module, address);
+if (dwfl_line == NULL) {
+qemu_mutex_unlock();
+return false;
+}
+*file = dwfl_lineinfo(dwfl_line, NULL, line, 0, NULL, NULL);
+qemu_mutex_unlock();
+return true;
+}
diff --git a/accel/tcg/debuginfo.h b/accel/tcg/debuginfo.h
new file mode 100644
index 00..f4f22aa786
--- /dev/null
+++ b/accel/tcg/debuginfo.h
@@ -0,0 +1,54 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef ACCEL_TCG_DEBUGINFO_H
+#define ACCEL_TCG_DEBUGINFO_H
+
+#include "exec/cpu-defs.h"
+
+#ifdef CONFIG_LIBDW
+/*
+ * Load debuginfo for the specified guest ELF image.
+ * Return true on success, false on failure.
+ */
+bool debuginfo_report_elf(const char *image_name, int image_fd,
+  target_ulong load_bias);
+
+/*
+ * Find a symbol name associated with the specified guest PC.
+ * Return true on success, false if there is no associated symbol.
+ */
+bool debuginfo_get_symbol(target_ulong address,
+  const char **symbol, target_ulong *offset);
+
+/*
+ * Find a line number associated with the specified guest PC.
+ * Return true on success, false if there is no associated line number.
+ */
+bool 

Re: [PATCH v2 3/8] riscv: re-randomize rng-seed on reboot

2022-10-11 Thread Alistair Francis
On Wed, Oct 12, 2022 at 6:55 AM Jason A. Donenfeld  wrote:
>
> When the system reboots, the rng-seed that the FDT has should be
> re-randomized, so that the new boot gets a new seed. Since the FDT is in
> the ROM region at this point, we add a hook right after the ROM has been
> added, so that we have a pointer to that copy of the FDT.
>
> Cc: Palmer Dabbelt 
> Cc: Alistair Francis 
> Cc: Bin Meng 
> Cc: qemu-ri...@nongnu.org
> Signed-off-by: Jason A. Donenfeld 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/boot.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> index 1ae7596873..aaecf21543 100644
> --- a/hw/riscv/boot.c
> +++ b/hw/riscv/boot.c
> @@ -30,6 +30,7 @@
>  #include "sysemu/device_tree.h"
>  #include "sysemu/qtest.h"
>  #include "sysemu/kvm.h"
> +#include "sysemu/reset.h"
>
>  #include 
>
> @@ -241,6 +242,8 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
> mem_size, void *fdt)
>
>  rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr,
>_space_memory);
> +qemu_register_reset(qemu_fdt_randomize_seeds,
> +rom_ptr_for_as(_space_memory, fdt_addr, 
> fdtsize));
>
>  return fdt_addr;
>  }
> --
> 2.37.3
>
>



Re: [PATCH v2 1/8] device-tree: add re-randomization helper function

2022-10-11 Thread Alistair Francis
On Wed, Oct 12, 2022 at 6:57 AM Jason A. Donenfeld  wrote:
>
> When the system reboots, the rng-seed that the FDT has should be
> re-randomized, so that the new boot gets a new seed. Several
> architectures require this functionality, so export a function for
> injecting a new seed into the given FDT.
>
> Cc: Alistair Francis 
> Cc: David Gibson 
> Signed-off-by: Jason A. Donenfeld 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  include/sysemu/device_tree.h |  9 +
>  softmmu/device_tree.c| 21 +
>  2 files changed, 30 insertions(+)
>
> diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h
> index ef060a9759..d552f324b6 100644
> --- a/include/sysemu/device_tree.h
> +++ b/include/sysemu/device_tree.h
> @@ -196,6 +196,15 @@ int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
>  qdt_tmp); \
>  })
>
> +
> +/**
> + * qemu_fdt_randomize_seeds:
> + * @fdt: device tree blob
> + *
> + * Re-randomize all "rng-seed" properties with new seeds.
> + */
> +void qemu_fdt_randomize_seeds(void *fdt);
> +
>  #define FDT_PCI_RANGE_RELOCATABLE  0x8000
>  #define FDT_PCI_RANGE_PREFETCHABLE 0x4000
>  #define FDT_PCI_RANGE_ALIASED  0x2000
> diff --git a/softmmu/device_tree.c b/softmmu/device_tree.c
> index 6ca3fad285..d986c7b7b3 100644
> --- a/softmmu/device_tree.c
> +++ b/softmmu/device_tree.c
> @@ -22,6 +22,7 @@
>  #include "qemu/option.h"
>  #include "qemu/bswap.h"
>  #include "qemu/cutils.h"
> +#include "qemu/guest-random.h"
>  #include "sysemu/device_tree.h"
>  #include "hw/loader.h"
>  #include "hw/boards.h"
> @@ -643,3 +644,23 @@ out:
>  g_free(propcells);
>  return ret;
>  }
> +
> +void qemu_fdt_randomize_seeds(void *fdt)
> +{
> +int noffset, poffset, len;
> +const char *name;
> +uint8_t *data;
> +
> +for (noffset = fdt_next_node(fdt, 0, NULL);
> + noffset >= 0;
> + noffset = fdt_next_node(fdt, noffset, NULL)) {
> +for (poffset = fdt_first_property_offset(fdt, noffset);
> + poffset >= 0;
> + poffset = fdt_next_property_offset(fdt, poffset)) {
> +data = (uint8_t *)fdt_getprop_by_offset(fdt, poffset, , 
> );
> +if (!data || strcmp(name, "rng-seed"))
> +continue;
> +qemu_guest_getrandom_nofail(data, len);
> +}
> +}
> +}
> --
> 2.37.3
>
>



Re: [PATCH 1/4] tests/acpi: virt: allow acpi MADT and FADT changes

2022-10-11 Thread Ani Sinha



On Tue, 11 Oct 2022, Miguel Luis wrote:

> Step 3 from bios-tables-test.c documented procedure.
>
> Signed-off-by: Miguel Luis 

Acked-by: Ani Sinha 

> ---
>  tests/qtest/bios-tables-test-allowed-diff.h | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
> b/tests/qtest/bios-tables-test-allowed-diff.h
> index dfb8523c8b..8dc50f7a8a 100644
> --- a/tests/qtest/bios-tables-test-allowed-diff.h
> +++ b/tests/qtest/bios-tables-test-allowed-diff.h
> @@ -1 +1,7 @@
>  /* List of comma-separated changed AML files to ignore */
> +"tests/data/acpi/virt/FACP",
> +"tests/data/acpi/virt/FACP.numamem",
> +"tests/data/acpi/virt/FACP.memhp",
> +"tests/data/acpi/virt/APIC",
> +"tests/data/acpi/virt/APIC.memhp",
> +"tests/data/acpi/virt/APIC.numamem",
> --
> 2.37.3
>
>



Re: [PATCH 4/4] tests/acpi: virt: update ACPI MADT and FADT binaries

2022-10-11 Thread Ani Sinha



On Tue, 11 Oct 2022, Miguel Luis wrote:

> Step 6 & 7 of the bios-tables-test.c documented procedure.
>
> Differences between disassembled ASL files for MADT:
>
> @@ -11,9 +11,9 @@
>   */
>
>  [000h    4]Signature : "APIC"[Multiple APIC 
> Description Table (MADT)]
> -[004h 0004   4] Table Length : 00A8
> -[008h 0008   1] Revision : 03
> -[009h 0009   1] Checksum : 50
> +[004h 0004   4] Table Length : 00AC
> +[008h 0008   1] Revision : 04
> +[009h 0009   1] Checksum : 47
>  [00Ah 0010   6]   Oem ID : "BOCHS "
>  [010h 0016   8] Oem Table ID : "BXPC"
>  [018h 0024   4] Oem Revision : 0001
> @@ -34,7 +34,7 @@
>  [041h 0065   3] Reserved : 00
>
>  [044h 0068   1]Subtable Type : 0B [Generic Interrupt 
> Controller]
> -[045h 0069   1]   Length : 4C
> +[045h 0069   1]   Length : 50
>  [046h 0070   2] Reserved : 
>  [048h 0072   4] CPU Interface Number : 
>  [04Ch 0076   4]Processor UID : 
> @@ -51,28 +51,29 @@
>  [07Ch 0124   4]Virtual GIC Interrupt : 
>  [080h 0128   8]   Redistributor Base Address : 
>  [088h 0136   8]ARM MPIDR : 
> -/ ACPI subtable terminates early - may be older version (dump table) */
> +[090h 0144   1] Efficiency Class : 00
> +[091h 0145   3] Reserved : 00
>
> -[090h 0144   1]Subtable Type : 0D [Generic MSI Frame]
> -[091h 0145   1]   Length : 18
> -[092h 0146   2] Reserved : 
> -[094h 0148   4] MSI Frame ID : 
> -[098h 0152   8] Base Address : 0802
> -[0A0h 0160   4]Flags (decoded below) : 0001
> +[094h 0148   1]Subtable Type : 0D [Generic MSI Frame]
> +[095h 0149   1]   Length : 18
> +[096h 0150   2] Reserved : 
> +[098h 0152   4] MSI Frame ID : 
> +[09Ch 0156   8] Base Address : 0802
> +[0A4h 0164   4]Flags (decoded below) : 0001
>Select SPI : 1
> -[0A4h 0164   2]SPI Count : 0040
> -[0A6h 0166   2] SPI Base : 0050
> +[0A8h 0168   2]SPI Count : 0040
> +[0AAh 0170   2] SPI Base : 0050
>
> -Raw Table Data: Length 168 (0xA8)
> +Raw Table Data: Length 172 (0xAC)
>
> -: 41 50 49 43 A8 00 00 00 03 50 42 4F 43 48 53 20  // APIC.PBOCHS
> +: 41 50 49 43 AC 00 00 00 04 47 42 4F 43 48 53 20  // APIC.GBOCHS
>  0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43  // BXPC
> BXPC
>  0020: 01 00 00 00 00 00 00 00 00 00 00 00 0C 18 00 00  // 
> 
>  0030: 00 00 00 00 00 00 00 08 00 00 00 00 00 00 00 00  // 
> 
> -0040: 02 00 00 00 0B 4C 00 00 00 00 00 00 00 00 00 00  // 
> .L..
> +0040: 02 00 00 00 0B 50 00 00 00 00 00 00 00 00 00 00  // 
> .P..
>  0050: 01 00 00 00 00 00 00 00 17 00 00 00 00 00 00 00  // 
> 
>  0060: 00 00 00 00 00 00 01 08 00 00 00 00 00 00 04 08  // 
> 
>  0070: 00 00 00 00 00 00 03 08 00 00 00 00 00 00 00 00  // 
> 
>  0080: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  // 
> 
> -0090: 0D 18 00 00 00 00 00 00 00 00 02 08 00 00 00 00  // 
> 
> -00A0: 01 00 00 00 40 00 50 00  // @.P.
> +0090: 00 00 00 00 0D 18 00 00 00 00 00 00 00 00 02 08  // 
> 
> +00A0: 00 00 00 00 01 00 00 00 40 00 50 00  // @.P.
>
> Differences between disassembled ASL files for FADT:
>
> @@ -11,9 +11,9 @@
>   */
>
>  [000h    4]Signature : "FACP"[Fixed ACPI 
> Description Table (FADT)]
> -[004h 0004   4] Table Length : 010C
> -[008h 0008   1] Revision : 05
> -[009h 0009   1] Checksum : 55
> +[004h 0004   4] Table Length : 0114
> +[008h 0008   1] Revision : 06
> +[009h 0009   1] Checksum : 15
>  [00Ah 0010   6]   Oem ID : "BOCHS "
>  [010h 0016   8] Oem Table ID : "BXPC"
>  [018h 0024   4] Oem Revision : 0001
> @@ -99,7 +99,7 @@
>PSCI Compliant : 1
> Must use HVC for PSCI : 1
>
> -[083h 0131   1]  FADT Minor Revision : 01
> +[083h 0131   1]  FADT Minor Revision : 00
>  [084h 0132   8] FACS Address : 
>  [08Ch 0140 

Re: [PATCH v2] vhost-vdpa: allow passing opened vhostfd to vhost-vdpa

2022-10-11 Thread Jason Wang
On Tue, Oct 11, 2022 at 1:18 AM Si-Wei Liu  wrote:
>
>
>
> On 10/8/2022 10:43 PM, Jason Wang wrote:
>
> On Sat, Oct 8, 2022 at 5:04 PM Si-Wei Liu  wrote:
>
> Similar to other vhost backends, vhostfd can be passed to vhost-vdpa
> backend as another parameter to instantiate vhost-vdpa net client.
> This would benefit the use case where only open file descriptors, as
> opposed to raw vhost-vdpa device paths, are accessible from the QEMU
> process.
>
> (qemu) netdev_add type=vhost-vdpa,vhostfd=61,id=vhost-vdpa1
>
> Adding Cindy.
>
> This has been discussed before, we've already had
> vhostdev=/dev/fdset/$fd which should be functional equivalent to what
> has been proposed here. (And this is how libvirt works if I understand
> correctly).
>
> Yes, I was aware of that discussion. However, our implementation of the 
> management software is a bit different from libvirt, in which the paths in 
> /dev/fdset/NNN can't be dynamically passed to the container where QEMU is 
> running. By using a specific vhostfd property with existing code, it would 
> allow our mgmt software smooth adaption without having to add too much infra 
> code to support the /dev/fdset/NNN trick.

I think fdset has extra flexibility in, e.g., hot-plug, allowing the file
descriptor to be passed with SCM_RIGHTS. It would still be good to add
the support.

>
> On the other hand, the other vhost backends, e.g. tap (via vhost-net), 
> vhost-scsi and vhost-vsock all accept vhostfd as parameter to instantiate 
> device, although the /dev/fdset trick also works there. I think vhost-vdpa is 
> not  unprecedented in this case?

Yes.

Thanks

>
> Thanks,
> -Siwei
>
>
>
> Thanks
>
> Signed-off-by: Si-Wei Liu 
> Acked-by: Eugenio Pérez 
>
> ---
> v2:
>   - fixed typo in commit message
>   - s/fd's/file descriptors/
> ---
>  net/vhost-vdpa.c | 25 -
>  qapi/net.json|  3 +++
>  qemu-options.hx  |  6 --
>  3 files changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 182b3a1..366b070 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -683,14 +683,29 @@ int net_init_vhost_vdpa(const Netdev *netdev, const 
> char *name,
>
>  assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
>  opts = >u.vhost_vdpa;
> -if (!opts->vhostdev) {
> -error_setg(errp, "vdpa character device not specified with 
> vhostdev");
> +if (!opts->has_vhostdev && !opts->has_vhostfd) {
> +error_setg(errp,
> +   "vhost-vdpa: neither vhostdev= nor vhostfd= was 
> specified");
>  return -1;
>  }
>
> -vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
> -if (vdpa_device_fd == -1) {
> -return -errno;
> +if (opts->has_vhostdev && opts->has_vhostfd) {
> +error_setg(errp,
> +   "vhost-vdpa: vhostdev= and vhostfd= are mutually 
> exclusive");
> +return -1;
> +}
> +
> +if (opts->has_vhostdev) {
> +vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
> +if (vdpa_device_fd == -1) {
> +return -errno;
> +}
> +} else if (opts->has_vhostfd) {
> +vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, 
> errp);
> +if (vdpa_device_fd == -1) {
> +error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
> +return -1;
> +}
>  }
>
>  r = vhost_vdpa_get_features(vdpa_device_fd, , errp);
> diff --git a/qapi/net.json b/qapi/net.json
> index dd088c0..926ecc8 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -442,6 +442,8 @@
>  # @vhostdev: path of vhost-vdpa device
>  #(default:'/dev/vhost-vdpa-0')
>  #
> +# @vhostfd: file descriptor of an already opened vhost vdpa device
> +#
>  # @queues: number of queues to be created for multiqueue vhost-vdpa
>  #  (default: 1)
>  #
> @@ -456,6 +458,7 @@
>  { 'struct': 'NetdevVhostVDPAOptions',
>'data': {
>  '*vhostdev': 'str',
> +'*vhostfd':  'str',
>  '*queues':   'int',
>  '*x-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 913c71e..c040f74 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2774,8 +2774,10 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>  "configure a vhost-user network, backed by a chardev 
> 'dev'\n"
>  #endif
>  #ifdef __linux__
> -"-netdev vhost-vdpa,id=str,vhostdev=/path/to/dev\n"
> +"-netdev vhost-vdpa,id=str[,vhostdev=/path/to/dev][,vhostfd=h]\n"
>  "configure a vhost-vdpa network,Establish a vhost-vdpa 
> netdev\n"
> +"use 'vhostdev=/path/to/dev' to open a vhost vdpa 
> device\n"
> +"use 'vhostfd=h' to connect to an already opened vhost 
> vdpa device\n"
>  #endif
>  #ifdef CONFIG_VMNET
>  "-netdev vmnet-host,id=str[,isolated=on|off][,net-uuid=uuid]\n"
> @@ -3280,7 +3282,7 @@ SRST
>   

Re: [PATCH] target/riscv: pmp: Fixup TLB size calculation

2022-10-11 Thread Alistair Francis
On Wed, Oct 12, 2022 at 12:50 PM LIU Zhiwei
 wrote:
>
> Reviewed-by: LIU Zhiwei

Thanks!

>
> By the way, we missed one related patch that once had been picked to 
> riscv-next patch.
>
> The patch v3:
> https://lore.kernel.org/all/ceeb4037-6d17-0a09-f35a-eaf328033...@c-sky.com/T/#m183e4430bda408bc3a2b2751aa94eff7fc02e23c

So this was applied but caused boot failures so it was dropped from my
RISC-V tree

>
> The patch v4:
> https://lists.gnu.org/archive/html/qemu-devel/2021-12/msg02854.html

I think I misunderstood this comment [1] as applying to v4 and it
never got applied.

Do you mind resending the patch?

1: 
https://lore.kernel.org/all/ceeb4037-6d17-0a09-f35a-eaf328033...@c-sky.com/T/#m5e958d702d9905169a941f2ae59fdf7ac4a02383

Alistair

>
> I think the patch v4 should be taken at the same time with this patch.
>
> Thanks,
> Zhiwei
>
> On 2022/10/12 9:14, Alistair Francis wrote:
> > From: Alistair Francis 
> >
> > Since commit 4047368938f6 "accel/tcg: Introduce tlb_set_page_full" we
> > have been seeing this assert
> >
> >  ../accel/tcg/cputlb.c:1294: tlb_set_page_with_attrs: Assertion 
> > `is_power_of_2(size)' failed.
> >
> > When running Tock on the OpenTitan machine.
> >
> > The issue is that pmp_get_tlb_size() would return a TLB size that wasn't
> > a power of 2. The size was also smaller then TARGET_PAGE_SIZE.
> >
> > This patch ensures that any TLB size less then TARGET_PAGE_SIZE is
> > rounded down to 1 to ensure it's a valid size.
> >
> > Signed-off-by: Alistair Francis 
> > ---
> > This is based on advice from Richard:
> > https://patchwork.kernel.org/project/qemu-devel/patch/20221004141051.110653-9-richard.hender...@linaro.org/#25043166
> >
> >   target/riscv/pmp.c | 12 
> >   1 file changed, 12 insertions(+)
> >
> > diff --git a/target/riscv/pmp.c b/target/riscv/pmp.c
> > index ea2b67d947..2b43e399b8 100644
> > --- a/target/riscv/pmp.c
> > +++ b/target/riscv/pmp.c
> > @@ -628,6 +628,18 @@ bool pmp_is_range_in_tlb(CPURISCVState *env, hwaddr 
> > tlb_sa,
> >   }
> >
> >   if (*tlb_size != 0) {
> > +/*
> > + * At this point we have a tlb_size that is the smallest possible 
> > size
> > + * That fits within a TARGET_PAGE_SIZE and the PMP region.
> > + *
> > + * If the size is less then TARGET_PAGE_SIZE we drop the size to 1.
> > + * This means the result isn't cached in the TLB and is only used 
> > for
> > + * a single translation.
> > + */
> > +if (*tlb_size < TARGET_PAGE_SIZE) {
> > +*tlb_size = 1;
> > +}
> > +
> >   return true;
> >   }
> >



Re: [PATCH] target/riscv: pmp: Fixup TLB size calculation

2022-10-11 Thread LIU Zhiwei

Reviewed-by: LIU Zhiwei

By the way, we missed one related patch that once had been picked to riscv-next 
patch.

The patch v3:
https://lore.kernel.org/all/ceeb4037-6d17-0a09-f35a-eaf328033...@c-sky.com/T/#m183e4430bda408bc3a2b2751aa94eff7fc02e23c

The patch v4:
https://lists.gnu.org/archive/html/qemu-devel/2021-12/msg02854.html

I think the patch v4 should be taken at the same time with this patch.

Thanks,
Zhiwei

On 2022/10/12 9:14, Alistair Francis wrote:

From: Alistair Francis 

Since commit 4047368938f6 "accel/tcg: Introduce tlb_set_page_full" we
have been seeing this assert

 ../accel/tcg/cputlb.c:1294: tlb_set_page_with_attrs: Assertion 
`is_power_of_2(size)' failed.

When running Tock on the OpenTitan machine.

The issue is that pmp_get_tlb_size() would return a TLB size that wasn't
a power of 2. The size was also smaller than TARGET_PAGE_SIZE.

This patch ensures that any TLB size less than TARGET_PAGE_SIZE is
rounded down to 1 to ensure it's a valid size.

Signed-off-by: Alistair Francis 
---
This is based on advice from Richard:
https://patchwork.kernel.org/project/qemu-devel/patch/20221004141051.110653-9-richard.hender...@linaro.org/#25043166

  target/riscv/pmp.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/target/riscv/pmp.c b/target/riscv/pmp.c
index ea2b67d947..2b43e399b8 100644
--- a/target/riscv/pmp.c
+++ b/target/riscv/pmp.c
@@ -628,6 +628,18 @@ bool pmp_is_range_in_tlb(CPURISCVState *env, hwaddr tlb_sa,
  }
  
  if (*tlb_size != 0) {

+/*
+ * At this point we have a tlb_size that is the smallest possible size
+ * That fits within a TARGET_PAGE_SIZE and the PMP region.
+ *
+ * If the size is less then TARGET_PAGE_SIZE we drop the size to 1.
+ * This means the result isn't cached in the TLB and is only used for
+ * a single translation.
+ */
+if (*tlb_size < TARGET_PAGE_SIZE) {
+*tlb_size = 1;
+}
+
  return true;
  }
  




Re: [PATCH v8 5/8] KVM: Register/unregister the guest private memory regions

2022-10-11 Thread Chao Peng
On Tue, Oct 11, 2022 at 10:48:58AM +0100, Fuad Tabba wrote:
> Hi,
> 
> On Thu, Sep 15, 2022 at 3:38 PM Chao Peng  wrote:
> >
> > If CONFIG_HAVE_KVM_PRIVATE_MEM=y, userspace can register/unregister the
> > guest private memory regions through KVM_MEMORY_ENCRYPT_{UN,}REG_REGION
> > ioctls. The patch reuses existing SEV ioctl number but differs that the
> > address in the region for KVM_PRIVATE_MEM case is gpa while for SEV case
> > it's hva. Which usages should the ioctls go is determined by the newly
> > added kvm_arch_has_private_mem(). Architecture which supports
> > KVM_PRIVATE_MEM should override this function.
> >
> > The current implementation defaults all memory to private. The shared
> > memory regions are stored in a xarray variable for memory efficiency and
> > zapping existing memory mappings is also a side effect of these two
> > ioctls when defined.
> >
> > Signed-off-by: Chao Peng 
> > ---
> >  Documentation/virt/kvm/api.rst  | 17 ++--
> >  arch/x86/include/asm/kvm_host.h |  1 +
> >  arch/x86/kvm/mmu.h  |  2 -
> >  include/linux/kvm_host.h| 13 ++
> >  virt/kvm/kvm_main.c | 73 +
> >  5 files changed, 100 insertions(+), 6 deletions(-)
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 1a6c003b2a0b..c0f800d04ffc 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -4715,10 +4715,19 @@ 
> > Documentation/virt/kvm/x86/amd-memory-encryption.rst.
> >  This ioctl can be used to register a guest memory region which may
> >  contain encrypted data (e.g. guest RAM, SMRAM etc).
> >
> > -It is used in the SEV-enabled guest. When encryption is enabled, a guest
> > -memory region may contain encrypted data. The SEV memory encryption
> > -engine uses a tweak such that two identical plaintext pages, each at
> > -different locations will have differing ciphertexts. So swapping or
> > +Currently this ioctl supports registering memory regions for two usages:
> > +private memory and SEV-encrypted memory.
> > +
> > +When private memory is enabled, this ioctl is used to register guest 
> > private
> > +memory region and the addr/size of kvm_enc_region represents guest physical
> > +address (GPA). In this usage, this ioctl zaps the existing guest memory
> > +mappings in KVM that fallen into the region.
> > +
> > +When SEV-encrypted memory is enabled, this ioctl is used to register guest
> > +memory region which may contain encrypted data for a SEV-enabled guest. The
> > +addr/size of kvm_enc_region represents userspace address (HVA). The SEV
> > +memory encryption engine uses a tweak such that two identical plaintext 
> > pages,
> > +each at different locations will have differing ciphertexts. So swapping or
> >  moving ciphertext of those pages will not result in plaintext being
> >  swapped. So relocating (or migrating) physical backing pages for the SEV
> >  guest will require some additional steps.
> > diff --git a/arch/x86/include/asm/kvm_host.h 
> > b/arch/x86/include/asm/kvm_host.h
> > index 2c96c43c313a..cfad6ba1a70a 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -37,6 +37,7 @@
> >  #include 
> >
> >  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> > +#define __KVM_HAVE_ZAP_GFN_RANGE
> >
> >  #define KVM_MAX_VCPUS 1024
> >
> > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > index 6bdaacb6faa0..c94b620bf94b 100644
> > --- a/arch/x86/kvm/mmu.h
> > +++ b/arch/x86/kvm/mmu.h
> > @@ -211,8 +211,6 @@ static inline u8 permission_fault(struct kvm_vcpu 
> > *vcpu, struct kvm_mmu *mmu,
> > return -(u32)fault & errcode;
> >  }
> >
> > -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> > -
> >  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
> >
> >  int kvm_mmu_post_init_vm(struct kvm *kvm);
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 2125b50f6345..d65690cae80b 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -260,6 +260,15 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct 
> > kvm_gfn_range *range);
> >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> >  #endif
> >
> > +#ifdef __KVM_HAVE_ZAP_GFN_RANGE
> > +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> > +#else
> > +static inline void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start
> > + gfn_t gfn_end)
> > +{
> > +}
> > +#endif
> > +
> >  enum {
> > OUTSIDE_GUEST_MODE,
> > IN_GUEST_MODE,
> > @@ -795,6 +804,9 @@ struct kvm {
> > struct notifier_block pm_notifier;
> >  #endif
> > char stats_id[KVM_STATS_NAME_SIZE];
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +   struct xarray mem_attr_array;
> > +#endif
> >  };
> >
> >  #define kvm_err(fmt, ...) \
> > @@ -1454,6 +1466,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct 
> 

Re: [PATCH 1/4] hw/acpi/aml-build: Only generate cluster node in PPTT when specified

2022-10-11 Thread Yicong Yang via
On 2022/10/7 21:48, Michael S. Tsirkin wrote:
> On Thu, Sep 22, 2022 at 09:11:40PM +0800, Yicong Yang wrote:
>> From: Yicong Yang 
>>
>> Currently we'll always generate a cluster node no matter user has
>> specified '-smp clusters=X' or not. Cluster is an optional level
>> and it's unncessary to build it if user don't need. So only generate
>> it when user specify explicitly.
>>
>> Also update the test ACPI tables.
>>
>> Signed-off-by: Yicong Yang 
> 
> This is an example of a commit log repeating what the patch does.
> Which is ok but the important thing is to explain the motivation -
> why is it a bug to generate a cluster node without '-smp clusters'?
> 

It may not be a bug, but it may build an unneeded topology unintentionally
and doesn't provide a way to inhibit this. So I thought the policy
could be improved.

Thanks.

> 
>> ---
>>  hw/acpi/aml-build.c   | 2 +-
>>  hw/core/machine-smp.c | 3 +++
>>  include/hw/boards.h   | 2 ++
>>  3 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
>> index e6bfac95c7..aab73af66d 100644
>> --- a/hw/acpi/aml-build.c
>> +++ b/hw/acpi/aml-build.c
>> @@ -2030,7 +2030,7 @@ void build_pptt(GArray *table_data, BIOSLinker 
>> *linker, MachineState *ms,
>>  0, socket_id, NULL, 0);
>>  }
>>  
>> -if (mc->smp_props.clusters_supported) {
>> +if (mc->smp_props.clusters_supported && ms->smp.build_cluster) {
>>  if (cpus->cpus[n].props.cluster_id != cluster_id) {
>>  assert(cpus->cpus[n].props.cluster_id > cluster_id);
>>  cluster_id = cpus->cpus[n].props.cluster_id;
>> diff --git a/hw/core/machine-smp.c b/hw/core/machine-smp.c
>> index b39ed21e65..5d37e8d07a 100644
>> --- a/hw/core/machine-smp.c
>> +++ b/hw/core/machine-smp.c
>> @@ -158,6 +158,9 @@ void machine_parse_smp_config(MachineState *ms,
>>  ms->smp.threads = threads;
>>  ms->smp.max_cpus = maxcpus;
>>  
>> +if (config->has_clusters)
>> +ms->smp.build_cluster = true;
>> +
>>  /* sanity-check of the computed topology */
>>  if (sockets * dies * clusters * cores * threads != maxcpus) {
>>  g_autofree char *topo_msg = cpu_hierarchy_to_string(ms);
>> diff --git a/include/hw/boards.h b/include/hw/boards.h
>> index 7b416c9787..24aafc213d 100644
>> --- a/include/hw/boards.h
>> +++ b/include/hw/boards.h
>> @@ -305,6 +305,7 @@ typedef struct DeviceMemoryState {
>>   * @cores: the number of cores in one cluster
>>   * @threads: the number of threads in one core
>>   * @max_cpus: the maximum number of logical processors on the machine
>> + * @build_cluster: build cluster topology or not
>>   */
>>  typedef struct CpuTopology {
>>  unsigned int cpus;
>> @@ -314,6 +315,7 @@ typedef struct CpuTopology {
>>  unsigned int cores;
>>  unsigned int threads;
>>  unsigned int max_cpus;
>> +bool build_cluster;
>>  } CpuTopology;
>>  
>>  /**
>> -- 
>> 2.24.0
> 
> .
> 



Re: [PATCH 1/4] hw/acpi/aml-build: Only generate cluster node in PPTT when specified

2022-10-11 Thread Yicong Yang via
On 2022/10/9 14:46, wangyanan (Y) wrote:
> Hi Yicong,
> 
> On 2022/9/22 21:11, Yicong Yang wrote:
>> From: Yicong Yang
>>
>> Currently we'll always generate a cluster node no matter user has
>> specified '-smp clusters=X' or not. Cluster is an optional level
>> and it's unncessary to build it if user don't need. So only generate
>> it when user specify explicitly.
>>
>> Also update the test ACPI tables.
> It would be much more helpful to explain the problem you
> have met in practice without this patch. (maybe have some
> description or a link of the issue in the cover-letter if we
> need a v2).
> 

My problem is related to this but not fully caused by this.

I found my schedule domains are not built as expected with command
`-smp 8` and 4 NUMA nodes. The final schedule domains built look
like below with no NUMA domains built.

[2.141316] CPU0 attaching sched-domain(s):
[2.142558]  domain-0: span=0-7 level=MC
[2.145364]   groups: 0:{ span=0 cap=964 }, 1:{ span=1 cap=914 }, 2:{ span=2 
cap=921 }, 3:{ span=3 cap=964 }, 4:{ span=4 cap=925 }, 5:{ span=5 cap=964 }, 
6:{ span=6 cap=967 }, 7:{ span=7 cap=967 }
[2.158357] CPU1 attaching sched-domain(s):
[2.158964]  domain-0: span=0-7 level=MC

should be:

[2.008885] CPU0 attaching sched-domain(s):
[2.009764]  domain-0: span=0-1 level=MC
[2.012654]   groups: 0:{ span=0 cap=962 }, 1:{ span=1 cap=925 }
[2.016532]   domain-1: span=0-3 level=NUMA
[2.017444]groups: 0:{ span=0-1 cap=1887 }, 2:{ span=2-3 cap=1871 }
[2.019354]domain-2: span=0-5 level=NUMA
[2.019983] groups: 0:{ span=0-3 cap=3758 }, 4:{ span=4-5 cap=1935 }
[2.021527] domain-3: span=0-7 level=NUMA
[2.022516]  groups: 0:{ span=0-5 mask=0-1 cap=5693 }, 6:{ span=4-7 
mask=6-7 cap=3978 }
[...]

It's because the MC level span extends to Cluster level which spans
all the cpus in the system, then the schedule domain building stops
at MC level since it already includes all the cpus.

It is confusing to people that the cluster node is generated without
asking for it.

A discussion for the problem:
https://lore.kernel.org/lkml/2c079860-ee82-7719-d3d2-756192f41...@huawei.com/

> In qemu which behaves as like a firmware vendor for VM,
> the ACPI PPTT is built based on the topology info produced
> by machine_parse_smp_config(). And machine_parse_smp_config
> will always calculate a complete topology hierarchy using its
> algorithm, if the user gives an incomplete -smp CLI.
> 

Considering that cluster is an optional level and most platforms don't
have it, users may not even realize it is built, and an always-build
policy cannot emulate the topology of these platforms.
It may also inadvertently influence the building of scheduling domains
in some cases, so...

> I think there are two options for us to chose:
> 1) approach described in this patch
> 2) qemu will always generate a full topology hierarchy in PPTT
> with all the topo members it currently supports. While users
> need to consider the necessity to use an incomplete -smp or
> an complete one according to their specific scenario, and
> should be aware of the kernel behavior resulted from the
> config.
> 

...I'd prefer 1), so that users generate this *only* when they
explicitly know what they want and what they'll get. Always generating
a full topology hierarchy lacks flexibility. Any thoughts?

> There is some Doc for users to explain how qemu will
> parse user-specified -smp in [1].
> [1] https://www.mankier.com/1/qemu#Options
> 

Thanks!
Yicong

> Thanks,
> Yanan
>> Signed-off-by: Yicong Yang
>> ---
>>   hw/acpi/aml-build.c   | 2 +-
>>   hw/core/machine-smp.c | 3 +++
>>   include/hw/boards.h   | 2 ++
>>   3 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
>> index e6bfac95c7..aab73af66d 100644
>> --- a/hw/acpi/aml-build.c
>> +++ b/hw/acpi/aml-build.c
>> @@ -2030,7 +2030,7 @@ void build_pptt(GArray *table_data, BIOSLinker 
>> *linker, MachineState *ms,
>>   0, socket_id, NULL, 0);
>>   }
>>   -    if (mc->smp_props.clusters_supported) {
>> +    if (mc->smp_props.clusters_supported && ms->smp.build_cluster) {
>>   if (cpus->cpus[n].props.cluster_id != cluster_id) {
>>   assert(cpus->cpus[n].props.cluster_id > cluster_id);
>>   cluster_id = cpus->cpus[n].props.cluster_id;
>> diff --git a/hw/core/machine-smp.c b/hw/core/machine-smp.c
>> index b39ed21e65..5d37e8d07a 100644
>> --- a/hw/core/machine-smp.c
>> +++ b/hw/core/machine-smp.c
>> @@ -158,6 +158,9 @@ void machine_parse_smp_config(MachineState *ms,
>>   ms->smp.threads = threads;
>>   ms->smp.max_cpus = maxcpus;
>>   +    if (config->has_clusters)
>> +    ms->smp.build_cluster = true;
>> +
>>   /* sanity-check of the computed topology */
>>   if (sockets * dies * clusters * cores * threads != maxcpus) {
>>   g_autofree char *topo_msg = cpu_hierarchy_to_string(ms);
>> diff --git a/include/hw/boards.h 

[PATCH] semihosting: Write back semihosting data before completion callback

2022-10-11 Thread Keith Packard via
'lock_user' allocates a host buffer to shadow a target buffer,
'unlock_user' copies that host buffer back to the target and frees the
host memory. If the completion function uses the target buffer, it
must be called after unlock_user to ensure the data are present.

This caused the arm-compatible TARGET_SYS_READC to fail, as the
completion function, common_semi_readc_cb, pulled data from the target
buffer before it had received the console data.

I decided to fix all instances of this pattern instead of just the
console_read function to make things consistent and potentially fix
bugs in other cases.

Signed-off-by: Keith Packard 
---
 semihosting/syscalls.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/semihosting/syscalls.c b/semihosting/syscalls.c
index 508a0ad88c..78ba97d7ab 100644
--- a/semihosting/syscalls.c
+++ b/semihosting/syscalls.c
@@ -321,11 +321,11 @@ static void host_read(CPUState *cs, 
gdb_syscall_complete_cb complete,
 ret = read(gf->hostfd, ptr, len);
 } while (ret == -1 && errno == EINTR);
 if (ret == -1) {
-complete(cs, -1, errno);
 unlock_user(ptr, buf, 0);
+complete(cs, -1, errno);
 } else {
-complete(cs, ret, 0);
 unlock_user(ptr, buf, ret);
+complete(cs, ret, 0);
 }
 }
 
@@ -341,8 +341,8 @@ static void host_write(CPUState *cs, 
gdb_syscall_complete_cb complete,
 return;
 }
 ret = write(gf->hostfd, ptr, len);
-complete(cs, ret, ret == -1 ? errno : 0);
 unlock_user(ptr, buf, 0);
+complete(cs, ret, ret == -1 ? errno : 0);
 }
 
 static void host_lseek(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -428,8 +428,8 @@ static void host_stat(CPUState *cs, gdb_syscall_complete_cb 
complete,
 ret = -1;
 }
 }
-complete(cs, ret, err);
 unlock_user(name, fname, 0);
+complete(cs, ret, err);
 }
 
 static void host_remove(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -446,8 +446,8 @@ static void host_remove(CPUState *cs, 
gdb_syscall_complete_cb complete,
 }
 
 ret = remove(p);
-complete(cs, ret, ret ? errno : 0);
 unlock_user(p, fname, 0);
+complete(cs, ret, ret ? errno : 0);
 }
 
 static void host_rename(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -471,9 +471,9 @@ static void host_rename(CPUState *cs, 
gdb_syscall_complete_cb complete,
 }
 
 ret = rename(ostr, nstr);
-complete(cs, ret, ret ? errno : 0);
 unlock_user(ostr, oname, 0);
 unlock_user(nstr, nname, 0);
+complete(cs, ret, ret ? errno : 0);
 }
 
 static void host_system(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -490,8 +490,8 @@ static void host_system(CPUState *cs, 
gdb_syscall_complete_cb complete,
 }
 
 ret = system(p);
-complete(cs, ret, ret == -1 ? errno : 0);
 unlock_user(p, cmd, 0);
+complete(cs, ret, ret == -1 ? errno : 0);
 }
 
 static void host_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -556,8 +556,8 @@ static void staticfile_read(CPUState *cs, 
gdb_syscall_complete_cb complete,
 }
 memcpy(ptr, gf->staticfile.data + gf->staticfile.off, len);
 gf->staticfile.off += len;
-complete(cs, len, 0);
 unlock_user(ptr, buf, len);
+complete(cs, len, 0);
 }
 
 static void staticfile_lseek(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -610,8 +610,8 @@ static void console_read(CPUState *cs, 
gdb_syscall_complete_cb complete,
 return;
 }
 ret = qemu_semihosting_console_read(cs, ptr, len);
-complete(cs, ret, 0);
 unlock_user(ptr, buf, ret);
+complete(cs, ret, 0);
 }
 
 static void console_write(CPUState *cs, gdb_syscall_complete_cb complete,
@@ -626,8 +626,8 @@ static void console_write(CPUState *cs, 
gdb_syscall_complete_cb complete,
 return;
 }
 ret = qemu_semihosting_console_write(ptr, len);
-complete(cs, ret ? ret : -1, ret ? 0 : EIO);
 unlock_user(ptr, buf, 0);
+complete(cs, ret ? ret : -1, ret ? 0 : EIO);
 }
 
 static void console_fstat(CPUState *cs, gdb_syscall_complete_cb complete,
-- 
2.37.2




Re: [PATCH v10 00/17] qapi: net: add unix socket type support to netdev backend

2022-10-11 Thread Jason Wang
On Wed, Oct 12, 2022 at 4:05 AM Laurent Vivier  wrote:
>
> On 10/9/22 07:52, Jason Wang wrote:
> > On Thu, Oct 6, 2022 at 7:21 PM Michael S. Tsirkin  wrote:
> >>
> >> On Wed, Oct 05, 2022 at 06:20:34PM +0200, Laurent Vivier wrote:
> >>> "-netdev socket" only supports inet sockets.
> >>>
> >>> It's not a complex task to add support for unix sockets, but
> >>> the socket netdev parameters are not defined to manage well unix
> >>> socket parameters.
> >>
> >> Looks good.
> >>
> >> Acked-by: Michael S. Tsirkin 
> >>
> >> Belongs in Jason's tree.
> >
> > I've queued this series.
>
> I've found some minor problems. Could you queue incoming v11 instead?

Yes. Done.

Thanks

>
> Thanks,
> Laurent
>
> >
> > Thanks
> >
> >
> >>
> >>> As discussed in:
> >>>
> >>>"socket.c added support for unix domain socket datagram transport"
> >>>
> >>> https://lore.kernel.org/qemu-devel/1c0e1bc5-904f-46b0-8044-68e43e67b...@gmail.com/
> >>>
> >>> This series adds support of unix socket type using SocketAddress QAPI 
> >>> structure.
> >>>
> >>> Two new netdev backends, "stream" and "dgram" are added, that are barely 
> >>> a copy of "socket"
> >>> backend but they use the SocketAddress QAPI to provide socket parameters.
> >>> And then they also implement unix sockets (TCP and UDP).
> >>>
> >>> Some examples of CLI syntax:
> >>>
> >>>for TCP:
> >>>
> >>>-netdev 
> >>> stream,id=socket0,addr.type=inet,addr.host=localhost,addr.port=1234
> >>>-netdev 
> >>> stream,id=socket0,server=off,addr.type=inet,addr.host=localhost,addr.port=1234
> >>>
> >>>-netdev dgram,id=socket0,\
> >>>local.type=inet,local.host=localhost,local.port=1234,\
> >>>remote.type=inet,remote.host=localhost,remote.port=1235
> >>>
> >>>for UNIX:
> >>>
> >>>-netdev stream,id=socket0,addr.type=unix,addr.path=/tmp/qemu0
> >>>-netdev 
> >>> stream,id=socket0,server=off,addr.type=unix,addr.path=/tmp/qemu0
> >>>
> >>>-netdev dgram,id=socket0,\
> >>>local.type=unix,local.path=/tmp/qemu0,\
> >>>remote.type=unix,remote.path=/tmp/qemu1
> >>>
> >>>for FD:
> >>>
> >>>-netdev stream,id=socket0,addr.type=fd,addr.str=4
> >>>-netdev stream,id=socket0,server=off,addr.type=fd,addr.str=5
> >>>
> >>>-netdev dgram,id=socket0,local.type=fd,addr.str=4
> >>>
> >>> v10:
> >>>- add Red Hat copyright
> >>>- initialize dgram_dst to NULL in SOCKET_ADDRESS_TYPE_FD
> >>>- remove redundente _stream / _dgram in functions name
> >>>- move net_dgram_init() into net_init_dgram()
> >>>- address Thomas' comments on qtest
> >>>- add a function qemu_set_info_str() to set info string
> >>>- tested stream netdev with fd type using qrap/passt and
> >>>  "-netdev stream,addr.type=fd,server=off,addr.str=5,id=netdev0"
> >>>
> >>> v9:
> >>>- add events to report stream connection/disconnection
> >>>- remove from net/dgram.c send_fn, listen_fd, net_dgram_accept()
> >>>  net_dgram_connect() and net_dgram_send() that are only
> >>>  needed by net/stream.c
> >>>- remove from net/stream.c send_fn
> >>>- add Red Hat copyright
> >>>- add original net/socket.c Stefano's patch (EINVAL)
> >>>
> >>> v8:
> >>>- test ipv4 and ipv6 parameters (stream inet)
> >>>- test abstract parameter (stream unix)
> >>>- add SocketAddressInet supported parameters in qemu-options.hx
> >>>  (only stream, supported by the move to QIO)
> >>>- with qio_channel_writev() replace (ret == -1 && errno == EAGAIN)
> >>>  by (ret == QIO_CHANNEL_ERR_BLOCK)
> >>>
> >>> v7:
> >>>- add qtests
> >>>- update parameters table in net.json
> >>>- update socket_uri() and socket_parse()
> >>>
> >>> v6:
> >>>- s/netdev option/-netdev option/ PATCH 4
> >>>- s/ / /
> >>>- update @NetdevStreamOptions and @NetdevDgramOptions comments
> >>>- update PATCH 4 description message
> >>>- add missing return in error case for unix stream socket
> >>>- split socket_uri() patch: move and rename, then change content
> >>>
> >>> v5:
> >>>- remove RFC prefix
> >>>- put the change of net_client_parse() into its own patch (exit() in 
> >>> the
> >>>  function)
> >>>- update comments regarding netdev_is_modern() and 
> >>> netdev_parse_modern()
> >>>- update error case in net_stream_server_init()
> >>>- update qemu-options.hx with unix type
> >>>- fix HMP "info network" with unix protocol/server side.
> >>>
> >>> v4:
> >>>- net_client_parse() fails with exit() rather than with return.
> >>>- keep "{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' }" on its
> >>>  own line in qapi/net.json
> >>>- add a comment in qapi/net.json about parameters usage
> >>>- move netdev_is_modern() check to qemu_init()
> >>>- in netdev_is_modern(), check for JSON and use qemu_opts_do_parse()
> >>>  to parse parameters and detect type value.
> >>>- add a blank line after copyright comment
> >>>
> >>> v3:
> >>>- remove support of 

[PATCH] target/riscv: pmp: Fixup TLB size calculation

2022-10-11 Thread Alistair Francis
From: Alistair Francis 

Since commit 4047368938f6 "accel/tcg: Introduce tlb_set_page_full" we
have been seeing this assert

../accel/tcg/cputlb.c:1294: tlb_set_page_with_attrs: Assertion 
`is_power_of_2(size)' failed.

When running Tock on the OpenTitan machine.

The issue is that pmp_get_tlb_size() would return a TLB size that wasn't
a power of 2. The size was also smaller than TARGET_PAGE_SIZE.

This patch ensures that any TLB size less than TARGET_PAGE_SIZE is
rounded down to 1 to ensure it's a valid size.

Signed-off-by: Alistair Francis 
---
This is based on advice from Richard:
https://patchwork.kernel.org/project/qemu-devel/patch/20221004141051.110653-9-richard.hender...@linaro.org/#25043166

 target/riscv/pmp.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/target/riscv/pmp.c b/target/riscv/pmp.c
index ea2b67d947..2b43e399b8 100644
--- a/target/riscv/pmp.c
+++ b/target/riscv/pmp.c
@@ -628,6 +628,18 @@ bool pmp_is_range_in_tlb(CPURISCVState *env, hwaddr tlb_sa,
 }
 
 if (*tlb_size != 0) {
+/*
+ * At this point we have a tlb_size that is the smallest possible size
+ * That fits within a TARGET_PAGE_SIZE and the PMP region.
+ *
+ * If the size is less then TARGET_PAGE_SIZE we drop the size to 1.
+ * This means the result isn't cached in the TLB and is only used for
+ * a single translation.
+ */
+if (*tlb_size < TARGET_PAGE_SIZE) {
+*tlb_size = 1;
+}
+
 return true;
 }
 
-- 
2.37.3




[PATCH v5 5/6] hw/arm/virt: Improve high memory region address assignment

2022-10-11 Thread Gavin Shan
There are three high memory regions, which are VIRT_HIGH_REDIST2,
VIRT_HIGH_PCIE_ECAM and VIRT_HIGH_PCIE_MMIO. Their base addresses
are floating on highest RAM address. However, they can be disabled
in several cases.

(1) One specific high memory region is disabled by developer by
toggling vms->highmem_{redists, ecam, mmio}.

(2) VIRT_HIGH_PCIE_ECAM region is disabled on machine types that are
'virt-2.12' or earlier.

(3) VIRT_HIGH_PCIE_ECAM region is disabled when firmware is loaded
on 32-bits system.

(4) One specific high memory region is disabled when it breaks the
PA space limit.

The current implementation of virt_set_memmap() isn't comprehensive
because the space for one specific high memory region is always
reserved from the PA space for case (1), (2) and (3). In the code,
'base' and 'vms->highest_gpa' are always increased for those three
cases. It's unnecessary since the assigned space of the disabled
high memory region won't be used afterwards.

This improves the address assignment for those three high memory
region by skipping the address assignment for one specific high
memory region if it has been disabled in case (1), (2) and (3).
'vms->high_compact' is false for now, meaning that we don't have
any behavior changes until it becomes configurable through property
'compact-highmem' in next patch.

Signed-off-by: Gavin Shan 
Tested-by: Zhenyu Zhang 
---
 hw/arm/virt.c | 23 +++
 include/hw/arm/virt.h |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index ee98a8a3b6..c05cfb5314 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1717,22 +1717,29 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 region_base = ROUND_UP(base, extended_memmap[i].size);
 region_size = extended_memmap[i].size;
 
-vms->memmap[i].base = region_base;
-vms->memmap[i].size = region_size;
-
 /*
  * Check each device to see if they fit in the PA space,
- * moving highest_gpa as we go.
+ * moving highest_gpa as we go. For compatibility, move
+ * highest_gpa for disabled fitting devices as well, if
+ * the compact layout has been disabled.
  *
  * For each device that doesn't fit, disable it.
  */
 fits = (region_base + region_size) <= BIT_ULL(pa_bits);
-if (fits) {
+if (*region_enabled && fits) {
+vms->memmap[i].base = region_base;
+vms->memmap[i].size = region_size;
 vms->highest_gpa = region_base + region_size - 1;
+base = region_base + region_size;
+} else {
+*region_enabled = false;
+if (!vms->highmem_compact) {
+base = region_base + region_size;
+if (fits) {
+vms->highest_gpa = region_base + region_size - 1;
+}
+}
 }
-
-*region_enabled &= fits;
-base = region_base + region_size;
 }
 }
 
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 6ec479ca2b..709f623741 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -144,6 +144,7 @@ struct VirtMachineState {
 PFlashCFI01 *flash[2];
 bool secure;
 bool highmem;
+bool highmem_compact;
 bool highmem_ecam;
 bool highmem_mmio;
 bool highmem_redists;
-- 
2.23.0




[PATCH v5 2/6] hw/arm/virt: Rename variable size to region_size in virt_set_high_memmap()

2022-10-11 Thread Gavin Shan
This renames variable 'size' to 'region_size' in virt_set_high_memmap().
Its counterpart ('region_base') will be introduced in next patch.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Tested-by: Zhenyu Zhang 
---
 hw/arm/virt.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 7572c44bda..e2ae88cf8b 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1692,15 +1692,16 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState 
*vms, int idx)
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
+hwaddr region_size;
+bool fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
-hwaddr size = extended_memmap[i].size;
-bool fits;
+region_size = extended_memmap[i].size;
 
-base = ROUND_UP(base, size);
+base = ROUND_UP(base, region_size);
 vms->memmap[i].base = base;
-vms->memmap[i].size = size;
+vms->memmap[i].size = region_size;
 
 /*
  * Check each device to see if they fit in the PA space,
@@ -1708,9 +1709,9 @@ static void virt_set_high_memmap(VirtMachineState *vms,
  *
  * For each device that doesn't fit, disable it.
  */
-fits = (base + size) <= BIT_ULL(pa_bits);
+fits = (base + region_size) <= BIT_ULL(pa_bits);
 if (fits) {
-vms->highest_gpa = base + size - 1;
+vms->highest_gpa = base + region_size - 1;
 }
 
 switch (i) {
@@ -1725,7 +1726,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 break;
 }
 
-base += size;
+base += region_size;
 }
 }
 
-- 
2.23.0




[PATCH v5 4/6] hw/arm/virt: Introduce virt_get_high_memmap_enabled() helper

2022-10-11 Thread Gavin Shan
This introduces virt_get_high_memmap_enabled() helper, which returns
the pointer to vms->highmem_{redists, ecam, mmio}. The pointer will
be used in the subsequent patches.

No functional change intended.

Signed-off-by: Gavin Shan 
Tested-by: Zhenyu Zhang 
---
 hw/arm/virt.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0bf3cb7057..ee98a8a3b6 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1689,14 +1689,31 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState 
*vms, int idx)
 return arm_cpu_mp_affinity(idx, clustersz);
 }
 
+static inline bool *virt_get_high_memmap_enabled(VirtMachineState *vms,
+ int index)
+{
+bool *enabled_array[] = {
+>highmem_redists,
+>highmem_ecam,
+>highmem_mmio,
+};
+
+assert(ARRAY_SIZE(extended_memmap) - VIRT_LOWMEMMAP_LAST ==
+   ARRAY_SIZE(enabled_array));
+assert(index - VIRT_LOWMEMMAP_LAST < ARRAY_SIZE(enabled_array));
+
+return enabled_array[index - VIRT_LOWMEMMAP_LAST];
+}
+
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
 hwaddr region_base, region_size;
-bool fits;
+bool *region_enabled, fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+region_enabled = virt_get_high_memmap_enabled(vms, i);
 region_base = ROUND_UP(base, extended_memmap[i].size);
 region_size = extended_memmap[i].size;
 
@@ -1714,18 +1731,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 vms->highest_gpa = region_base + region_size - 1;
 }
 
-switch (i) {
-case VIRT_HIGH_GIC_REDIST2:
-vms->highmem_redists &= fits;
-break;
-case VIRT_HIGH_PCIE_ECAM:
-vms->highmem_ecam &= fits;
-break;
-case VIRT_HIGH_PCIE_MMIO:
-vms->highmem_mmio &= fits;
-break;
-}
-
+*region_enabled &= fits;
 base = region_base + region_size;
 }
 }
-- 
2.23.0




[PATCH v5 6/6] hw/arm/virt: Add 'compact-highmem' property

2022-10-11 Thread Gavin Shan
After the improvement to high memory region address assignment is
applied, the memory layout can be changed, introducing possible
migration breakage. For example, VIRT_HIGH_PCIE_MMIO memory region
is disabled or enabled when the optimization is applied or not, with
the following configuration.

  pa_bits  = 40;
  vms->highmem_redists = false;
  vms->highmem_ecam= false;
  vms->highmem_mmio= true;

  # qemu-system-aarch64 -accel kvm -cpu host\
-machine virt-7.2,compact-highmem={on, off} \
-m 4G,maxmem=511G -monitor stdio

  Regioncompact-highmem=off compact-highmem=on
  
  RAM   [1GB 512GB][1GB 512GB]
  HIGH_GIC_REDISTS  [512GB   512GB+64MB]   [disabled]
  HIGH_PCIE_ECAM[512GB+256MB 512GB+512MB]  [disabled]
  HIGH_PCIE_MMIO[disabled] [512GB   1TB]

In order to keep backwards compatibility, we need to disable the
optimization on machine types that are virt-7.1 or earlier. This
means the optimization is enabled by default from virt-7.2. Besides,
'compact-highmem' property is added so that the optimization can be
explicitly enabled or disabled on all machine types by users.

Signed-off-by: Gavin Shan 
Tested-by: Zhenyu Zhang 
---
 docs/system/arm/virt.rst |  4 
 hw/arm/virt.c| 47 
 include/hw/arm/virt.h|  1 +
 3 files changed, 52 insertions(+)

diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index 20442ea2c1..75bf5a4994 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -94,6 +94,10 @@ highmem
   address space above 32 bits. The default is ``on`` for machine types
   later than ``virt-2.12``.
 
+compact-highmem
+  Set ``on``/``off`` to enable/disable compact space for high memory regions.
+  The default is ``on`` for machine types later than ``virt-7.2``
+
 gic-version
   Specify the version of the Generic Interrupt Controller (GIC) to provide.
   Valid values are:
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index c05cfb5314..8f1dba0ece 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -174,6 +174,27 @@ static const MemMapEntry base_memmap[] = {
  * Note the extended_memmap is sized so that it eventually also includes the
  * base_memmap entries (VIRT_HIGH_GIC_REDIST2 index is greater than the last
  * index of base_memmap).
+ *
+ * The addresses assigned to these regions are affected by 'compact-highmem'
+ * property, which is to enable or disable the compact space in the Highmem
+ * IO regions. For example, VIRT_HIGH_PCIE_MMIO can be disabled or enabled
+ * depending on the property in the following scenario.
+ *
+ * pa_bits  = 40;
+ * vms->highmem_redists = false;
+ * vms->highmem_ecam= false;
+ * vms->highmem_mmio= true;
+ *
+ * # qemu-system-aarch64 -accel kvm -cpu host\
+ *   -machine virt-7.2,compact-highmem={on, off} \
+ *   -m 4G,maxmem=511G -monitor stdio
+ *
+ * Regioncompact-highmem=offcompact-highmem=on
+ * 
+ * RAM   [1GB 512GB][1GB 512GB]
+ * HIGH_GIC_REDISTS  [512GB   512GB+64MB]   [disabled]
+ * HIGH_PCIE_ECAM[512GB+256GB 512GB+512MB]  [disabled]
+ * HIGH_PCIE_MMIO[disabled] [512GB   1TB]
  */
 static MemMapEntry extended_memmap[] = {
 /* Additional 64 MB redist region (can contain up to 512 redistributors) */
@@ -2353,6 +2374,20 @@ static void virt_set_highmem(Object *obj, bool value, 
Error **errp)
 vms->highmem = value;
 }
 
+static bool virt_get_compact_highmem(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->highmem_compact;
+}
+
+static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->highmem_compact = value;
+}
+
 static bool virt_get_its(Object *obj, Error **errp)
 {
 VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2971,6 +3006,13 @@ static void virt_machine_class_init(ObjectClass *oc, 
void *data)
   "Set on/off to enable/disable using "
   "physical address space above 32 
bits");
 
+object_class_property_add_bool(oc, "compact-highmem",
+   virt_get_compact_highmem,
+   virt_set_compact_highmem);
+object_class_property_set_description(oc, "compact-highmem",
+  "Set on/off to enable/disable 
compact "
+  "space for high memory regions");
+
 object_class_property_add_str(oc, "gic-version", virt_get_gic_version,
   virt_set_gic_version);
 object_class_property_set_description(oc, "gic-version",
@@ -3055,6 +3097,7 @@ static void 

[PATCH v5 3/6] hw/arm/virt: Introduce variable region_base in virt_set_high_memmap()

2022-10-11 Thread Gavin Shan
This introduces variable 'region_base' for the base address of the
specific high memory region. It's the preparatory work to optimize
high memory region address assignment.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Tested-by: Zhenyu Zhang 
---
 hw/arm/virt.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index e2ae88cf8b..0bf3cb7057 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1692,15 +1692,15 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState 
*vms, int idx)
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
-hwaddr region_size;
+hwaddr region_base, region_size;
 bool fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+region_base = ROUND_UP(base, extended_memmap[i].size);
 region_size = extended_memmap[i].size;
 
-base = ROUND_UP(base, region_size);
-vms->memmap[i].base = base;
+vms->memmap[i].base = region_base;
 vms->memmap[i].size = region_size;
 
 /*
@@ -1709,9 +1709,9 @@ static void virt_set_high_memmap(VirtMachineState *vms,
  *
  * For each device that doesn't fit, disable it.
  */
-fits = (base + region_size) <= BIT_ULL(pa_bits);
+fits = (region_base + region_size) <= BIT_ULL(pa_bits);
 if (fits) {
-vms->highest_gpa = base + region_size - 1;
+vms->highest_gpa = region_base + region_size - 1;
 }
 
 switch (i) {
@@ -1726,7 +1726,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 break;
 }
 
-base += region_size;
+base = region_base + region_size;
 }
 }
 
-- 
2.23.0




[PATCH v5 1/6] hw/arm/virt: Introduce virt_set_high_memmap() helper

2022-10-11 Thread Gavin Shan
This introduces virt_set_high_memmap() helper. The logic of high
memory region address assignment is moved to the helper. The intention
is to make the subsequent optimization for high memory region address
assignment easier.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Tested-by: Zhenyu Zhang 
---
 hw/arm/virt.c | 74 ---
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index cda9defe8f..7572c44bda 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1689,6 +1689,46 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState 
*vms, int idx)
 return arm_cpu_mp_affinity(idx, clustersz);
 }
 
+static void virt_set_high_memmap(VirtMachineState *vms,
+ hwaddr base, int pa_bits)
+{
+int i;
+
+for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+hwaddr size = extended_memmap[i].size;
+bool fits;
+
+base = ROUND_UP(base, size);
+vms->memmap[i].base = base;
+vms->memmap[i].size = size;
+
+/*
+ * Check each device to see if they fit in the PA space,
+ * moving highest_gpa as we go.
+ *
+ * For each device that doesn't fit, disable it.
+ */
+fits = (base + size) <= BIT_ULL(pa_bits);
+if (fits) {
+vms->highest_gpa = base + size - 1;
+}
+
+switch (i) {
+case VIRT_HIGH_GIC_REDIST2:
+vms->highmem_redists &= fits;
+break;
+case VIRT_HIGH_PCIE_ECAM:
+vms->highmem_ecam &= fits;
+break;
+case VIRT_HIGH_PCIE_MMIO:
+vms->highmem_mmio &= fits;
+break;
+}
+
+base += size;
+}
+}
+
 static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
 {
 MachineState *ms = MACHINE(vms);
@@ -1744,39 +1784,7 @@ static void virt_set_memmap(VirtMachineState *vms, int 
pa_bits)
 /* We know for sure that at least the memory fits in the PA space */
 vms->highest_gpa = memtop - 1;
 
-for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
-hwaddr size = extended_memmap[i].size;
-bool fits;
-
-base = ROUND_UP(base, size);
-vms->memmap[i].base = base;
-vms->memmap[i].size = size;
-
-/*
- * Check each device to see if they fit in the PA space,
- * moving highest_gpa as we go.
- *
- * For each device that doesn't fit, disable it.
- */
-fits = (base + size) <= BIT_ULL(pa_bits);
-if (fits) {
-vms->highest_gpa = base + size - 1;
-}
-
-switch (i) {
-case VIRT_HIGH_GIC_REDIST2:
-vms->highmem_redists &= fits;
-break;
-case VIRT_HIGH_PCIE_ECAM:
-vms->highmem_ecam &= fits;
-break;
-case VIRT_HIGH_PCIE_MMIO:
-vms->highmem_mmio &= fits;
-break;
-}
-
-base += size;
-}
+virt_set_high_memmap(vms, base, pa_bits);
 
 if (device_memory_size > 0) {
 ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
-- 
2.23.0




[PATCH v5 0/6] hw/arm/virt: Improve address assignment for high memory regions

2022-10-11 Thread Gavin Shan
There are three high memory regions, which are VIRT_HIGH_REDIST2,
VIRT_HIGH_PCIE_ECAM and VIRT_HIGH_PCIE_MMIO. Their base addresses
are floating on highest RAM address. However, they can be disabled
in several cases.

(1) One specific high memory region is disabled by developer by
toggling vms->highmem_{redists, ecam, mmio}.

(2) VIRT_HIGH_PCIE_ECAM region is disabled on machine types that are
'virt-2.12' or earlier.

(3) VIRT_HIGH_PCIE_ECAM region is disabled when firmware is loaded
on 32-bits system.

(4) One specific high memory region is disabled when it breaks the
PA space limit.

The current implementation of virt_set_memmap() isn't comprehensive
because the space for one specific high memory region is always
reserved from the PA space for case (1), (2) and (3). In the code,
'base' and 'vms->highest_gpa' are always increased for those three
cases. It's unnecessary since the assigned space of the disabled
high memory region won't be used afterwards.

The series intends to improve the address assignment for these
high memory regions.

PATCH[1-4] preparatory work for the improvment
PATCH[5]   improve high memory region address assignment
PATCH[6]   adds 'compact-highmem' to enable or disable the optimization

v4: https://lists.nongnu.org/archive/html/qemu-arm/2022-10/msg00067.html
v3: https://lists.nongnu.org/archive/html/qemu-arm/2022-09/msg00258.html
v2: https://lore.kernel.org/all/20220815062958.100366-1-gs...@redhat.com/T/
v1: https://lists.nongnu.org/archive/html/qemu-arm/2022-08/msg00013.html

Changelog
==
v5:
  * Pick review-by and tested-by   (Connie/Zhenyu)
  * Add extra check in PATCH[v5 4/6]   (Connie)
  * Improve comments about compatibility for disabled regions
in PATCH[v5 5/6]   (Connie)
v4:
  * Add virt_get_high_memmap_enabled() helper  (Eric)
  * Move 'vms->highmem_compact' and related logic from
PATCH[v4 6/6] to PATCH[v4 5/6] to avoid git-bisect
breakage   (Eric)
  * Document the legacy and optimized high memory region
layout in commit log and source code   (Eric)
v3:
  * Reorder the patches(Gavin)
  * Add 'highmem-compact' property for backwards compatibility (Eric)
v2:
  * Split the patches for easier review(Gavin)
  * Improved changelog (Marc)
  * Use 'bool fits' in virt_set_high_memmap()  (Eric)

Gavin Shan (6):
  hw/arm/virt: Introduce virt_set_high_memmap() helper
  hw/arm/virt: Rename variable size to region_size in
virt_set_high_memmap()
  hw/arm/virt: Introduce variable region_base in virt_set_high_memmap()
  hw/arm/virt: Introduce virt_get_high_memmap_enabled() helper
  hw/arm/virt: Improve high memory region address assignment
  hw/arm/virt: Add 'compact-highmem' property

 docs/system/arm/virt.rst |   4 ++
 hw/arm/virt.c| 135 +--
 include/hw/arm/virt.h|   2 +
 3 files changed, 108 insertions(+), 33 deletions(-)

-- 
2.23.0




Re: [PATCH v4 4/6] hw/arm/virt: Introduce virt_get_high_memmap_enabled() helper

2022-10-11 Thread Gavin Shan

On 10/12/22 12:45 AM, Eric Auger wrote:

On 10/5/22 00:47, Gavin Shan wrote:

On 10/4/22 6:41 PM, Cornelia Huck wrote:

On Tue, Oct 04 2022, Gavin Shan  wrote:


This introduces virt_get_high_memmap_enabled() helper, which returns
the pointer to vms->highmem_{redists, ecam, mmio}. The pointer will
be used in the subsequent patches.

No functional change intended.

Signed-off-by: Gavin Shan 
---
   hw/arm/virt.c | 30 +-
   1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index b0b679d1f4..59de7b78b5 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1689,14 +1689,29 @@ static uint64_t
virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
   return arm_cpu_mp_affinity(idx, clustersz);
   }
   +static inline bool *virt_get_high_memmap_enabled(VirtMachineState
*vms,
+ int index)
+{
+    bool *enabled_array[] = {
+    >highmem_redists,
+    >highmem_ecam,
+    >highmem_mmio,
+    };
+
+    assert(index - VIRT_LOWMEMMAP_LAST < ARRAY_SIZE(enabled_array));


I wonder whether we want an assert(ARRAY_SIZE(extended_memmap) ==
ARRAY_SIZE(enabled_array))? IIUC, we never want those two to get out of
sync?



Yeah, It makes sense to ensure both arrays synchronized. I will add
the extra check in next respin.


With Connie's suggestion this looks good to me.



What we need is actually like below because the array (extended_memmap)
starts from VIRT_LOWMEMMAP_LAST instead of zero. I'm adding the extra
check into v5, which will be posted shortly.

   assert(ARRAY_SIZE(extended_memmap) - VIRT_LOWMEMMAP_LAST ==
  ARRAY_SIZE(enabled_array));




+
+    return enabled_array[index - VIRT_LOWMEMMAP_LAST];
+}
+


Thanks,
Gavin




Re: [PATCH v3 0/2] Enhance maximum priority support of PLIC

2022-10-11 Thread Alistair Francis
On Mon, Oct 3, 2022 at 2:18 PM Jim Shu  wrote:
>
> This patchset fixes hard-coded maximum priority of interrupt priority
> register and also changes this register to WARL field to align the PLIC
> spec.
>
> Changelog:
>
> v3:
>   * fix opposite of power-of-2 max priority checking expression.
>
> v2:
>   * change interrupt priority register to WARL field.
>
> Jim Shu (2):
>   hw/intc: sifive_plic: fix hard-coded max priority level
>   hw/intc: sifive_plic: change interrupt priority register to WARL field

Thanks!

Applied to riscv-to-apply.next

Alistair

>
>  hw/intc/sifive_plic.c | 25 ++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
>
> --
> 2.17.1
>
>



Re: [PATCH v5 0/2] hw/ssi/ibex_spi: bug fixes

2022-10-11 Thread Alistair Francis
On Fri, Sep 30, 2022 at 1:34 PM Wilfred Mallawa
 wrote:
>
> From: Wilfred Mallawa 
>
> The remaining patches in this series address:
> - Coverity issues for `ibex_spi`
> - Adds rw1c functionality
>
> Changes since V4:
> - Fixup compiler warning for unused variable `data` in [1/2]
>
> Wilfred Mallawa (2):
>   hw/ssi: ibex_spi: fixup coverity issue
>   hw/ssi: ibex_spi: fixup/add rw1c functionality

Thanks!

Applied to riscv-to-apply.next

Alistair

>
>  hw/ssi/ibex_spi_host.c | 166 -
>  include/hw/ssi/ibex_spi_host.h |   4 +-
>  2 files changed, 102 insertions(+), 68 deletions(-)
>
> --
> 2.37.3
>
>



[PATCH RFC 2/2] qemu-thread: Fail hard for suspicious mutex unlocks

2022-10-11 Thread Peter Xu
Add a field for QemuMutex to remember the locked status, then assert
properly when CONFIG_DEBUG_MUTEX enabled on illegal unlocks.

The pthread library is by default quite loose on this, allowing the
unlock to quietly succeed.  But that could make the follow-up behavior very
unpredictable, so if there's a bug it'll be harder to track than failing
early at the illegal unlock.

Signed-off-by: Peter Xu 
---
 include/qemu/thread-posix.h |  1 +
 util/qemu-thread-common.h   | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index 5f2f3d1386..e13bd5492c 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -9,6 +9,7 @@ struct QemuMutex {
 #ifdef CONFIG_DEBUG_MUTEX
 const char *file;
 int line;
+bool locked;
 #endif
 bool initialized;
 };
diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h
index 2af6b12085..ed74bdb0d1 100644
--- a/util/qemu-thread-common.h
+++ b/util/qemu-thread-common.h
@@ -21,6 +21,7 @@ static inline void qemu_mutex_post_init(QemuMutex *mutex)
 #ifdef CONFIG_DEBUG_MUTEX
 mutex->file = NULL;
 mutex->line = 0;
+mutex->locked = false;
 #endif
 mutex->initialized = true;
 }
@@ -37,6 +38,7 @@ static inline void qemu_mutex_post_lock(QemuMutex *mutex,
 #ifdef CONFIG_DEBUG_MUTEX
 mutex->file = file;
 mutex->line = line;
+mutex->locked = true;
 #endif
 trace_qemu_mutex_locked(mutex, file, line);
 }
@@ -47,6 +49,14 @@ static inline void qemu_mutex_pre_unlock(QemuMutex *mutex,
 #ifdef CONFIG_DEBUG_MUTEX
 mutex->file = NULL;
 mutex->line = 0;
+/*
+ * pthread_mutex_unlock() by default silently ignore unlocking a mutex
+ * even if it's not locked.  Make it strict with QEMU when DEBUG_MUTEX
+ * is enabled, so that we can capture it at the exact wrong unlock.
+ * It'll be easier to track this than having misterious deadlock later.
+ */
+assert(mutex->locked);
+mutex->locked = false;
 #endif
 trace_qemu_mutex_unlock(mutex, file, line);
 }
-- 
2.37.3




[PATCH RFC 1/2] qemu-thread: Enable the new timedwait to use DEBUG_MUTEX too

2022-10-11 Thread Peter Xu
The new _timedwait() version of qemu cond/mutex doesn't trigger the
DEBUG_MUTEX paths; enable it too.

Cc: Yury Kotov 
Signed-off-by: Peter Xu 
---
 util/qemu-thread-posix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index ac1d56e673..5840f6e6f5 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -229,9 +229,9 @@ qemu_cond_timedwait_ts(QemuCond *cond, QemuMutex *mutex, 
struct timespec *ts,
 int err;
 
 assert(cond->initialized);
-trace_qemu_mutex_unlock(mutex, file, line);
+qemu_mutex_pre_unlock(mutex, file, line);
 err = pthread_cond_timedwait(>cond, >lock, ts);
-trace_qemu_mutex_locked(mutex, file, line);
+qemu_mutex_post_lock(mutex, file, line);
 if (err && err != ETIMEDOUT) {
 error_exit(err, __func__);
 }
-- 
2.37.3




[PATCH RFC 0/2] qemu-thread: Strict unlock check

2022-10-11 Thread Peter Xu
NOTE: mark patchset RFC because "make check" will easily fail; but I didn't
yet dig into why as I'm not familiar with the code paths that triggers, it
can be bugs hidden or something I missed.  So RFC to just have some thoughts.

The first patch converts the new timedwait to use DEBUG_MUTEX paths too.
IMO this one is pretty much wanted.  The second patch add a strict version
of pthread_mutex_unlock() check by making sure the lock is locked first.

This comes from a debugging of migration code where we have had functions
like:

  /* func() must be with lockA held */
  func() {
...
/* Temporarily release the lock */
qemu_mutex_unlock(lockA);
...
/* Retake the lock */
qemu_mutex_lock(lockA);
...
  }

I found that the pthread lib is very "friendly" to an unpaired unlock and
just silently ignores it, returning 0 as success.  It means when func() is
called without lockA held the unlock() above will be ignored, but the
follow-up lock() will be real.  Later it will easily cause a deadlock
in func() above because the calls just don't pair anymore right after
the one ignored unlock().

Since it's harder to know where we should take the lock, it's still easy
to fail the unlock() upon a lock not being held at all, so it at least
fails earlier than the deadlock later on lockA in some other thread.

Patch 2 can also be used to further replace [sg]et_iothread_locked(), I
think, then we need to move the "locked" to be outside DEBUG_MUTEX but only
keep the assert() inside.  But we can discuss that later.

Comments welcomed, thanks.

Peter Xu (2):
  qemu-thread: Enable the new timedwait to use DEBUG_MUTEX too
  qemu-thread: Fail hard for suspicious mutex unlocks

 include/qemu/thread-posix.h |  1 +
 util/qemu-thread-common.h   | 10 ++
 util/qemu-thread-posix.c|  4 ++--
 3 files changed, 13 insertions(+), 2 deletions(-)

-- 
2.37.3




Re: [PATCH V5 0/3] hw/riscv: virt: Enable booting S-mode firmware from pflash

2022-10-11 Thread Alistair Francis
On Tue, Oct 4, 2022 at 7:25 PM Sunil V L  wrote:
>
> This series adds the support to boot S-mode FW like EDK2 from the flash. The
> S-mode firmware should be kept in pflash unit 1.
>
> When -kernel (and -initrd) option is also provided along with the flash,
> the kernel (and initrd) will be loaded into fw_cfg table and opensbi will
> branch to the flash address which will be the entry point of the S-mode
> firmware. The S-mode FW then loads and launches the kernel.
>
> When only -pflash option is provided in the command line, the kernel
> will be located and loaded in the usual way by the S-mode firmware.
>
> These patches are available in below branch.
> https://github.com/vlsunil/qemu/tree/pflash_v2
>
> The first two patches in this series are refactor patches.
>
> These changes are tested with a WIP EDK2 port for virt machine. Below
> are the instructions to build and test this feature.
>
> 1) Get EDK2 sources from below branches.
> https://github.com/vlsunil/edk2/tree/virt_refactor_smode_v1
> https://github.com/vlsunil/edk2-platforms/tree/virt_refactor_smode_v1
>
> 2) Build EDK2 for RISC-V
> export WORKSPACE=`pwd`
> export GCC5_RISCV64_PREFIX=riscv64-linux-gnu-
> export PACKAGES_PATH=$WORKSPACE/edk2:$WORKSPACE/edk2-platforms
> export EDK_TOOLS_PATH=$WORKSPACE/edk2/BaseTools
> source edk2/edksetup.sh
> make -C edk2/BaseTools clean
> make -C edk2/BaseTools
> make -C edk2/BaseTools/Source/C
> source edk2/edksetup.sh BaseTools
> build -a RISCV64  -p Platform/Qemu/RiscVVirt/RiscVVirt.dsc -t GCC5
>
> 3)Make the EDK2 image size to match with what qemu flash expects
> truncate -s 32M Build/RiscVVirt/DEBUG_GCC5/FV/RISCV_VIRT.fd
>
> 4) Run
> a) Boot to EFI shell (no -kernel / -initrd option)
> qemu-system-riscv64  -nographic   -drive 
> file=Build/RiscVVirt/DEBUG_GCC5/FV/RISCV_VIRT.fd,if=pflash,format=raw,unit=1  
> -machine virt -M 2G
>
> b) With -kernel, -initrd and -pflash
> qemu-system-riscv64  -nographic   -drive 
> file=Build/RiscVVirt/DEBUG_GCC5/FV/RISCV_VIRT.fd,if=pflash,format=raw,unit=1  
> -machine virt -M 2G -kernel arch/riscv/boot/Image.gz -initrd rootfs.cpio
>
>
> Changes since V4:
> 1) Rebased on riscv-to-apply.next branch
> 2) Added RB tags
> 3) Gerd's feedback on removing the truncate requirement will be 
> addressed as separate
>patch in future.
>
> Changes since V3:
> 1) White space and comment edits
> 2) Added RB tag
>
> Changes since V2:
> 1) Moved the doc comment to .h file
>
> Changes since V1:
> 1) Modified code to support the use case when both -kernel and 
> -pflash are configured.
> 2) Refactor patches added to help (1) above.
> 3) Cover letter added with test instructions.
>
> Sunil V L (3):
>   hw/arm,loongarch: Move load_image_to_fw_cfg() to common location
>   hw/riscv: virt: Move create_fw_cfg() prior to loading kernel
>   hw/riscv: virt: Enable booting S-mode firmware from pflash

Thanks!

Applied to riscv-to-apply.next

Alistair

>
>  hw/arm/boot.c | 49 ---
>  hw/loongarch/virt.c   | 33 --
>  hw/nvram/fw_cfg.c | 32 +
>  hw/riscv/boot.c   | 29 +++
>  hw/riscv/virt.c   | 32 ++---
>  include/hw/nvram/fw_cfg.h | 21 +
>  include/hw/riscv/boot.h   |  1 +
>  7 files changed, 107 insertions(+), 90 deletions(-)
>
> --
> 2.25.1
>
>



Re: [PATCH] gitmodules: recurse by default

2022-10-11 Thread Michael S. Tsirkin
On Fri, Oct 07, 2022 at 12:09:40PM +0100, Daniel P. Berrangé wrote:
> On Fri, Oct 07, 2022 at 11:45:56AM +0100, Daniel P. Berrangé wrote:
> > On Fri, Oct 07, 2022 at 06:11:25AM -0400, Michael S. Tsirkin wrote:
> > > On Fri, Oct 07, 2022 at 09:07:17AM +0100, Daniel P. Berrangé wrote:
> > > > On Thu, Oct 06, 2022 at 08:24:01PM -0400, Michael S. Tsirkin wrote:
> > > > > On Thu, Oct 06, 2022 at 07:54:52PM +0100, Daniel P. Berrangé wrote:
> > > > > > On Thu, Oct 06, 2022 at 07:39:07AM -0400, Michael S. Tsirkin wrote:
> > > > > > > The most common complaint about submodules is that
> > > > > > > they don't follow when one switches branches in the
> > > > > > > main repo. Enable recursing into submodules by default
> > > > > > > to address that.
> > > > > > > 
> > > > > > > Signed-off-by: Michael S. Tsirkin 
> > > > > > > ---
> > > > > > >  .gitmodules | 23 +++
> > > > > > >  1 file changed, 23 insertions(+)
> 
> snip
> 
> > > I just retested and it's not working for me either :(
> > > I was sure it worked but I guess the testing wasn't done properly.
> > > Back to the drawing board sorry.
> > 
> > I think the problem is that this setting doesn't apply in the context
> > of .gitmodules. Various commands take a '--recurse-submodules' parameter,
> > and like many params this can be set in the .git/config file. The
> > problem is .git/config isn't a file we can influence automatically,
> > it is upto the dev to set things for every clone they do :-(
> 
> With the correct setting in my .git/config, I've just discovered
> an unexpected & undesirable consequence of using recurse=true.
> It affects the 'push' command. If your submodule contains a hash
> that is not present in the upstream of the submodule, then when
> you try to push, it will also try to push the submodule change.
> 
> eg, I have a qemu.git branch 'work' and i made a change to
> ui/keycodemapdb. If I try to push to my gitlab fork, whose
> remote I called 'gitlab', then it will also try to push
> ui/keycodemapdb to a fork called 'gitlab'.  Except I don't
> have any such fork existing, so my attempt to push my qemu.git
> changes fails because of the submodule.
> 
> This is going to be annoying to people who are working on branches
> with updates to the git submodules if we were to set recurse=true
> by default, as they'll have to also setup remotes for submodules
> they work on.
> 
> With regards,
> Daniel


Well this seems like a reasonable thing to do, no?

If you push qemu commit referring to hash 0xABC, you want
that 0xABC to be available in the remote, no?
Otherwise how will people fetching your tree check it out?


> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 0/5] Multi-Region and Volatile Memory support for CXL Type-3 Devices

2022-10-11 Thread Michael S. Tsirkin
On Tue, Oct 11, 2022 at 05:19:11PM -0400, Gregory Price wrote:
> Summary of Changes:
> 1) Correction of PCI_CLASS from STORAGE_EXPRESS to MEMORY_CXL on init
> 2) Add CXL_CAPACITY_MULTIPLIER definition to replace magic numbers
> 3) Refactor CDAT DSMAS Initialization for multi-region initialization
> 4) Multi-Region and Volatile Memory support for CXL Type-3 Devices
> 5) Test and Documentation updates
> 
> Developed with input from Jonathan Cameron and Davidlohr Bueso.
> 
> This series brings 2 features to CXL Type-3 Devices:
> 1) Volatile Memory Region support
> 2) Multi-Region support (1 Volatile, 1 Persistent)
> 
> In this series we implement multi-region and volatile region support
> through 6 major changes to CXL devices
> 1) The HostMemoryBackend [hostmem] has been replaced by two
>[hostvmem] and [hostpmem] to store volatile and persistent memory
>respectively
> 2) The single AddressSpace has been replaced by two AddressSpaces
>[hostvmem_as] and [hostpmem_as] to map respective memdevs.
> 3) Each memory region size and total region are stored separately
> 4) The CDAT and DVSEC memory map entries have been updated:
>a) if vmem is present, vmem is mapped at DPA(0)
>b) if pmem is present
>   i)  and vmem is present, pmem is mapped at DPA(vmem->size)
>   ii) else, pmem is mapped at DPA(0)
>c) partitioning of pmem is not supported in this patch set but
>   has been discussed and this design should suffice.
> 5) Read/Write functions have been updated to access AddressSpaces
>according to the mapping described in #4
> 6) cxl-mailbox has been updated to report the respective size of
>volatile and persistent memory regions
> 
> CXL Spec (3.0) Section 8.2.9.8.2.0 - Get Partition Info
>   Active Volatile Memory
> The device shall provide this volatile capacity starting at DPA 0
>   Active Persistent Memory
> The device shall provide this persistent capacity starting at the
> DPA immediately following the volatile capacity
> 
> Partitioning of Persistent Memory regions may be supported on following
> patch sets.
> 
Submitted as an extension to the CDAT emulation because the CDAT DSMAS
entry concerns memory mapping and is required to map memory regions
correctly in bios/efi.

As there will be v8 of CDAT patches I expect there will be a rebase
of this patchset too.

> Gregory Price (5):
>   hw/cxl: set cxl-type3 device type to PCI_CLASS_MEMORY_CXL
>   hw/cxl: Add CXL_CAPACITY_MULTIPLIER definition
>   hw/mem/cxl_type: Generalize CDATDsmas initialization for Memory
> Regions
>   hw/cxl: Multi-Region CXL Type-3 Devices (Volatile and Persistent)
>   cxl: update tests and documentation for new cxl properties
> 
>  docs/system/devices/cxl.rst |  53 -
>  hw/cxl/cxl-mailbox-utils.c  |  23 +-
>  hw/mem/cxl_type3.c  | 449 +++-
>  include/hw/cxl/cxl_device.h |  11 +-
>  tests/qtest/cxl-test.c  |  81 ++-
>  5 files changed, 416 insertions(+), 201 deletions(-)
> 
> -- 
> 2.37.3




[PATCH v2 14/15] migration: Remove old preempt code around state maintenance

2022-10-11 Thread Peter Xu
With the new code to send pages in the rp-return thread, there's little
point in keeping lots of the old code for maintaining the preempt state in
the migration thread, because the new way should always be faster.

Then if we'll always send pages in the rp-return thread anyway, we don't
need those logic to maintain preempt state anymore because now we serialize
things using the mutex directly instead of using those fields.

It's very unfortunate to have those code for a short period, but that's
still one intermediate step that we noticed the next bottleneck on the
migration thread.  Now what we can do best is to drop unnecessary code as
long as the new code is stable to reduce the burden.  It's actually a good
thing because the new "sending page in rp-return thread" model is (IMHO)
even cleaner and with better performance.

Remove the old code that was responsible for maintaining preempt states; in
the meantime also remove the x-postcopy-preempt-break-huge parameter because
with concurrent sender threads we don't really need to break-huge anymore.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/migration.c |   2 -
 migration/migration.h |   7 -
 migration/ram.c   | 291 +-
 3 files changed, 3 insertions(+), 297 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index c0fd6588a4..59c6aa3960 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -4401,8 +4401,6 @@ static Property migration_properties[] = {
 DEFINE_PROP_SIZE("announce-step", MigrationState,
   parameters.announce_step,
   DEFAULT_MIGRATE_ANNOUNCE_STEP),
-DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
-  postcopy_preempt_break_huge, true),
 DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
 DEFINE_PROP_STRING("tls-hostname", MigrationState, 
parameters.tls_hostname),
 DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
diff --git a/migration/migration.h b/migration/migration.h
index cdad8aceaa..ae4ffd3454 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -340,13 +340,6 @@ struct MigrationState {
 bool send_configuration;
 /* Whether we send section footer during migration */
 bool send_section_footer;
-/*
- * Whether we allow break sending huge pages when postcopy preempt is
- * enabled.  When disabled, we won't interrupt precopy within sending a
- * host huge page, which is the old behavior of vanilla postcopy.
- * NOTE: this parameter is ignored if postcopy preempt is not enabled.
- */
-bool postcopy_preempt_break_huge;
 
 /* Needed by postcopy-pause state */
 QemuSemaphore postcopy_pause_sem;
diff --git a/migration/ram.c b/migration/ram.c
index d5a3fd610f..db3bf51dad 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -97,28 +97,6 @@ struct PageSearchStatus {
 unsigned long page;
 /* Set once we wrap around */
 bool complete_round;
-/*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy.  When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
-bool postcopy_requested;
-/*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel
- * will be the precopy channel (so that we don't split brain on that
- * specific page since the precopy channel already contains partial of
- * that page data).
- *
- * Besides that specific use case, postcopy_target_channel should
- * always be equal to postcopy_requested, because by default we send
- * postcopy pages via postcopy preempt channel.
- */
-bool postcopy_target_channel;
 /* Whether we're sending a host page */
 bool  host_page_sending;
 /* The start/end of current host page.  Invalid if 
host_page_sending==false */
@@ -343,20 +321,6 @@ struct RAMSrcPageRequest {
 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 };
 
-typedef struct {
-/*
- * Cached ramblock/offset values if preempted.  They're only meaningful if
- * preempted==true below.
- */
-RAMBlock *ram_block;
-unsigned long ram_page;
-/*
- * Whether a postcopy preemption just happened.  Will be reset after
- * precopy recovered to background migration.
- */
-bool preempted;
-} PostcopyPreemptState;
-
 /* State of RAM for migration */
 struct RAMState {
 /* QEMUFile used for this migration */
@@ -419,14 +383,6 @@ struct RAMState {
 /* Queue of outstanding page requests from the destination */
 QemuMutex 

[PATCH v2 13/15] migration: Send requested page directly in rp-return thread

2022-10-11 Thread Peter Xu
With all the facilities ready, send the requested page directly in the
rp-return thread rather than queuing it in the request queue, if and only
if postcopy preempt is enabled.  It can achieve so because it uses separate
channel for sending urgent pages.  The only shared data is bitmap and it's
protected by the bitmap_mutex.

Note that since we're moving the ownership of the urgent channel from the
migration thread to rp thread it also means the rp thread is responsible
for managing the qemufile, e.g. properly close it when pausing migration
happens.  For this, let migration_release_from_dst_file to cover shutdown
of the urgent channel too, renaming it as migration_release_dst_files() to
better show what it does.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/migration.c |  35 +++--
 migration/ram.c   | 112 ++
 2 files changed, 131 insertions(+), 16 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 4364813d82..c0fd6588a4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2847,8 +2847,11 @@ static int migrate_handle_rp_resume_ack(MigrationState 
*s, uint32_t value)
 return 0;
 }
 
-/* Release ms->rp_state.from_dst_file in a safe way */
-static void migration_release_from_dst_file(MigrationState *ms)
+/*
+ * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
+ * existed) in a safe way.
+ */
+static void migration_release_dst_files(MigrationState *ms)
 {
 QEMUFile *file;
 
@@ -2861,6 +2864,18 @@ static void 
migration_release_from_dst_file(MigrationState *ms)
 ms->rp_state.from_dst_file = NULL;
 }
 
+/*
+ * Do the same to postcopy fast path socket too if there is.  No
+ * locking needed because this qemufile should only be managed by
+ * return path thread.
+ */
+if (ms->postcopy_qemufile_src) {
+migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
+qemu_file_shutdown(ms->postcopy_qemufile_src);
+qemu_fclose(ms->postcopy_qemufile_src);
+ms->postcopy_qemufile_src = NULL;
+}
+
 qemu_fclose(file);
 }
 
@@ -3005,7 +3020,7 @@ out:
  * Maybe there is something we can do: it looks like a
  * network down issue, and we pause for a recovery.
  */
-migration_release_from_dst_file(ms);
+migration_release_dst_files(ms);
 rp = NULL;
 if (postcopy_pause_return_path_thread(ms)) {
 /*
@@ -3023,7 +3038,7 @@ out:
 }
 
 trace_source_return_path_thread_end();
-migration_release_from_dst_file(ms);
+migration_release_dst_files(ms);
 rcu_unregister_thread();
 return NULL;
 }
@@ -3546,18 +3561,6 @@ static MigThrError postcopy_pause(MigrationState *s)
 qemu_file_shutdown(file);
 qemu_fclose(file);
 
-/*
- * Do the same to postcopy fast path socket too if there is.  No
- * locking needed because no racer as long as we do this before setting
- * status to paused.
- */
-if (s->postcopy_qemufile_src) {
-migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
-qemu_file_shutdown(s->postcopy_qemufile_src);
-qemu_fclose(s->postcopy_qemufile_src);
-s->postcopy_qemufile_src = NULL;
-}
-
 migrate_set_state(>state, s->state,
   MIGRATION_STATUS_POSTCOPY_PAUSED);
 
diff --git a/migration/ram.c b/migration/ram.c
index cbaa5650b8..d5a3fd610f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -546,6 +546,8 @@ static QemuThread *decompress_threads;
 static QemuMutex decomp_done_lock;
 static QemuCond decomp_done_cond;
 
+static int ram_save_host_page_urgent(PageSearchStatus *pss);
+
 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock 
*block,
  ram_addr_t offset, uint8_t *source_buf);
 
@@ -560,6 +562,16 @@ static void pss_init(PageSearchStatus *pss, RAMBlock *rb, 
ram_addr_t page)
 pss->complete_round = false;
 }
 
+/*
+ * Check whether two PSSs are actively sending the same page.  Return true
+ * if it is, false otherwise.
+ */
+static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
+{
+return pss1->host_page_sending && pss2->host_page_sending &&
+(pss1->host_page_start == pss2->host_page_start);
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2260,6 +2272,57 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t 
start, ram_addr_t len)
 return -1;
 }
 
+/*
+ * When with postcopy preempt, we send back the page directly in the
+ * rp-return thread.
+ */
+if (postcopy_preempt_active()) {
+ram_addr_t page_start = start >> TARGET_PAGE_BITS;
+size_t page_size = qemu_ram_pagesize(ramblock);
+PageSearchStatus *pss = 

Re: [PATCH V5 3/3] hw/riscv: virt: Enable booting S-mode firmware from pflash

2022-10-11 Thread Bernhard Beschow
Am 4. Oktober 2022 09:23:51 UTC schrieb Sunil V L :
>To boot S-mode firmware payload like EDK2 from persistent
>flash storage, qemu needs to pass the flash address as the
>next_addr in fw_dynamic_info to the opensbi.
>
>When both -kernel and -pflash options are provided in command line,
>the kernel (and initrd if -initrd) will be copied to fw_cfg table.
>The S-mode FW will load the kernel/initrd from fw_cfg table.
>
>If only pflash is given but not -kernel, then it is the job of
>of the S-mode firmware to locate and load the kernel.
>
>In either case, update the kernel_entry with the flash address
>so that the opensbi can jump to the entry point of the S-mode
>firmware.
>
>Signed-off-by: Sunil V L 
>Reviewed-by: Andrew Jones 
>---
> hw/riscv/boot.c | 29 +
> hw/riscv/virt.c | 18 +-
> include/hw/riscv/boot.h |  1 +
> 3 files changed, 47 insertions(+), 1 deletion(-)
>
>diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
>index 1ae7596873..fa8ad27da2 100644
>--- a/hw/riscv/boot.c
>+++ b/hw/riscv/boot.c
>@@ -338,3 +338,32 @@ void riscv_setup_direct_kernel(hwaddr kernel_addr, hwaddr 
>fdt_addr)
> riscv_cpu->env.fdt_addr = fdt_addr;
> }
> }
>+
>+void riscv_setup_firmware_boot(MachineState *machine)
>+{
>+if (machine->kernel_filename) {
>+FWCfgState *fw_cfg;
>+fw_cfg = fw_cfg_find();
>+
>+assert(fw_cfg);
>+/*
>+ * Expose the kernel, the command line, and the initrd in fw_cfg.
>+ * We don't process them here at all, it's all left to the
>+ * firmware.
>+ */
>+load_image_to_fw_cfg(fw_cfg,
>+ FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA,
>+ machine->kernel_filename,
>+ true);
>+load_image_to_fw_cfg(fw_cfg,
>+ FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA,
>+ machine->initrd_filename, false);
>+
>+if (machine->kernel_cmdline) {
>+fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
>+   strlen(machine->kernel_cmdline) + 1);
>+fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA,
>+  machine->kernel_cmdline);
>+}
>+}
>+}
>diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>index de2efccebf..a5bc7353b4 100644
>--- a/hw/riscv/virt.c
>+++ b/hw/riscv/virt.c
>@@ -1274,7 +1274,23 @@ static void virt_machine_done(Notifier *notifier, void 
>*data)
> s->fw_cfg = create_fw_cfg(machine);
> rom_set_fw(s->fw_cfg);
> 
>-if (machine->kernel_filename) {
>+if (drive_get(IF_PFLASH, 0, 1)) {
>+/*
>+ * S-mode FW like EDK2 will be kept in second plash (unit 1).

Nitpicking: s/plash/pflash/ ?

Best regards,
Bernhard

>+ * When both kernel, initrd and pflash options are provided in the
>+ * command line, the kernel and initrd will be copied to the fw_cfg
>+ * table and opensbi will jump to the flash address which is the
>+ * entry point of S-mode FW. It is the job of the S-mode FW to load
>+ * the kernel and initrd using fw_cfg table.
>+ *
>+ * If only pflash is given but not -kernel, then it is the job of
>+ * of the S-mode firmware to locate and load the kernel.
>+ * In either case, the next_addr for opensbi will be the flash 
>address.
>+ */
>+riscv_setup_firmware_boot(machine);
>+kernel_entry = virt_memmap[VIRT_FLASH].base +
>+   virt_memmap[VIRT_FLASH].size / 2;
>+} else if (machine->kernel_filename) {
> kernel_start_addr = riscv_calc_kernel_start_addr(>soc[0],
>  firmware_end_addr);
> 
>diff --git a/include/hw/riscv/boot.h b/include/hw/riscv/boot.h
>index a36f7618f5..93e5f8760d 100644
>--- a/include/hw/riscv/boot.h
>+++ b/include/hw/riscv/boot.h
>@@ -57,5 +57,6 @@ void riscv_rom_copy_firmware_info(MachineState *machine, 
>hwaddr rom_base,
>   uint32_t reset_vec_size,
>   uint64_t kernel_entry);
> void riscv_setup_direct_kernel(hwaddr kernel_addr, hwaddr fdt_addr);
>+void riscv_setup_firmware_boot(MachineState *machine);
> 
> #endif /* RISCV_BOOT_H */




[PATCH v2 10/15] migration: Add pss_init()

2022-10-11 Thread Peter Xu
Helper to init PSS structures.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index d81dd3fdac..44967e72b2 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -542,6 +542,14 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
  bool postcopy_requested);
 
+/* NOTE: page is the PFN not real ram_addr_t. */
+static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
+{
+pss->block = rb;
+pss->page = page;
+pss->complete_round = false;
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2645,9 +2653,7 @@ static int ram_find_and_save_block(RAMState *rs)
 rs->last_page = 0;
 }
 
-pss.block = rs->last_seen_block;
-pss.page = rs->last_page;
-pss.complete_round = false;
+pss_init(, rs->last_seen_block, rs->last_page);
 
 do {
 again = true;
-- 
2.37.3




[PATCH v2 05/15] migration: Remove RAMState.f references in compression code

2022-10-11 Thread Peter Xu
Removing referencing to RAMState.f in compress_page_with_multi_thread() and
flush_compressed_data().

Compression code by default isn't compatible with having >1 channels (or it
won't currently know which channel to flush the compressed data), so to
make it simple we always flush on the default to_dst_file port until
someone wants to add >1 ports support, as rs->f right now can really
change (after postcopy preempt is introduced).

There should be no functional change at all after patch applied, since as
long as rs->f referenced in compression code, it must be to_dst_file.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index c90feedb13..b9ac2d6921 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1461,6 +1461,7 @@ static bool save_page_use_compression(RAMState *rs);
 
 static void flush_compressed_data(RAMState *rs)
 {
+MigrationState *ms = migrate_get_current();
 int idx, len, thread_count;
 
 if (!save_page_use_compression(rs)) {
@@ -1479,7 +1480,7 @@ static void flush_compressed_data(RAMState *rs)
 for (idx = 0; idx < thread_count; idx++) {
qemu_mutex_lock(&comp_param[idx].mutex);
 if (!comp_param[idx].quit) {
-len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
 /*
  * it's safe to fetch zero_page without holding comp_done_lock
  * as there is no further request submitted to the thread,
@@ -1498,11 +1499,11 @@ static inline void set_compress_params(CompressParam 
*param, RAMBlock *block,
 param->offset = offset;
 }
 
-static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
-   ram_addr_t offset)
+static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
 {
 int idx, thread_count, bytes_xmit = -1, pages = -1;
 bool wait = migrate_compress_wait_thread();
+MigrationState *ms = migrate_get_current();
 
 thread_count = migrate_compress_threads();
qemu_mutex_lock(&comp_done_lock);
@@ -1510,7 +1511,8 @@ retry:
 for (idx = 0; idx < thread_count; idx++) {
 if (comp_param[idx].done) {
 comp_param[idx].done = false;
-bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
+comp_param[idx].file);
qemu_mutex_lock(&comp_param[idx].mutex);
set_compress_params(&comp_param[idx], block, offset);
qemu_cond_signal(&comp_param[idx].cond);
@@ -2263,7 +2265,7 @@ static bool save_compress_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
 return false;
 }
 
-if (compress_page_with_multi_thread(rs, block, offset) > 0) {
+if (compress_page_with_multi_thread(block, offset) > 0) {
 return true;
 }
 
-- 
2.37.3




[PATCH v2 08/15] migration: Teach PSS about host page

2022-10-11 Thread Peter Xu
Migration code has a lot to do with host pages.  Teaching PSS core about
the idea of host page helps a lot and makes the code clean.  Meanwhile,
this prepares for the future changes that can leverage the new PSS helpers
that this patch introduces to send host page in another thread.

Three more fields are introduced for this:

  (1) host_page_sending: this is set to true when QEMU is sending a host
  page, false otherwise.

  (2) host_page_{start|end}: these point to the start/end of host page
  we're sending, and it's only valid when host_page_sending==true.

For example, when we look up the next dirty page on the ramblock, with
host_page_sending==true, we'll not try to look for anything beyond the
current host page boundary.  This can be slightly more efficient than the
current code because currently we'll set pss->page to the next dirty bit
(which can be over the current host page boundary) and reset it to the host
page boundary if we find it goes beyond that.

With above, we can easily make migration_bitmap_find_dirty() self contained
by updating pss->page properly.  The rs* parameter is removed because it's
not even used in the old code.

When sending a host page, we should use the pss helpers like this:

  - pss_host_page_prepare(pss): called before sending host page
  - pss_within_range(pss): whether we're still working on the current host page?
  - pss_host_page_finish(pss): called after sending a host page

Then we can use ram_save_target_page() to save one small page.

Currently ram_save_host_page() is still the only user. If there'll be
another function to send host page (e.g. in return path thread) in the
future, it should follow the same style.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 95 +++--
 1 file changed, 76 insertions(+), 19 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index f5a86265c7..ebb4737deb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -481,6 +481,11 @@ struct PageSearchStatus {
  * postcopy pages via postcopy preempt channel.
  */
 bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Only valid if 
host_page_sending==true */
+unsigned long host_page_start;
+unsigned long host_page_end;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
@@ -858,26 +863,38 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 }
 
 /**
- * migration_bitmap_find_dirty: find the next dirty page from start
+ * pss_find_next_dirty: find the next dirty page of current ramblock
  *
- * Returns the page offset within memory region of the start of a dirty page
+ * This function updates pss->page to point to the next dirty page index
+ * within the ramblock to migrate, or the end of ramblock when nothing
+ * found.  Note that when pss->host_page_sending==true it means we're
+ * during sending a host page, so we won't look for dirty page that is
+ * outside the host page boundary.
  *
- * @rs: current RAM state
- * @rb: RAMBlock where to search for dirty pages
- * @start: page where we start the search
+ * @pss: the current page search status
  */
-static inline
-unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
-  unsigned long start)
+static void pss_find_next_dirty(PageSearchStatus *pss)
 {
+RAMBlock *rb = pss->block;
 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 unsigned long *bitmap = rb->bmap;
 
 if (ramblock_is_ignored(rb)) {
-return size;
+/* Points directly to the end, so we know no dirty page */
+pss->page = size;
+return;
+}
+
+/*
+ * If during sending a host page, only look for dirty pages within the
+ * current host page being send.
+ */
+if (pss->host_page_sending) {
+assert(pss->host_page_end);
+size = MIN(size, pss->host_page_end);
 }
 
-return find_next_bit(bitmap, size, start);
+pss->page = find_next_bit(bitmap, size, pss->page);
 }
 
 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
@@ -1563,7 +1580,9 @@ static bool find_dirty_block(RAMState *rs, 
PageSearchStatus *pss, bool *again)
 pss->postcopy_requested = false;
 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
 
-pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
+/* Update pss->page for the next dirty bit in ramblock */
+pss_find_next_dirty(pss);
+
 if (pss->complete_round && pss->block == rs->last_seen_block &&
 pss->page >= rs->last_page) {
 /*
@@ -2452,6 +2471,44 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
 }
 }
 
+/* Should be called before sending a host page */
+static void pss_host_page_prepare(PageSearchStatus *pss)
+{
+/* How many guest pages are there in one host page? */
+size_t guest_pfns = 

[PATCH v2 12/15] migration: Move last_sent_block into PageSearchStatus

2022-10-11 Thread Peter Xu
Since we use PageSearchStatus to represent a channel, it makes perfect
sense to make last_sent_block (aka, leverage RAM_SAVE_FLAG_CONTINUE)
per-channel rather than global, because each channel can be sending
different pages on ramblocks.

Hence move it from RAMState into PageSearchStatus.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 71 -
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 486dc47583..cbaa5650b8 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -89,6 +89,8 @@ XBZRLECacheStats xbzrle_counters;
 struct PageSearchStatus {
 /* The migration channel used for a specific host page */
 QEMUFile*pss_channel;
+/* Last block from where we have sent data */
+RAMBlock *last_sent_block;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -368,8 +370,6 @@ struct RAMState {
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
 RAMBlock *last_seen_block;
-/* Last block from where we have sent data */
-RAMBlock *last_sent_block;
 /* Last dirty target page we have sent */
 ram_addr_t last_page;
 /* last ram version we have seen */
@@ -684,16 +684,17 @@ exit:
  *
  * Returns the number of bytes written
  *
- * @f: QEMUFile where to send the data
+ * @pss: current PSS channel status
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  *  in the lower bits, it contains flags
  */
-static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
+static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
-bool same_block = (block == rs->last_sent_block);
+bool same_block = (block == pss->last_sent_block);
+QEMUFile *f = pss->pss_channel;
 
 if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
@@ -706,7 +707,7 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f,  
RAMBlock *block,
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 size += 1 + len;
-rs->last_sent_block = block;
+pss->last_sent_block = block;
 }
 return size;
 }
@@ -790,17 +791,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, 
ram_addr_t current_addr)
  *  -1 means that xbzrle would be longer than normal
  *
  * @rs: current RAM state
+ * @pss: current PSS channel
  * @current_data: pointer to the address of the page contents
  * @current_addr: addr of the page
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 uint8_t **current_data, ram_addr_t current_addr,
 RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
+QEMUFile *file = pss->pss_channel;
 
 if (!cache_is_cached(XBZRLE.cache, current_addr,
  ram_counters.dirty_sync_count)) {
@@ -865,7 +868,7 @@ static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, file, block,
+bytes_xbzrle = save_page_header(pss, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 qemu_put_be16(file, encoded_len);
@@ -1296,19 +1299,19 @@ static void ram_release_page(const char *rbname, 
uint64_t offset)
  * Returns the size of data written to the file, 0 means the page is not
  * a zero page
  *
- * @rs: current RAM state
- * @file: the file where the data is saved
+ * @pss: current PSS channel
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
+static int save_zero_page_to_file(PageSearchStatus *pss,
   RAMBlock *block, ram_addr_t offset)
 {
 uint8_t *p = block->host + offset;
+QEMUFile *file = pss->pss_channel;
 int len = 0;
 
 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
-len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
+len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
 qemu_put_byte(file, 0);
 len += 1;
 ram_release_page(block->idstr, offset);
@@ -1321,14 +1324,14 @@ static int save_zero_page_to_file(RAMState *rs, 
QEMUFile *file,
  *
  * Returns the number of pages written.
  *
- * @rs: current RAM state
+ * @pss: current PSS channel
  * @block: block that contains the page we want to send
  * @offset: offset 

[PATCH v2 15/15] migration: Drop rs->f

2022-10-11 Thread Peter Xu
Now with rs->pss we can already cache channels in pss->pss_channel.  That
pss_channel contains more information than rs->f because it's per-channel.
So rs->f could be replaced by rs->pss[RAM_CHANNEL_PRECOPY].pss_channel,
while rs->f itself is a bit vague now.

Note that vanilla postcopy still sends pages via pss[RAM_CHANNEL_PRECOPY],
which is slightly confusing but it reflects the reality.

Then, after the replacement we can safely drop rs->f.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index db3bf51dad..538667b974 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -323,8 +323,6 @@ struct RAMSrcPageRequest {
 
 /* State of RAM for migration */
 struct RAMState {
-/* QEMUFile used for this migration */
-QEMUFile *f;
 /*
  * PageSearchStatus structures for the channels when send pages.
  * Protected by the bitmap_mutex.
@@ -2527,8 +2525,6 @@ static int ram_find_and_save_block(RAMState *rs)
 }
 
 if (found) {
-/* Cache rs->f in pss_channel (TODO: remove rs->f) */
-pss->pss_channel = rs->f;
 pages = ram_save_host_page(rs, pss);
 }
 } while (!pages && again);
@@ -3084,7 +3080,7 @@ static void ram_state_resume_prepare(RAMState *rs, 
QEMUFile *out)
 ram_state_reset(rs);
 
 /* Update RAMState cache of output QEMUFile */
-rs->f = out;
+rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
 
 trace_ram_state_resume_prepare(pages);
 }
@@ -3175,7 +3171,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 return -1;
 }
 }
-(*rsp)->f = f;
+(*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
 
 WITH_RCU_READ_LOCK_GUARD() {
 qemu_put_be64(f, ram_bytes_total_common(true) | 
RAM_SAVE_FLAG_MEM_SIZE);
@@ -3310,7 +3306,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 out:
 if (ret >= 0
 && migration_is_setup_or_active(migrate_get_current()->state)) {
-ret = multifd_send_sync_main(rs->f);
+ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
 if (ret < 0) {
 return ret;
 }
@@ -3380,7 +3376,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 return ret;
 }
 
-ret = multifd_send_sync_main(rs->f);
+ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
 if (ret < 0) {
 return ret;
 }
-- 
2.37.3




[PATCH 5/5] cxl: update tests and documentation for new cxl properties

2022-10-11 Thread Gregory Price
Adds explicit examples for --persistent-memdev and --volatile-memdev

Signed-off-by: Gregory Price 
---
 docs/system/devices/cxl.rst | 53 ++--
 tests/qtest/cxl-test.c  | 81 +++--
 2 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
index f25783a4ec..9e165064c8 100644
--- a/docs/system/devices/cxl.rst
+++ b/docs/system/devices/cxl.rst
@@ -300,15 +300,36 @@ Example topology involving a switch::
 
 Example command lines
 -
-A very simple setup with just one directly attached CXL Type 3 device::
+A very simple setup with just one directly attached CXL Type 3 Persistent 
Memory device::
 
   qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 
-cpu max \
   ...
-  -object 
memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
-  -object 
memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
+  -object 
memory-backend-file,pmem=true,id=pmem0,share=on,mem-path=/tmp/cxltest.raw,size=256M
 \
+  -object 
memory-backend-file,pmem=true,id=cxl-lsa0,share=on,mem-path=/tmp/lsa.raw,size=256M
 \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device 
cxl-type3,bus=root_port13,persistent-memdev=pmem0,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
+
+A very simple setup with just one directly attached CXL Type 3 Volatile Memory 
device::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 
-cpu max \
+  ...
+  -object memory-backend-ram,id=vmem0,share=on,size=256M \
   -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
   -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
-  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -device cxl-type3,bus=root_port13,volatile-memdev=vmem0,id=cxl-vmem0 \
+  -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
+
+The same volatile setup may optionally include an LSA region::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 
-cpu max \
+  ...
+  -object memory-backend-ram,id=vmem0,share=on,size=256M \
+  -object 
memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa.raw,size=256M \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device 
cxl-type3,bus=root_port13,volatile-memdev=vmem0,lsa=cxl-lsa0,id=cxl-vmem0 \
   -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
 
 A setup suitable for 4 way interleave. Only one fixed window provided, to 
enable 2 way
@@ -328,13 +349,13 @@ the CXL Type3 device directly attached (no switches).::
   -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
   -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \
   -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
-  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -device 
cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
   -device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \
-  -device cxl-type3,bus=root_port14,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
+  -device 
cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
   -device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \
-  -device cxl-type3,bus=root_port15,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
+  -device 
cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
   -device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \
-  -device cxl-type3,bus=root_port16,memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
+  -device 
cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
   -M 
cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.targets.1=cxl.2,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k
 
 An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave::
@@ -354,15 +375,23 @@ An example of 4 devices below a switch suitable for 1, 2 
or 4 way interleave::
   -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \
   -device cxl-upstream,bus=root_port0,id=us0 \
   -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \
-  -device 
cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0,size=256M \
+  -device 
cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0,size=256M
 \
   -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \
-  -device 
cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1,size=256M \
+  -device 
cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1,size=256M
 \
   -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \
-  -device 
cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2,size=256M \
+  -device 

[PATCH v2 04/15] migration: Trivial cleanup save_page_header() on same block check

2022-10-11 Thread Peter Xu
The 2nd check on RAM_SAVE_FLAG_CONTINUE is a bit redundant.  Use a boolean
to be clearer.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 562646609e..c90feedb13 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -661,14 +661,15 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, 
 RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
+bool same_block = (block == rs->last_sent_block);
 
-if (block == rs->last_sent_block) {
+if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
 }
 qemu_put_be64(f, offset);
 size = 8;
 
-if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
+if (!same_block) {
 len = strlen(block->idstr);
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
-- 
2.37.3




[PATCH v2 09/15] migration: Introduce pss_channel

2022-10-11 Thread Peter Xu
Introduce pss_channel for PageSearchStatus, define it as "the migration
channel to be used to transfer this host page".

We used to have rs->f, which is a mirror to MigrationState.to_dst_file.

After postcopy preempt initial version, rs->f can be dynamically changed
depending on which channel we want to use.

But that later work still doesn't grant full concurrency of sending pages
in e.g. different threads, because rs->f can either be the PRECOPY channel
or POSTCOPY channel.  This needs to be per-thread too.

PageSearchStatus is actually a good piece of struct which we can leverage
if we want to have multiple threads sending pages.  Sending a single guest
page may not make sense, so we make the granule to be "host page", and in
the PSS structure we allow specify a QEMUFile* to migrate a specific host
page.  Then we open the possibility to specify different channels in
different threads with different PSS structures.

The PSS prefix can be slightly misleading here because e.g. for the
upcoming usage of postcopy channel/thread it's not "searching" (or,
scanning) at all but sending the explicit page that was requested.  However
since PSS existed for some years keep it as-is until someone complains.

This patch mostly (simply) replaces rs->f with pss->pss_channel only. No
functional change intended for this patch yet.  But it does prepare to
finally drop rs->f, and make ram_save_guest_page() thread safe.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 70 +++--
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index ebb4737deb..d81dd3fdac 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -453,6 +453,8 @@ void dirty_sync_missed_zero_copy(void)
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -775,9 +777,9 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t 
current_addr)
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
-ram_addr_t current_addr, RAMBlock *block,
-ram_addr_t offset)
+static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+uint8_t **current_data, ram_addr_t current_addr,
+RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
@@ -845,11 +847,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, rs->f, block,
+bytes_xbzrle = save_page_header(rs, file, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
-qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
-qemu_put_be16(rs->f, encoded_len);
-qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
+qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
+qemu_put_be16(file, encoded_len);
+qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 bytes_xbzrle += encoded_len + 1 + 2;
 /*
  * Like compressed_size (please see update_compress_thread_counts),
@@ -1305,9 +1307,10 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile 
*file,
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
+static int save_zero_page(RAMState *rs, QEMUFile *file, RAMBlock *block,
+  ram_addr_t offset)
 {
-int len = save_zero_page_to_file(rs, rs->f, block, offset);
+int len = save_zero_page_to_file(rs, file, block, offset);
 
 if (len) {
 stat64_add(_atomic_counters.duplicate, 1);
@@ -1324,15 +1327,15 @@ static int save_zero_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
  *
  * Return true if the pages has been saved, otherwise false is returned.
  */
-static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
-  int *pages)
+static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
+  ram_addr_t offset, int *pages)
 {
 uint64_t bytes_xmit = 0;
 int ret;
 
 *pages = -1;
-ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
-&bytes_xmit);
+ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
+TARGET_PAGE_SIZE, &bytes_xmit);
 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
 return false;
 }
@@ -1366,17 +1369,17 @@ static bool control_save_page(RAMState *rs, RAMBlock 

[PATCH v2 11/15] migration: Make PageSearchStatus part of RAMState

2022-10-11 Thread Peter Xu
We used to allocate PSS structure on the stack for precopy when sending
pages.  Make it static, so as to describe per-channel ram migration status.

Here we declared RAM_CHANNEL_MAX instances, preparing for postcopy to use
it, even though this patch has not yet to start using the 2nd instance.

This should not have any functional change per se, but it already starts to
export PSS information via the RAMState, so that e.g. one PSS channel can
start to reference the other PSS channel.

Always protect PSS access using the same RAMState.bitmap_mutex.  We already
do so, so no code change needed, just some comment update.  Maybe we should
consider renaming bitmap_mutex some day as it's going to be a more commonly
used and bigger mutex for ram states, but just leave it for later.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 112 ++--
 1 file changed, 61 insertions(+), 51 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 44967e72b2..486dc47583 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -85,6 +85,46 @@
 
 XBZRLECacheStats xbzrle_counters;
 
+/* used by the search for pages to send */
+struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
+/* Current block being searched */
+RAMBlock*block;
+/* Current page to search from */
+unsigned long page;
+/* Set once we wrap around */
+bool complete_round;
+/*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy.  When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
+bool postcopy_requested;
+/*
+ * [POSTCOPY-ONLY] The target channel to use to send current page.
+ *
+ * Note: This may _not_ match with the value in postcopy_requested
+ * above. Let's imagine the case where the postcopy request is exactly
+ * the page that we're sending in progress during precopy. In this case
+ * we'll have postcopy_requested set to true but the target channel
+ * will be the precopy channel (so that we don't split brain on that
+ * specific page since the precopy channel already contains partial of
+ * that page data).
+ *
+ * Besides that specific use case, postcopy_target_channel should
+ * always be equal to postcopy_requested, because by default we send
+ * postcopy pages via postcopy preempt channel.
+ */
+bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Invalid if 
host_page_sending==false */
+unsigned long host_page_start;
+unsigned long host_page_end;
+};
+typedef struct PageSearchStatus PageSearchStatus;
+
 /* struct contains XBZRLE cache and a static page
used by the compression */
 static struct {
@@ -319,6 +359,11 @@ typedef struct {
 struct RAMState {
 /* QEMUFile used for this migration */
 QEMUFile *f;
+/*
+ * PageSearchStatus structures for the channels when send pages.
+ * Protected by the bitmap_mutex.
+ */
+PageSearchStatus pss[RAM_CHANNEL_MAX];
 /* UFFD file descriptor, used in 'write-tracking' migration */
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
@@ -362,7 +407,12 @@ struct RAMState {
 uint64_t target_page_count;
 /* number of dirty bits in the bitmap */
 uint64_t migration_dirty_pages;
-/* Protects modification of the bitmap and migration dirty pages */
+/*
+ * Protects:
+ * - dirty/clear bitmap
+ * - migration_dirty_pages
+ * - pss structures
+ */
 QemuMutex bitmap_mutex;
 /* The RAMBlock used in the last src_page_requests */
 RAMBlock *last_req_rb;
@@ -451,46 +501,6 @@ void dirty_sync_missed_zero_copy(void)
 ram_counters.dirty_sync_missed_zero_copy++;
 }
 
-/* used by the search for pages to send */
-struct PageSearchStatus {
-/* The migration channel used for a specific host page */
-QEMUFile*pss_channel;
-/* Current block being searched */
-RAMBlock*block;
-/* Current page to search from */
-unsigned long page;
-/* Set once we wrap around */
-bool complete_round;
-/*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy.  When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
-bool postcopy_requested;
-/*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel
- * will 

[PATCH 4/5] hw/cxl: Multi-Region CXL Type-3 Devices (Volatile and Persistent)

2022-10-11 Thread Gregory Price
This commit enables each CXL Type-3 device to contain one volatile
memory region and one persistent region.

Two new properties have been added to cxl-type3 device initialization:
[volatile-memdev] and [persistent-memdev]

The existing [memdev] property has been deprecated and will default the
memory region to a persistent memory region (although a user may assign
the region to a ram or file backed region). It cannot be used in
combination with the new [persistent-memdev] property.

Partitioning volatile memory from persistent memory is not yet supported.

Volatile memory is mapped at DPA(0x0), while Persistent memory is mapped
at DPA(vmem->size), per CXL Spec 8.2.9.8.2.0 - Get Partition Info.

Signed-off-by: Gregory Price 
---
 hw/cxl/cxl-mailbox-utils.c  |  21 ++--
 hw/mem/cxl_type3.c  | 197 ++--
 include/hw/cxl/cxl_device.h |  11 +-
 3 files changed, 162 insertions(+), 67 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 776c8cbadc..88d33e9a37 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -142,7 +142,7 @@ static ret_code cmd_firmware_update_get_info(struct cxl_cmd 
*cmd,
 } QEMU_PACKED *fw_info;
 QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50);
 
-if (cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) {
+if (cxl_dstate->mem_size < CXL_CAPACITY_MULTIPLIER) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -285,20 +285,20 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 
 CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
 CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
-uint64_t size = cxl_dstate->pmem_size;
 
-if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
+if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
 id = (void *)cmd->payload;
 memset(id, 0, sizeof(*id));
 
-/* PMEM only */
 snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
 
-id->total_capacity = size / CXL_CAPACITY_MULTIPLIER;
-id->persistent_capacity = size / CXL_CAPACITY_MULTIPLIER;
+id->total_capacity = cxl_dstate->mem_size / CXL_CAPACITY_MULTIPLIER;
+id->persistent_capacity = cxl_dstate->pmem_size / CXL_CAPACITY_MULTIPLIER;
+id->volatile_capacity = cxl_dstate->vmem_size / CXL_CAPACITY_MULTIPLIER;
 id->lsa_size = cvc->get_lsa_size(ct3d);
 id->poison_list_max_mer[1] = 0x1; /* 256 poison records */
 
@@ -317,16 +317,15 @@ static ret_code cmd_ccls_get_partition_info(struct 
cxl_cmd *cmd,
 uint64_t next_pmem;
 } QEMU_PACKED *part_info = (void *)cmd->payload;
 QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
-uint64_t size = cxl_dstate->pmem_size;
 
-if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
+if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
-/* PMEM only */
-part_info->active_vmem = 0;
+part_info->active_vmem = cxl_dstate->vmem_size / CXL_CAPACITY_MULTIPLIER;
 part_info->next_vmem = 0;
-part_info->active_pmem = size / CXL_CAPACITY_MULTIPLIER;
+part_info->active_pmem = cxl_dstate->pmem_size / CXL_CAPACITY_MULTIPLIER;
 part_info->next_pmem = 0;
 
 *len = sizeof(*part_info);
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index dda78704c2..c371cd06e1 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -131,11 +131,13 @@ static int ct3_build_cdat_table(CDATSubHeader 
***cdat_table,
 uint64_t dpa_base = 0;
 MemoryRegion *mr;
 
-if (!ct3d->hostmem | !host_memory_backend_get_memory(ct3d->hostmem)) {
+if ((!ct3d->hostvmem && !ct3d->hostpmem) ||
+(ct3d->hostvmem && !host_memory_backend_get_memory(ct3d->hostvmem)) ||
+(ct3d->hostpmem && !host_memory_backend_get_memory(ct3d->hostpmem))) {
 return -EINVAL;
 }
 
-dsmas_num = 1;
+dsmas_num = (ct3d->hostvmem ? 1 : 0) + (ct3d->hostpmem ? 1 : 0);
 dslbis_num = 4 * dsmas_num;
 dsemts_num = dsmas_num;
 
@@ -147,16 +149,30 @@ static int ct3_build_cdat_table(CDATSubHeader 
***cdat_table,
 return -ENOMEM;
 }
 
-mr = host_memory_backend_get_memory(ct3d->hostmem);
-cdat_len += ct3_build_dsmas([dsmad_handle],
-[4 * dsmad_handle],
-[dsmad_handle],
-mr,
-dsmad_handle,
-false,
-dpa_base);
-dpa_base += mr->size;
-dsmad_handle++;
+if (ct3d->hostvmem) {
+mr = host_memory_backend_get_memory(ct3d->hostvmem);
+cdat_len += ct3_build_dsmas([dsmad_handle],
+[4 * dsmad_handle],
+   

[PATCH v2 07/15] migration: Use atomic ops properly for page accountings

2022-10-11 Thread Peter Xu
To prepare for thread-safety on page accountings, at least below counters
need to be accessed only atomically, they are:

ram_counters.transferred
ram_counters.duplicate
ram_counters.normal
ram_counters.postcopy_bytes

There are a lot of other counters but they won't be accessed outside
migration thread, then they're still safe to be accessed without atomic
ops.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/migration.c | 10 +-
 migration/multifd.c   |  4 ++--
 migration/ram.c   | 40 
 migration/ram.h   | 20 
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index ef00bff0b3..4364813d82 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1048,13 +1048,13 @@ static void populate_ram_info(MigrationInfo *info, 
MigrationState *s)
 
 info->has_ram = true;
 info->ram = g_malloc0(sizeof(*info->ram));
-info->ram->transferred = ram_counters.transferred;
+info->ram->transferred = stat64_get(_atomic_counters.transferred);
 info->ram->total = ram_bytes_total();
-info->ram->duplicate = ram_counters.duplicate;
+info->ram->duplicate = stat64_get(_atomic_counters.duplicate);
 /* legacy value.  It is not used anymore */
 info->ram->skipped = 0;
-info->ram->normal = ram_counters.normal;
-info->ram->normal_bytes = ram_counters.normal * page_size;
+info->ram->normal = stat64_get(_atomic_counters.normal);
+info->ram->normal_bytes = info->ram->normal * page_size;
 info->ram->mbps = s->mbps;
 info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
 info->ram->dirty_sync_missed_zero_copy =
@@ -1065,7 +1065,7 @@ static void populate_ram_info(MigrationInfo *info, 
MigrationState *s)
 info->ram->pages_per_second = s->pages_per_second;
 info->ram->precopy_bytes = ram_counters.precopy_bytes;
 info->ram->downtime_bytes = ram_counters.downtime_bytes;
-info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
+info->ram->postcopy_bytes = 
stat64_get(_atomic_counters.postcopy_bytes);
 
 if (migrate_use_xbzrle()) {
 info->has_xbzrle_cache = true;
diff --git a/migration/multifd.c b/migration/multifd.c
index 586ddc9d65..6b1dc7c889 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -437,7 +437,7 @@ static int multifd_send_pages(QEMUFile *f)
 + p->packet_len;
 qemu_file_acct_rate_limit(f, transferred);
 ram_counters.multifd_bytes += transferred;
-ram_counters.transferred += transferred;
+stat64_add(_atomic_counters.transferred, transferred);
 qemu_mutex_unlock(>mutex);
 qemu_sem_post(>sem);
 
@@ -612,7 +612,7 @@ int multifd_send_sync_main(QEMUFile *f)
 p->pending_job++;
 qemu_file_acct_rate_limit(f, p->packet_len);
 ram_counters.multifd_bytes += p->packet_len;
-ram_counters.transferred += p->packet_len;
+stat64_add(_atomic_counters.transferred, p->packet_len);
 qemu_mutex_unlock(>mutex);
 qemu_sem_post(>sem);
 
diff --git a/migration/ram.c b/migration/ram.c
index 578ad8d70a..f5a86265c7 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -425,18 +425,25 @@ uint64_t ram_bytes_remaining(void)
0;
 }
 
+/*
+ * NOTE: not all stats in ram_counters are used in reality.  See comments
+ * for struct MigrationAtomicStats.  The ultimate result of ram migration
+ * counters will be a merged version with both ram_counters and the atomic
+ * fields in ram_atomic_counters.
+ */
 MigrationStats ram_counters;
+MigrationAtomicStats ram_atomic_counters;
 
 static void ram_transferred_add(uint64_t bytes)
 {
 if (runstate_is_running()) {
 ram_counters.precopy_bytes += bytes;
 } else if (migration_in_postcopy()) {
-ram_counters.postcopy_bytes += bytes;
+stat64_add(_atomic_counters.postcopy_bytes, bytes);
 } else {
 ram_counters.downtime_bytes += bytes;
 }
-ram_counters.transferred += bytes;
+stat64_add(_atomic_counters.transferred, bytes);
 }
 
 void dirty_sync_missed_zero_copy(void)
@@ -725,7 +732,7 @@ void mig_throttle_counter_reset(void)
 
 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 rs->num_dirty_pages_period = 0;
-rs->bytes_xfer_prev = ram_counters.transferred;
+rs->bytes_xfer_prev = stat64_get(_atomic_counters.transferred);
 }
 
 /**
@@ -1085,8 +1092,9 @@ uint64_t ram_pagesize_summary(void)
 
 uint64_t ram_get_total_transferred_pages(void)
 {
-return  ram_counters.normal + ram_counters.duplicate +
-compression_counters.pages + xbzrle_counters.pages;
+return  stat64_get(_atomic_counters.normal) +
+stat64_get(_atomic_counters.duplicate) +
+compression_counters.pages + xbzrle_counters.pages;
 }
 
 static void migration_update_rates(RAMState *rs, int64_t end_time)
@@ -1145,8 +1153,8 @@ 

[PATCH v2 00/15] migration: Postcopy Preempt-Full

2022-10-11 Thread Peter Xu
Based-on: <20221004182430.97638-1-pet...@redhat.com>
  [PATCH v2 0/5] migration: Bug fixes (prepare for preempt-full)

Tree is here:
  https://github.com/xzpeter/qemu/tree/preempt-full

RFC: https://lore.kernel.org/r/20220829165659.96046-1-pet...@redhat.com
v1:  https://lore.kernel.org/r/20220920225106.48451-1-pet...@redhat.com

This patchset is v2 version of preempt-full series.

v2 changelog:
- Add r-bs
- Rename release_lock to preempt_active [Dave]
- Create MigrationAtomicStats structure, maintaining the 4 stats fields
  that may need atomic access since they'll be accessed by multiple
  threads. define them with Stat64.  Add rich comment. [Dave]
- One more patch added (patch 1) to fix another deadlock; not easy to
  reproduce but actually quite obvious..

The existing preempt code has reduced random page req latency over 10Gbps
network from ~12ms to ~500us which has already landed.

This preempt-full series can further reduce that ~500us to ~230us per my
initial test.

Note that no new capability is needed, IOW it's fully compatible with the
existing preempt mode.  So the naming is actually not important but just to
identify the difference on the binaries.

The logic of the series is simple: send urgent pages in rp-return thread
rather than migration thread.  It also means rp-thread will take over the
ownership of the newly created preempt channel.  It can slow down rp-return
thread on receiving page requests, but I don't really see a major issue
with it so far — only benefits.

For detailed performance numbers, please refer to the rfc cover letter.

Please have a look, thanks.

Peter Xu (15):
  migration: Take bitmap mutex when completing ram migration
  migration: Add postcopy_preempt_active()
  migration: Cleanup xbzrle zero page cache update logic
  migration: Trivial cleanup save_page_header() on same block check
  migration: Remove RAMState.f references in compression code
  migration: Yield bitmap_mutex properly when sending/sleeping
  migration: Use atomic ops properly for page accountings
  migration: Teach PSS about host page
  migration: Introduce pss_channel
  migration: Add pss_init()
  migration: Make PageSearchStatus part of RAMState
  migration: Move last_sent_block into PageSearchStatus
  migration: Send requested page directly in rp-return thread
  migration: Remove old preempt code around state maintainance
  migration: Drop rs->f

 migration/migration.c |  47 +--
 migration/migration.h |   7 -
 migration/multifd.c   |   4 +-
 migration/ram.c   | 720 +++---
 migration/ram.h   |  20 ++
 5 files changed, 379 insertions(+), 419 deletions(-)

-- 
2.37.3




[PATCH v2 06/15] migration: Yield bitmap_mutex properly when sending/sleeping

2022-10-11 Thread Peter Xu
Don't take the bitmap mutex when sending pages, or when being throttled by
migration_rate_limit() (which is a bit tricky to call it here in ram code,
but seems still helpful).

It prepares for the possibility of concurrently sending pages in >1 threads
using the function ram_save_host_page() because all threads may need the
bitmap_mutex to operate on bitmaps, so that either sendmsg() or any kind of
qemu_sem_wait() blocking for one thread will not block the other from
progressing.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 41 ++---
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index b9ac2d6921..578ad8d70a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2462,6 +2462,7 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
  */
 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
 {
+bool page_dirty, preempt_active = postcopy_preempt_active();
 int tmppages, pages = 0;
 size_t pagesize_bits =
 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
@@ -2485,22 +2486,40 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
 break;
 }
 
-/* Check the pages is dirty and if it is send it */
-if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
-tmppages = ram_save_target_page(rs, pss);
-if (tmppages < 0) {
-return tmppages;
-}
+page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
 
-pages += tmppages;
+/* Check the pages is dirty and if it is send it */
+if (page_dirty) {
 /*
- * Allow rate limiting to happen in the middle of huge pages if
- * something is sent in the current iteration.
+ * Properly yield the lock only in postcopy preempt mode
+ * because both migration thread and rp-return thread can
+ * operate on the bitmaps.
  */
-if (pagesize_bits > 1 && tmppages > 0) {
-migration_rate_limit();
+if (preempt_active) {
+qemu_mutex_unlock(>bitmap_mutex);
+}
+tmppages = ram_save_target_page(rs, pss);
+if (tmppages >= 0) {
+pages += tmppages;
+/*
+ * Allow rate limiting to happen in the middle of huge pages if
+ * something is sent in the current iteration.
+ */
+if (pagesize_bits > 1 && tmppages > 0) {
+migration_rate_limit();
+}
 }
+if (preempt_active) {
+qemu_mutex_lock(>bitmap_mutex);
+}
+} else {
+tmppages = 0;
 }
+
+if (tmppages < 0) {
+return tmppages;
+}
+
 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 } while ((pss->page < hostpage_boundary) &&
  offset_in_ramblock(pss->block,
-- 
2.37.3




[PATCH 2/5] hw/cxl: Add CXL_CAPACITY_MULTIPLIER definition

2022-10-11 Thread Gregory Price
Remove usage of magic numbers when accessing capacity fields and replace
with CXL_CAPACITY_MULTIPLIER, matching the kernel definition.

Signed-off-by: Gregory Price 
Reviewed-by: Davidlohr Bueso 
---
 hw/cxl/cxl-mailbox-utils.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index c7e1a88b44..776c8cbadc 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -14,6 +14,8 @@
 #include "qemu/log.h"
 #include "qemu/uuid.h"
 
+#define CXL_CAPACITY_MULTIPLIER   0x1000 /* SZ_256M */
+
 /*
  * How to add a new command, example. The command set FOO, with cmd BAR.
  *  1. Add the command set and cmd to the enum.
@@ -140,7 +142,7 @@ static ret_code cmd_firmware_update_get_info(struct cxl_cmd 
*cmd,
 } QEMU_PACKED *fw_info;
 QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50);
 
-if (cxl_dstate->pmem_size < (256 << 20)) {
+if (cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -285,7 +287,7 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
 uint64_t size = cxl_dstate->pmem_size;
 
-if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
+if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -295,8 +297,8 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 /* PMEM only */
 snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
 
-id->total_capacity = size / (256 << 20);
-id->persistent_capacity = size / (256 << 20);
+id->total_capacity = size / CXL_CAPACITY_MULTIPLIER;
+id->persistent_capacity = size / CXL_CAPACITY_MULTIPLIER;
 id->lsa_size = cvc->get_lsa_size(ct3d);
 id->poison_list_max_mer[1] = 0x1; /* 256 poison records */
 
@@ -317,14 +319,14 @@ static ret_code cmd_ccls_get_partition_info(struct 
cxl_cmd *cmd,
 QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
 uint64_t size = cxl_dstate->pmem_size;
 
-if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
+if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
 /* PMEM only */
 part_info->active_vmem = 0;
 part_info->next_vmem = 0;
-part_info->active_pmem = size / (256 << 20);
+part_info->active_pmem = size / CXL_CAPACITY_MULTIPLIER;
 part_info->next_pmem = 0;
 
 *len = sizeof(*part_info);
-- 
2.37.3




[PATCH v2 03/15] migration: Cleanup xbzrle zero page cache update logic

2022-10-11 Thread Peter Xu
The major change is to replace "!save_page_use_compression()" with
"xbzrle_enabled" to make it clear.

Reasonings:

(1) When compression enabled, "!save_page_use_compression()" is exactly the
same as checking "xbzrle_enabled".

(2) When compression disabled, "!save_page_use_compression()" always returns
true.  We used to try calling the xbzrle code, but after this change we
won't, and we shouldn't need to.

While at it, drop the xbzrle_enabled check in xbzrle_cache_zero_page()
because with this change it's not needed anymore.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7aaa843a21..562646609e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -741,10 +741,6 @@ void mig_throttle_counter_reset(void)
  */
 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 {
-if (!rs->xbzrle_enabled) {
-return;
-}
-
 /* We don't care if this fails to allocate a new cache page
  * as long as it updated an old one */
 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
@@ -2301,7 +2297,7 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
  * page would be stale
  */
-if (!save_page_use_compression(rs)) {
+if (rs->xbzrle_enabled) {
 XBZRLE_cache_lock();
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
-- 
2.37.3




[PATCH 0/5] Multi-Region and Volatile Memory support for CXL Type-3 Devices

2022-10-11 Thread Gregory Price
Summary of Changes:
1) Correction of PCI_CLASS from STORAGE_EXPRESS to MEMORY_CXL on init
2) Add CXL_CAPACITY_MULTIPLIER definition to replace magic numbers
3) Refactor CDAT DSMAS Initialization for multi-region initialization
4) Multi-Region and Volatile Memory support for CXL Type-3 Devices
5) Test and Documentation updates

Developed with input from Jonathan Cameron and Davidlohr Bueso.

This series brings 2 features to CXL Type-3 Devices:
1) Volatile Memory Region support
2) Multi-Region support (1 Volatile, 1 Persistent)

In this series we implement multi-region and volatile region support
through 6 major changes to CXL devices
1) The HostMemoryBackend [hostmem] has been replaced by two
   [hostvmem] and [hostpmem] to store volatile and persistent memory
   respectively
2) The single AddressSpace has been replaced by two AddressSpaces
   [hostvmem_as] and [hostpmem_as] to map respective memdevs.
3) Each memory region size and total region are stored separately
4) The CDAT and DVSEC memory map entries have been updated:
   a) if vmem is present, vmem is mapped at DPA(0)
   b) if pmem is present
  i)  and vmem is present, pmem is mapped at DPA(vmem->size)
  ii) else, pmem is mapped at DPA(0)
   c) partitioning of pmem is not supported in this patch set but
  has been discussed and this design should suffice.
5) Read/Write functions have been updated to access AddressSpaces
   according to the mapping described in #4
6) cxl-mailbox has been updated to report the respective size of
   volatile and persistent memory regions

CXL Spec (3.0) Section 8.2.9.8.2.0 - Get Partition Info
  Active Volatile Memory
The device shall provide this volatile capacity starting at DPA 0
  Active Persistent Memory
The device shall provide this persistent capacity starting at the
DPA immediately following the volatile capacity

Partitioning of Persistent Memory regions may be supported on following
patch sets.

Submitted as an extension to the CDAT emulation because the CDAT DSMAS
entry concerns memory mapping and is required to successfully map memory
regions correctly in bios/efi.

Gregory Price (5):
  hw/cxl: set cxl-type3 device type to PCI_CLASS_MEMORY_CXL
  hw/cxl: Add CXL_CAPACITY_MULTIPLIER definition
  hw/mem/cxl_type: Generalize CDATDsmas initialization for Memory
Regions
  hw/cxl: Multi-Region CXL Type-3 Devices (Volatile and Persistent)
  cxl: update tests and documentation for new cxl properties

 docs/system/devices/cxl.rst |  53 -
 hw/cxl/cxl-mailbox-utils.c  |  23 +-
 hw/mem/cxl_type3.c  | 449 +++-
 include/hw/cxl/cxl_device.h |  11 +-
 tests/qtest/cxl-test.c  |  81 ++-
 5 files changed, 416 insertions(+), 201 deletions(-)

-- 
2.37.3




[PATCH v2 02/15] migration: Add postcopy_preempt_active()

2022-10-11 Thread Peter Xu
Add the helper to show that postcopy preempt enabled, meanwhile active.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
 migration/ram.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index cfeb571800..7aaa843a21 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -162,6 +162,11 @@ out:
 return ret;
 }
 
+static bool postcopy_preempt_active(void)
+{
+return migrate_postcopy_preempt() && migration_in_postcopy();
+}
+
 bool ramblock_is_ignored(RAMBlock *block)
 {
 return !qemu_ram_is_migratable(block) ||
@@ -2433,7 +2438,7 @@ static void postcopy_preempt_choose_channel(RAMState *rs, 
PageSearchStatus *pss)
 /* We need to make sure rs->f always points to the default channel elsewhere */
 static void postcopy_preempt_reset_channel(RAMState *rs)
 {
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
 rs->f = migrate_get_current()->to_dst_file;
 trace_postcopy_preempt_reset_channel();
@@ -2471,7 +2476,7 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
 return 0;
 }
 
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 postcopy_preempt_choose_channel(rs, pss);
 }
 
-- 
2.37.3




[PATCH v2 01/15] migration: Take bitmap mutex when completing ram migration

2022-10-11 Thread Peter Xu
Any call to ram_find_and_save_block() needs to take the bitmap mutex.  We
used to not take it for most of ram_save_complete() because we thought
we're the only one left using the bitmap, but it's not true after the
preempt full patchset applied, since the return path can be taking it too.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 1338e47665..cfeb571800 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3406,6 +3406,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 /* try transferring iterative blocks of memory */
 
 /* flush all remaining blocks regardless of rate limiting */
+qemu_mutex_lock(>bitmap_mutex);
 while (true) {
 int pages;
 
@@ -3419,6 +3420,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 break;
 }
 }
+qemu_mutex_unlock(>bitmap_mutex);
 
 flush_compressed_data(rs);
 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
-- 
2.37.3




[PATCH 3/5] hw/mem/cxl_type: Generalize CDATDsmas initialization for Memory Regions

2022-10-11 Thread Gregory Price
This is a preparatory commit for enabling multiple memory regions within
a single CXL Type-3 device.  We will need to initialize multiple CDAT
DSMAS regions (and subsequent DSLBIS, and DSEMTS entries), so generalize
the initialization into a function.

Signed-off-by: Gregory Price 
---
 hw/mem/cxl_type3.c | 275 +
 1 file changed, 154 insertions(+), 121 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 282f274266..dda78704c2 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -24,145 +24,178 @@
 #define UI64_NULL ~(0ULL)
 #define DWORD_BYTE 4
 
+static int ct3_build_dsmas(CDATDsmas *dsmas,
+   CDATDslbis *dslbis,
+   CDATDsemts *dsemts,
+   MemoryRegion *mr,
+   int dsmad_handle,
+   bool is_pmem,
+   uint64_t dpa_base)
+{
+int len = 0;
+/* ttl_len should be incremented for every entry */
+
+/* Device Scoped Memory Affinity Structure */
+*dsmas = (CDATDsmas) {
+.header = {
+.type = CDAT_TYPE_DSMAS,
+.length = sizeof(*dsmas),
+},
+.DSMADhandle = dsmad_handle,
+.flags = (is_pmem ? CDAT_DSMAS_FLAG_NV : 0),
+.DPA_base = dpa_base,
+.DPA_length = int128_get64(mr->size),
+};
+len++;
+
+/* For now, no memory side cache, plausiblish numbers */
+dslbis[0] = (CDATDslbis) {
+.header = {
+.type = CDAT_TYPE_DSLBIS,
+.length = sizeof(*dslbis),
+},
+.handle = dsmad_handle,
+.flags = HMAT_LB_MEM_MEMORY,
+.data_type = HMAT_LB_DATA_READ_LATENCY,
+.entry_base_unit = 1, /* 10ns base */
+.entry[0] = 15, /* 150ns */
+};
+len++;
+
+dslbis[1] = (CDATDslbis) {
+.header = {
+.type = CDAT_TYPE_DSLBIS,
+.length = sizeof(*dslbis),
+},
+.handle = dsmad_handle,
+.flags = HMAT_LB_MEM_MEMORY,
+.data_type = HMAT_LB_DATA_WRITE_LATENCY,
+.entry_base_unit = 1,
+.entry[0] = 25, /* 250ns */
+};
+len++;
+
+dslbis[2] = (CDATDslbis) {
+.header = {
+.type = CDAT_TYPE_DSLBIS,
+.length = sizeof(*dslbis),
+},
+.handle = dsmad_handle,
+.flags = HMAT_LB_MEM_MEMORY,
+.data_type = HMAT_LB_DATA_READ_BANDWIDTH,
+.entry_base_unit = 1000, /* GB/s */
+.entry[0] = 16,
+};
+len++;
+
+dslbis[3] = (CDATDslbis) {
+.header = {
+.type = CDAT_TYPE_DSLBIS,
+.length = sizeof(*dslbis),
+},
+.handle = dsmad_handle,
+.flags = HMAT_LB_MEM_MEMORY,
+.data_type = HMAT_LB_DATA_WRITE_BANDWIDTH,
+.entry_base_unit = 1000, /* GB/s */
+.entry[0] = 16,
+};
+len++;
+
+*dsemts = (CDATDsemts) {
+.header = {
+.type = CDAT_TYPE_DSEMTS,
+.length = sizeof(*dsemts),
+},
+.DSMAS_handle = dsmad_handle,
+/* EFI_MEMORY_NV implies EfiReservedMemoryType */
+.EFI_memory_type_attr = is_pmem ? 2 : 0,
+/* Reserved - the non volatile from DSMAS matters */
+.DPA_offset = 0,
+.DPA_length = int128_get64(mr->size),
+};
+len++;
+return len;
+}
+
 static int ct3_build_cdat_table(CDATSubHeader ***cdat_table,
 void *priv)
 {
-g_autofree CDATDsmas *dsmas_nonvolatile = NULL;
-g_autofree CDATDslbis *dslbis_nonvolatile = NULL;
-g_autofree CDATDsemts *dsemts_nonvolatile = NULL;
+g_autofree CDATDsmas *dsmas = NULL;
+g_autofree CDATDslbis *dslbis = NULL;
+g_autofree CDATDsemts *dsemts = NULL;
 CXLType3Dev *ct3d = priv;
-int len = 0;
-int i = 0;
-int next_dsmad_handle = 0;
-int nonvolatile_dsmad = -1;
-int dslbis_nonvolatile_num = 4;
+int cdat_len = 0;
+int cdat_idx = 0, sub_idx = 0;
+int dsmas_num, dslbis_num, dsemts_num;
+int dsmad_handle = 0;
+uint64_t dpa_base = 0;
 MemoryRegion *mr;
 
-/* Non volatile aspects */
-if (ct3d->hostmem) {
-dsmas_nonvolatile = g_malloc(sizeof(*dsmas_nonvolatile));
-if (!dsmas_nonvolatile) {
-return -ENOMEM;
-}
-nonvolatile_dsmad = next_dsmad_handle++;
-mr = host_memory_backend_get_memory(ct3d->hostmem);
-if (!mr) {
-return -EINVAL;
-}
-*dsmas_nonvolatile = (CDATDsmas) {
-.header = {
-.type = CDAT_TYPE_DSMAS,
-.length = sizeof(*dsmas_nonvolatile),
-},
-.DSMADhandle = nonvolatile_dsmad,
-.flags = CDAT_DSMAS_FLAG_NV,
-.DPA_base = 0,
-.DPA_length = int128_get64(mr->size),
-};
-len++;
-
-/* For now, no memory side cache, plausiblish numbers */
-dslbis_nonvolatile = 

[PATCH v3 28/29] target/ppc: unify cpu->has_work based on cs->interrupt_request

2022-10-11 Thread Matheus Ferst
Now that cs->interrupt_request indicates if there is any unmasked
interrupt, checking if the CPU has work to do can be simplified to a
single check that works for all CPU models.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c | 94 +--
 1 file changed, 1 insertion(+), 93 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 0adc866485..15d549ad38 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5984,27 +5984,10 @@ int p7_interrupt_powersave(CPUPPCState *env)
 return 0;
 }
 
-static bool cpu_has_work_POWER7(CPUState *cs)
-{
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = >env;
-
-if (cs->halted) {
-if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
-return false;
-}
-return p7_interrupt_powersave(env) != 0;
-} else {
-return FIELD_EX64(env->msr, MSR, EE) &&
-   (cs->interrupt_request & CPU_INTERRUPT_HARD);
-}
-}
-
 POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
 PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
-CPUClass *cc = CPU_CLASS(oc);
 
 dc->fw_name = "PowerPC,POWER7";
 dc->desc = "POWER7";
@@ -6013,7 +5996,6 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 pcc->pcr_supported = PCR_COMPAT_2_06 | PCR_COMPAT_2_05;
 pcc->init_proc = init_proc_POWER7;
 pcc->check_pow = check_pow_nocheck;
-cc->has_work = cpu_has_work_POWER7;
 pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | PPC_STRING | PPC_MFTB |
PPC_FLOAT | PPC_FLOAT_FSEL | PPC_FLOAT_FRES |
PPC_FLOAT_FSQRT | PPC_FLOAT_FRSQRTE |
@@ -6170,27 +6152,10 @@ int p8_interrupt_powersave(CPUPPCState *env)
 return 0;
 }
 
-static bool cpu_has_work_POWER8(CPUState *cs)
-{
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = >env;
-
-if (cs->halted) {
-if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
-return false;
-}
-return p8_interrupt_powersave(env) != 0;
-} else {
-return FIELD_EX64(env->msr, MSR, EE) &&
-   (cs->interrupt_request & CPU_INTERRUPT_HARD);
-}
-}
-
 POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
 PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
-CPUClass *cc = CPU_CLASS(oc);
 
 dc->fw_name = "PowerPC,POWER8";
 dc->desc = "POWER8";
@@ -6199,7 +6164,6 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
 pcc->pcr_supported = PCR_COMPAT_2_07 | PCR_COMPAT_2_06 | PCR_COMPAT_2_05;
 pcc->init_proc = init_proc_POWER8;
 pcc->check_pow = check_pow_nocheck;
-cc->has_work = cpu_has_work_POWER8;
 pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | PPC_STRING | PPC_MFTB |
PPC_FLOAT | PPC_FLOAT_FSEL | PPC_FLOAT_FRES |
PPC_FLOAT_FSQRT | PPC_FLOAT_FRSQRTE |
@@ -6407,35 +6371,10 @@ int p9_interrupt_powersave(CPUPPCState *env)
 return 0;
 }
 
-static bool cpu_has_work_POWER9(CPUState *cs)
-{
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = >env;
-
-if (cs->halted) {
-uint64_t psscr = env->spr[SPR_PSSCR];
-
-if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
-return false;
-}
-
-/* If EC is clear, just return true on any pending interrupt */
-if (!(psscr & PSSCR_EC)) {
-return true;
-}
-
-return p9_interrupt_powersave(env) != 0;
-} else {
-return FIELD_EX64(env->msr, MSR, EE) &&
-   (cs->interrupt_request & CPU_INTERRUPT_HARD);
-}
-}
-
 POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
 PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
-CPUClass *cc = CPU_CLASS(oc);
 
 dc->fw_name = "PowerPC,POWER9";
 dc->desc = "POWER9";
@@ -6445,7 +6384,6 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
  PCR_COMPAT_2_05;
 pcc->init_proc = init_proc_POWER9;
 pcc->check_pow = check_pow_nocheck;
-cc->has_work = cpu_has_work_POWER9;
 pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | PPC_STRING | PPC_MFTB |
PPC_FLOAT | PPC_FLOAT_FSEL | PPC_FLOAT_FRES |
PPC_FLOAT_FSQRT | PPC_FLOAT_FRSQRTE |
@@ -6604,35 +6542,10 @@ static bool ppc_pvr_match_power10(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return false;
 }
 
-static bool cpu_has_work_POWER10(CPUState *cs)
-{
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = >env;
-
-if (cs->halted) {
-uint64_t psscr = env->spr[SPR_PSSCR];
-
-if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
-return false;
-}
-
-/* If EC is clear, just return true on any pending interrupt */
-if (!(psscr & PSSCR_EC)) {
-return true;
-}
-
-   

[PATCH v3 26/29] target/ppc: remove ppc_store_lpcr from CONFIG_USER_ONLY builds

2022-10-11 Thread Matheus Ferst
Writes to LPCR are hypervisor privileged.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu.c | 2 ++
 target/ppc/cpu.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index 0ebac04bc4..e95b4c5ee1 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -73,6 +73,7 @@ void ppc_store_msr(CPUPPCState *env, target_ulong value)
 hreg_store_msr(env, value, 0);
 }
 
+#if !defined(CONFIG_USER_ONLY)
 void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val)
 {
 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
@@ -82,6 +83,7 @@ void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val)
 /* The gtse bit affects hflags */
 hreg_compute_hflags(env);
 }
+#endif
 
 static inline void fpscr_set_rounding_mode(CPUPPCState *env)
 {
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 2433756973..ad758b00e5 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1370,9 +1370,9 @@ void ppc_translate_init(void);
 
 #if !defined(CONFIG_USER_ONLY)
 void ppc_store_sdr1(CPUPPCState *env, target_ulong value);
+void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val);
 #endif /* !defined(CONFIG_USER_ONLY) */
 void ppc_store_msr(CPUPPCState *env, target_ulong value);
-void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val);
 
 void ppc_cpu_list(void);
 
-- 
2.25.1




[PATCH 1/5] hw/cxl: set cxl-type3 device type to PCI_CLASS_MEMORY_CXL

2022-10-11 Thread Gregory Price
Current code sets to STORAGE_EXPRESS and then overrides it.

Signed-off-by: Gregory Price 
Reviewed-by: Davidlohr Bueso 
Reviewed-by: Jonathan Cameron 
---
 hw/mem/cxl_type3.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 3e7ca7a455..282f274266 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -535,7 +535,6 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 }
 
 pci_config_set_prog_interface(pci_conf, 0x10);
-pci_config_set_class(pci_conf, PCI_CLASS_MEMORY_CXL);
 
 pcie_endpoint_cap_init(pci_dev, 0x80);
 if (ct3d->sn != UI64_NULL) {
@@ -763,7 +762,7 @@ static void ct3_class_init(ObjectClass *oc, void *data)
 pc->config_read = ct3d_config_read;
 pc->realize = ct3_realize;
 pc->exit = ct3_exit;
-pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
+pc->class_id = PCI_CLASS_MEMORY_CXL;
 pc->vendor_id = PCI_VENDOR_ID_INTEL;
 pc->device_id = 0xd93; /* LVF for now */
 pc->revision = 1;
-- 
2.37.3




[PATCH v3 19/29] target/ppc: create an interrupt masking method for POWER7

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_next_unmasked_interrupt_generic,
processor-specific code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 114 +++
 1 file changed, 114 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 18a16bf316..534c0f8f5c 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1679,6 +1679,118 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 }
 
 #if defined(TARGET_PPC64)
+static int p7_next_unmasked_interrupt(CPUPPCState *env)
+{
+bool async_deliver;
+
+/* External reset */
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+/* Machine check exception */
+if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
+return PPC_INTERRUPT_MCK;
+}
+#if 0 /* TODO */
+/* External debug exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
+return PPC_INTERRUPT_DEBUG;
+}
+#endif
+
+/*
+ * For interrupts that gate on MSR:EE, we need to do something a
+ * bit more subtle, as we need to let them through even when EE is
+ * clear when coming out of some power management states (in order
+ * for them to become a 0x100).
+ */
+async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
+
+/* Hypervisor decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
+/* LPCR will be clear when not supported so this will work */
+bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+/* HDEC clears on delivery */
+return PPC_INTERRUPT_HDECR;
+}
+}
+
+/* Hypervisor virtualization interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
+/* LPCR will be clear when not supported so this will work */
+bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
+return PPC_INTERRUPT_HVIRT;
+}
+}
+
+/* External interrupt can ignore MSR:EE under some circumstances */
+if (env->pending_interrupts & PPC_INTERRUPT_EXT) {
+bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
+bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
+/* HEIC blocks delivery to the hypervisor */
+if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+!FIELD_EX64(env->msr, MSR, PR))) ||
+(env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
+return PPC_INTERRUPT_EXT;
+}
+}
+if (FIELD_EX64(env->msr, MSR, CE)) {
+/* External critical interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
+return PPC_INTERRUPT_CEXT;
+}
+}
+if (async_deliver != 0) {
+/* Watchdog timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
+return PPC_INTERRUPT_WDT;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
+return PPC_INTERRUPT_CDOORBELL;
+}
+/* Fixed interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
+return PPC_INTERRUPT_FIT;
+}
+/* Programmable interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
+return PPC_INTERRUPT_PIT;
+}
+/* Decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
+return PPC_INTERRUPT_DECR;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_DOORBELL) {
+return PPC_INTERRUPT_DOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) {
+return PPC_INTERRUPT_HDOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
+return PPC_INTERRUPT_PERFM;
+}
+/* Thermal interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
+return PPC_INTERRUPT_THERM;
+}
+/* EBB exception */
+if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
+/*
+ * EBB exception must be taken in problem state and
+ * with BESCR_GE set.
+ */
+if (FIELD_EX64(env->msr, MSR, PR) &&
+(env->spr[SPR_BESCR] & BESCR_GE)) {
+return PPC_INTERRUPT_EBB;
+}
+}
+}
+
+return 0;
+}
+
 #define P8_UNUSED_INTERRUPTS \
 (PPC_INTERRUPT_RESET | PPC_INTERRUPT_DEBUG | PPC_INTERRUPT_HVIRT |  \
 PPC_INTERRUPT_CEXT | PPC_INTERRUPT_WDT | PPC_INTERRUPT_CDOORBELL |  \
@@ -1966,6 +2078,8 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 {
 switch (env->excp_model) {
 #if defined(TARGET_PPC64)
+case 

[PATCH v3 27/29] target/ppc: introduce ppc_maybe_interrupt

2022-10-11 Thread Matheus Ferst
This new method will check if any pending interrupt was unmasked and
then call cpu_interrupt/cpu_reset_interrupt accordingly. Code that
raises/lowers or masks/unmasks interrupts should call this method to
keep CPU_INTERRUPT_HARD coherent with env->pending_interrupts.

Signed-off-by: Matheus Ferst 
---
v3:
 - Comment about when the method should be used.
---
 hw/ppc/pnv_core.c|  1 +
 hw/ppc/ppc.c |  7 +--
 hw/ppc/spapr_hcall.c |  6 ++
 hw/ppc/spapr_rtas.c  |  2 +-
 target/ppc/cpu.c |  2 ++
 target/ppc/cpu.h |  1 +
 target/ppc/excp_helper.c | 42 
 target/ppc/helper.h  |  1 +
 target/ppc/helper_regs.c |  2 ++
 target/ppc/translate.c   |  2 ++
 10 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 19e8eb885f..9ee79192dd 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -58,6 +58,7 @@ static void pnv_core_cpu_reset(PnvCore *pc, PowerPCCPU *cpu)
 env->msr |= MSR_HVB; /* Hypervisor mode */
 env->spr[SPR_HRMOR] = pc->hrmor;
 hreg_compute_hflags(env);
+ppc_maybe_interrupt(env);
 
 pcc->intc_reset(pc->chip, cpu);
 }
diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 77e611e81c..dc86c1c7db 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -42,7 +42,6 @@ static void cpu_ppc_tb_start (CPUPPCState *env);
 
 void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
 {
-CPUState *cs = CPU(cpu);
CPUPPCState *env = &cpu->env;
 unsigned int old_pending;
 bool locked = false;
@@ -57,19 +56,15 @@ void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
 
 if (level) {
 env->pending_interrupts |= irq;
-cpu_interrupt(cs, CPU_INTERRUPT_HARD);
 } else {
 env->pending_interrupts &= ~irq;
-if (env->pending_interrupts == 0) {
-cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
-}
 }
 
 if (old_pending != env->pending_interrupts) {
+ppc_maybe_interrupt(env);
 kvmppc_set_interrupt(cpu, irq, level);
 }
 
-
 trace_ppc_irq_set_exit(env, irq, level, env->pending_interrupts,
CPU(cpu)->interrupt_request);
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index a8d4a6bcf0..23aa41c879 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -490,6 +490,7 @@ static target_ulong h_cede(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 
 env->msr |= (1ULL << MSR_EE);
 hreg_compute_hflags(env);
+ppc_maybe_interrupt(env);
 
 if (spapr_cpu->prod) {
 spapr_cpu->prod = false;
@@ -500,6 +501,7 @@ static target_ulong h_cede(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 cs->halted = 1;
 cs->exception_index = EXCP_HLT;
 cs->exit_request = 1;
+ppc_maybe_interrupt(env);
 }
 
 return H_SUCCESS;
@@ -521,6 +523,7 @@ static target_ulong h_confer_self(PowerPCCPU *cpu)
 cs->halted = 1;
 cs->exception_index = EXCP_HALTED;
 cs->exit_request = 1;
+ppc_maybe_interrupt(>env);
 
 return H_SUCCESS;
 }
@@ -633,6 +636,7 @@ static target_ulong h_prod(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 spapr_cpu = spapr_cpu_state(tcpu);
 spapr_cpu->prod = true;
 cs->halted = 0;
+ppc_maybe_interrupt(&tcpu->env);
 qemu_cpu_kick(cs);
 
 return H_SUCCESS;
@@ -1661,6 +1665,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
 spapr_cpu->in_nested = true;
 
 hreg_compute_hflags(env);
+ppc_maybe_interrupt(env);
 tlb_flush(cs);
 env->reserve_addr = -1; /* Reset the reservation */
 
@@ -1802,6 +1807,7 @@ out_restore_l1:
 spapr_cpu->in_nested = false;
 
 hreg_compute_hflags(env);
+ppc_maybe_interrupt(env);
 tlb_flush(cs);
 env->reserve_addr = -1; /* Reset the reservation */
 
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index d58b65e88f..3f664ea02c 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -214,9 +214,9 @@ static void rtas_stop_self(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
  * guest.
  * For the same reason, set PSSCR_EC.
  */
-ppc_store_lpcr(cpu, env->spr[SPR_LPCR] & ~pcc->lpcr_pm);
 env->spr[SPR_PSSCR] |= PSSCR_EC;
 cs->halted = 1;
+ppc_store_lpcr(cpu, env->spr[SPR_LPCR] & ~pcc->lpcr_pm);
 kvmppc_set_reg_ppc_online(cpu, 0);
 qemu_cpu_kick(cs);
 }
diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index e95b4c5ee1..1a97b41c6b 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -82,6 +82,8 @@ void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val)
 env->spr[SPR_LPCR] = val & pcc->lpcr_mask;
 /* The gtse bit affects hflags */
 hreg_compute_hflags(env);
+
+ppc_maybe_interrupt(env);
 }
 #endif
 
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index ad758b00e5..cc2d0305ff 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1358,6 +1358,7 @@ int ppc64_cpu_write_elf64_note(WriteCoreDumpFunction f, 
CPUState *cs,
 int ppc32_cpu_write_elf32_note(WriteCoreDumpFunction 

Re: [PATCH v3] m68k: write bootinfo as rom section and re-randomize on reboot

2022-10-11 Thread Laurent Vivier

Le 11/10/2022 à 16:56, Jason A. Donenfeld a écrit :

On Tue, Oct 11, 2022 at 10:29:45AM +0100, Peter Maydell wrote:

On Tue, 11 Oct 2022 at 09:41, Laurent Vivier  wrote:


Le 03/10/2022 à 13:02, Jason A. Donenfeld a écrit :

Rather than poking directly into RAM, add the bootinfo block as a proper
ROM, so that it's restored when rebooting the system. This way, if the
guest corrupts any of the bootinfo items, but then tries to reboot,
it'll still be restored back to normal as expected.

Then, since the RNG seed needs to be fresh on each boot, regenerate the
RNG seed in the ROM when resetting the CPU.


As it needs to be refreshed, I think it would be better not to use a ROM and to 
regenerate all the
bootinfo data on the reset.


I quite liked the use of a rom blob in this patch -- it gets rid
of a lot of direct stl_phys() calls (which is a semi-deprecated
API because it ignores the possibility of failure).


A ROM is also how other archs do it. I'm good either way though.
Laurent/Peter - can you guys decide something and let me know if I need
a v+1 that avoids the ROM, or if you'll go with this v3 that uses the
ROM? Just make a decision, and I'll follow it.



If Peter likes it, it's ok.

Applied to my m68k-for-7.2 branch

Thanks,
Laurent



[PATCH v3 17/29] target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER8

2022-10-11 Thread Matheus Ferst
Move the interrupt masking logic out of cpu_has_work_POWER8 in a new
method, p8_interrupt_powersave, that only returns an interrupt if it can
wake the processor from power-saving mode.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c | 61 +++
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index efdcf63282..3772f82e51 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6133,6 +6133,38 @@ static bool ppc_pvr_match_power8(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
+static int p8_interrupt_powersave(CPUPPCState *env)
+{
+if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE2)) {
+return PPC_INTERRUPT_EXT;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE3)) {
+return PPC_INTERRUPT_DECR;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
+return PPC_INTERRUPT_MCK;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
+return PPC_INTERRUPT_HMI;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE0)) {
+return PPC_INTERRUPT_DOORBELL;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
+(env->spr[SPR_LPCR] & LPCR_P8_PECE1)) {
+return PPC_INTERRUPT_HDOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+return 0;
+}
+
 static bool cpu_has_work_POWER8(CPUState *cs)
 {
 PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -6142,34 +6174,7 @@ static bool cpu_has_work_POWER8(CPUState *cs)
 if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
 return false;
 }
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE2)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE3)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE0)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE1)) {
-return true;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return true;
-}
-return false;
+return p8_interrupt_powersave(env) != 0;
 } else {
 return FIELD_EX64(env->msr, MSR, EE) &&
(cs->interrupt_request & CPU_INTERRUPT_HARD);
-- 
2.25.1




[PATCH v3 22/29] target/ppc: remove unused interrupts from p7_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER7 interrupt
processing method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970 and POWER5p;
- Hypervisor Virtualization: introduced in Power ISA v3.0;
- Hypervisor Doorbell and Event-Based Branch: introduced in
  Power ISA v2.07;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Doorbell and Critical Doorbell Interrupt: processor does not implement
  the Embedded.Processor Control category;
- Programmable Interval Timer: 40x-only;
- PPC_INTERRUPT_THERM: only raised for 970 and POWER5p;

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 50 
 1 file changed, 50 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 48c1b9f627..055f1de20e 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -2051,10 +2051,6 @@ static void p7_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 CPUState *cs = env_cpu(env);
 
 switch (interrupt) {
-case PPC_INTERRUPT_RESET: /* External reset */
-env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
-powerpc_excp(cpu, POWERPC_EXCP_RESET);
-break;
 case PPC_INTERRUPT_MCK: /* Machine check exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
 powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
@@ -2071,9 +2067,6 @@ static void p7_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
 powerpc_excp(cpu, POWERPC_EXCP_HDECR);
 break;
-case PPC_INTERRUPT_HVIRT: /* Hypervisor virtualization interrupt */
-powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
-break;
 
 case PPC_INTERRUPT_EXT:
 if (books_vhyp_promotes_external_to_hvirt(cpu)) {
@@ -2082,60 +2075,17 @@ static void p7_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
 }
 break;
-case PPC_INTERRUPT_CEXT: /* External critical interrupt */
-powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
-break;
 
-case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
-powerpc_excp(cpu, POWERPC_EXCP_WDT);
-break;
-case PPC_INTERRUPT_CDOORBELL:
-env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
-powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
-break;
-case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
-powerpc_excp(cpu, POWERPC_EXCP_FIT);
-break;
-case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
-env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
-powerpc_excp(cpu, POWERPC_EXCP_PIT);
-break;
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
 if (ppc_decr_clear_on_delivery(env)) {
 env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
 }
 powerpc_excp(cpu, POWERPC_EXCP_DECR);
 break;
-case PPC_INTERRUPT_DOORBELL:
-env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
-if (is_book3s_arch2x(env)) {
-powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
-} else {
-powerpc_excp(cpu, POWERPC_EXCP_DOORI);
-}
-break;
-case PPC_INTERRUPT_HDOORBELL:
-env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
-powerpc_excp(cpu, POWERPC_EXCP_SDOOR_HV);
-break;
 case PPC_INTERRUPT_PERFM:
 env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
 powerpc_excp(cpu, POWERPC_EXCP_PERFM);
 break;
-case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
-env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
-powerpc_excp(cpu, POWERPC_EXCP_THERM);
-break;
-case PPC_INTERRUPT_EBB: /* EBB exception */
-env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
-if (env->spr[SPR_BESCR] & BESCR_PMEO) {
-powerpc_excp(cpu, POWERPC_EXCP_PERFM_EBB);
-} else if (env->spr[SPR_BESCR] & BESCR_EEO) {
-powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL_EBB);
-}
-break;
 case 0:
 /*
  * This is a bug ! It means that has_work took us out of halt without
-- 
2.25.1




[PATCH v3 24/29] target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER7

2022-10-11 Thread Matheus Ferst
Move the interrupt masking logic out of cpu_has_work_POWER7 in a new
method, p7_interrupt_powersave, that only returns an interrupt if it can
wake the processor from power-saving mode.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c | 45 ---
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 4a44ba1733..53a87c379c 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5960,6 +5960,30 @@ static bool ppc_pvr_match_power7(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
+static int p7_interrupt_powersave(CPUPPCState *env)
+{
+if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
+(env->spr[SPR_LPCR] & LPCR_P7_PECE0)) {
+return PPC_INTERRUPT_EXT;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
+(env->spr[SPR_LPCR] & LPCR_P7_PECE1)) {
+return PPC_INTERRUPT_DECR;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
+(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
+return PPC_INTERRUPT_MCK;
+}
+if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
+(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
+return PPC_INTERRUPT_HMI;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+return 0;
+}
+
 static bool cpu_has_work_POWER7(CPUState *cs)
 {
 PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -5969,26 +5993,7 @@ static bool cpu_has_work_POWER7(CPUState *cs)
 if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) {
 return false;
 }
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE0)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE1)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
-return true;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
-return true;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return true;
-}
-return false;
+return p7_interrupt_powersave(env) != 0;
 } else {
 return FIELD_EX64(env->msr, MSR, EE) &&
(cs->interrupt_request & CPU_INTERRUPT_HARD);
-- 
2.25.1




[PATCH v3 16/29] target/ppc: remove generic architecture checks from p8_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 662daad796..aaf1c95087 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -2004,9 +2004,6 @@ static void p8_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 break;
 
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
-if (ppc_decr_clear_on_delivery(env)) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
-}
 powerpc_excp(cpu, POWERPC_EXCP_DECR);
 break;
 case PPC_INTERRUPT_DOORBELL:
-- 
2.25.1




[PATCH v3 29/29] target/ppc: move the p*_interrupt_powersave methods to excp_helper.c

2022-10-11 Thread Matheus Ferst
Move the methods to excp_helper.c and make them static.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c| 102 ---
 target/ppc/excp_helper.c | 102 +++
 target/ppc/internal.h|   6 ---
 3 files changed, 102 insertions(+), 108 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 15d549ad38..6f3539f13a 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5960,30 +5960,6 @@ static bool ppc_pvr_match_power7(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
-int p7_interrupt_powersave(CPUPPCState *env)
-{
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE0)) {
-return PPC_INTERRUPT_EXT;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE1)) {
-return PPC_INTERRUPT_DECR;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
-return PPC_INTERRUPT_MCK;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
-(env->spr[SPR_LPCR] & LPCR_P7_PECE2)) {
-return PPC_INTERRUPT_HMI;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
-return 0;
-}
-
 POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
@@ -6120,38 +6096,6 @@ static bool ppc_pvr_match_power8(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
-int p8_interrupt_powersave(CPUPPCState *env)
-{
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE2)) {
-return PPC_INTERRUPT_EXT;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE3)) {
-return PPC_INTERRUPT_DECR;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_MCK) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
-return PPC_INTERRUPT_MCK;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HMI) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE4)) {
-return PPC_INTERRUPT_HMI;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE0)) {
-return PPC_INTERRUPT_DOORBELL;
-}
-if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_P8_PECE1)) {
-return PPC_INTERRUPT_HDOORBELL;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
-return 0;
-}
-
 POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
@@ -6325,52 +6269,6 @@ static bool ppc_pvr_match_power9(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return false;
 }
 
-int p9_interrupt_powersave(CPUPPCState *env)
-{
-/* External Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_EEE)) {
-bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
-if (!heic || !FIELD_EX64_HV(env->msr) ||
-FIELD_EX64(env->msr, MSR, PR)) {
-return PPC_INTERRUPT_EXT;
-}
-}
-/* Decrementer Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_DEE)) {
-return PPC_INTERRUPT_DECR;
-}
-/* Machine Check or Hypervisor Maintenance Exception */
-if (env->spr[SPR_LPCR] & LPCR_OEE) {
-if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
-return PPC_INTERRUPT_MCK;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_HMI) {
-return PPC_INTERRUPT_HMI;
-}
-}
-/* Privileged Doorbell Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_PDEE)) {
-return PPC_INTERRUPT_DOORBELL;
-}
-/* Hypervisor Doorbell Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_HDEE)) {
-return PPC_INTERRUPT_HDOORBELL;
-}
-/* Hypervisor virtualization exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_HVIRT) &&
-(env->spr[SPR_LPCR] & LPCR_HVEE)) {
-return PPC_INTERRUPT_HVIRT;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
-return 0;
-}
-
 POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 110592d91b..b3f5dad02e 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1686,6 +1686,30 @@ void ppc_cpu_do_interrupt(CPUState *cs)
  PPC_INTERRUPT_PIT | PPC_INTERRUPT_DOORBELL | PPC_INTERRUPT_HDOORBELL | \
  PPC_INTERRUPT_THERM | 

[PATCH v3 21/29] target/ppc: create an interrupt deliver method for POWER7

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_deliver_interrupt, processor-specific
code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 113 +++
 1 file changed, 113 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index a4d5fac37b..48c1b9f627 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -2045,6 +2045,116 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 }
 
 #if defined(TARGET_PPC64)
+static void p7_deliver_interrupt(CPUPPCState *env, int interrupt)
+{
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = env_cpu(env);
+
+switch (interrupt) {
+case PPC_INTERRUPT_RESET: /* External reset */
+env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
+powerpc_excp(cpu, POWERPC_EXCP_RESET);
+break;
+case PPC_INTERRUPT_MCK: /* Machine check exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
+powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
+break;
+#if 0 /* TODO */
+case PPC_INTERRUPT_DEBUG: /* External debug exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
+powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
+break;
+#endif
+
+case PPC_INTERRUPT_HDECR: /* Hypervisor decrementer exception */
+/* HDEC clears on delivery */
+env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
+powerpc_excp(cpu, POWERPC_EXCP_HDECR);
+break;
+case PPC_INTERRUPT_HVIRT: /* Hypervisor virtualization interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+break;
+
+case PPC_INTERRUPT_EXT:
+if (books_vhyp_promotes_external_to_hvirt(cpu)) {
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+}
+break;
+case PPC_INTERRUPT_CEXT: /* External critical interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
+break;
+
+case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
+powerpc_excp(cpu, POWERPC_EXCP_WDT);
+break;
+case PPC_INTERRUPT_CDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
+break;
+case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
+powerpc_excp(cpu, POWERPC_EXCP_FIT);
+break;
+case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
+env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
+powerpc_excp(cpu, POWERPC_EXCP_PIT);
+break;
+case PPC_INTERRUPT_DECR: /* Decrementer exception */
+if (ppc_decr_clear_on_delivery(env)) {
+env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
+}
+powerpc_excp(cpu, POWERPC_EXCP_DECR);
+break;
+case PPC_INTERRUPT_DOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
+if (is_book3s_arch2x(env)) {
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_DOORI);
+}
+break;
+case PPC_INTERRUPT_HDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR_HV);
+break;
+case PPC_INTERRUPT_PERFM:
+env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
+powerpc_excp(cpu, POWERPC_EXCP_PERFM);
+break;
+case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
+env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
+powerpc_excp(cpu, POWERPC_EXCP_THERM);
+break;
+case PPC_INTERRUPT_EBB: /* EBB exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
+if (env->spr[SPR_BESCR] & BESCR_PMEO) {
+powerpc_excp(cpu, POWERPC_EXCP_PERFM_EBB);
+} else if (env->spr[SPR_BESCR] & BESCR_EEO) {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL_EBB);
+}
+break;
+case 0:
+/*
+ * This is a bug ! It means that has_work took us out of halt without
+ * anything to deliver while in a PM state that requires getting
+ * out via a 0x100
+ *
+ * This means we will incorrectly execute past the power management
+ * instruction instead of triggering a reset.
+ *
+ * It generally means a discrepancy between the wakeup conditions in 
the
+ * processor has_work implementation and the logic in this function.
+ */
+assert(!env->resume_as_sreset);
+break;
+default:
+cpu_abort(cs, "Invalid PowerPC interrupt %d. Aborting\n", interrupt);
+}
+}
+
 static void p8_deliver_interrupt(CPUPPCState *env, int interrupt)
 {
 PowerPCCPU *cpu = env_archcpu(env);
@@ -2310,6 +2420,9 @@ static void 

[PATCH v3 23/29] target/ppc: remove generic architecture checks from p7_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 055f1de20e..1c373c1a7c 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -2077,9 +2077,6 @@ static void p7_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 break;
 
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
-if (ppc_decr_clear_on_delivery(env)) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
-}
 powerpc_excp(cpu, POWERPC_EXCP_DECR);
 break;
 case PPC_INTERRUPT_PERFM:
-- 
2.25.1




[PATCH v3 15/29] target/ppc: remove unused interrupts from p8_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER8 interrupt
processing method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970 and POWER5p;
- Debug Interrupt: removed in Power ISA v2.07;
- Hypervisor Virtualization: introduced in Power ISA v3.0;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Critical Doorbell: processor does not implement the
  "Embedded.Processor Control" category;
- Programmable Interval Timer: 40x-only;
- PPC_INTERRUPT_THERM: only raised for 970 and POWER5p;

Signed-off-by: Matheus Ferst 
---
v3:
 - Keep Hypervisor and Privileged Doorbell interrupts, the category for
   processor control instruction became "Embedded.Processor Control" or
   "Server" on Power ISA v2.07, so the interrupts are still necessary.
---
 target/ppc/excp_helper.c | 36 
 1 file changed, 36 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 1d1b26b8d8..662daad796 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1984,29 +1984,16 @@ static void p8_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 CPUState *cs = env_cpu(env);
 
 switch (interrupt) {
-case PPC_INTERRUPT_RESET: /* External reset */
-env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
-powerpc_excp(cpu, POWERPC_EXCP_RESET);
-break;
 case PPC_INTERRUPT_MCK: /* Machine check exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
 powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
 break;
-#if 0 /* TODO */
-case PPC_INTERRUPT_DEBUG: /* External debug exception */
-env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
-powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
-break;
-#endif
 
 case PPC_INTERRUPT_HDECR: /* Hypervisor decrementer exception */
 /* HDEC clears on delivery */
 env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
 powerpc_excp(cpu, POWERPC_EXCP_HDECR);
 break;
-case PPC_INTERRUPT_HVIRT: /* Hypervisor virtualization interrupt */
-powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
-break;
 
 case PPC_INTERRUPT_EXT:
 if (books_vhyp_promotes_external_to_hvirt(cpu)) {
@@ -2015,26 +2002,7 @@ static void p8_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
 }
 break;
-case PPC_INTERRUPT_CEXT: /* External critical interrupt */
-powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
-break;
 
-case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
-powerpc_excp(cpu, POWERPC_EXCP_WDT);
-break;
-case PPC_INTERRUPT_CDOORBELL:
-env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
-powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
-break;
-case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
-powerpc_excp(cpu, POWERPC_EXCP_FIT);
-break;
-case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
-env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
-powerpc_excp(cpu, POWERPC_EXCP_PIT);
-break;
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
 if (ppc_decr_clear_on_delivery(env)) {
 env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
@@ -2057,10 +2025,6 @@ static void p8_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
 powerpc_excp(cpu, POWERPC_EXCP_PERFM);
 break;
-case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
-env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
-powerpc_excp(cpu, POWERPC_EXCP_THERM);
-break;
 case PPC_INTERRUPT_EBB: /* EBB exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
 if (env->spr[SPR_BESCR] & BESCR_PMEO) {
-- 
2.25.1




[PATCH v3 25/29] target/ppc: add power-saving interrupt masking logic to p7_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Export p7_interrupt_powersave and use it in p7_next_unmasked_interrupt.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c|  2 +-
 target/ppc/excp_helper.c | 24 
 target/ppc/internal.h|  1 +
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 53a87c379c..0adc866485 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5960,7 +5960,7 @@ static bool ppc_pvr_match_power7(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
-static int p7_interrupt_powersave(CPUPPCState *env)
+int p7_interrupt_powersave(CPUPPCState *env)
 {
 if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
 (env->spr[SPR_LPCR] & LPCR_P7_PECE0)) {
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 1c373c1a7c..3e8a368d01 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1687,10 +1687,18 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 
 static int p7_next_unmasked_interrupt(CPUPPCState *env)
 {
-bool async_deliver;
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = CPU(cpu);
+/* Ignore MSR[EE] when coming out of some power management states */
+bool msr_ee = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
 
 assert((env->pending_interrupts & P7_UNUSED_INTERRUPTS) == 0);
 
+if (cs->halted) {
+/* LPCR[PECE] controls which interrupts can exit power-saving mode */
+return p7_interrupt_powersave(env);
+}
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
@@ -1702,19 +1710,11 @@ static int p7_next_unmasked_interrupt(CPUPPCState *env)
 }
 #endif
 
-/*
- * For interrupts that gate on MSR:EE, we need to do something a
- * bit more subtle, as we need to let them through even when EE is
- * clear when coming out of some power management states (in order
- * for them to become a 0x100).
- */
-async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
-
 /* Hypervisor decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
 /* LPCR will be clear when not supported so this will work */
 bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+if ((msr_ee || !FIELD_EX64_HV(env->msr)) && hdice) {
 /* HDEC clears on delivery */
 return PPC_INTERRUPT_HDECR;
 }
@@ -1725,13 +1725,13 @@ static int p7_next_unmasked_interrupt(CPUPPCState *env)
 bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
 bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
 /* HEIC blocks delivery to the hypervisor */
-if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+if ((msr_ee && !(heic && FIELD_EX64_HV(env->msr) &&
 !FIELD_EX64(env->msr, MSR, PR))) ||
 (env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
 return PPC_INTERRUPT_EXT;
 }
 }
-if (async_deliver != 0) {
+if (msr_ee != 0) {
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 9069874adb..25827ebf6f 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -309,6 +309,7 @@ static inline int ger_pack_masks(int pmsk, int ymsk, int 
xmsk)
 #if defined(TARGET_PPC64)
 int p9_interrupt_powersave(CPUPPCState *env);
 int p8_interrupt_powersave(CPUPPCState *env);
+int p7_interrupt_powersave(CPUPPCState *env);
 #endif
 
 #endif /* PPC_INTERNAL_H */
-- 
2.25.1




[PATCH v3 20/29] target/ppc: remove unused interrupts from p7_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER7 interrupt masking
method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970 and POWER5p;
- Hypervisor Virtualization: introduced in Power ISA v3.0;
- Hypervisor Doorbell and Event-Based Branch: introduced in
  Power ISA v2.07;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Doorbell and Critical Doorbell Interrupt: processor does not implement
  the Embedded.Processor Control category;
- Programmable Interval Timer: 40x-only;
- PPC_INTERRUPT_THERM: only raised for 970 and POWER5p;

Signed-off-by: Matheus Ferst 
---
v3:
 - Fixed method name in subject.
---
 target/ppc/excp_helper.c | 63 +---
 1 file changed, 8 insertions(+), 55 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 534c0f8f5c..a4d5fac37b 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1679,14 +1679,18 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 }
 
 #if defined(TARGET_PPC64)
+#define P7_UNUSED_INTERRUPTS \
+(PPC_INTERRUPT_RESET | PPC_INTERRUPT_HVIRT | PPC_INTERRUPT_CEXT |   \
+ PPC_INTERRUPT_WDT | PPC_INTERRUPT_CDOORBELL | PPC_INTERRUPT_FIT |  \
+ PPC_INTERRUPT_PIT | PPC_INTERRUPT_DOORBELL | PPC_INTERRUPT_HDOORBELL | \
+ PPC_INTERRUPT_THERM | PPC_INTERRUPT_EBB)
+
 static int p7_next_unmasked_interrupt(CPUPPCState *env)
 {
 bool async_deliver;
 
-/* External reset */
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
+assert((env->pending_interrupts & P7_UNUSED_INTERRUPTS) == 0);
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
@@ -1716,15 +1720,6 @@ static int p7_next_unmasked_interrupt(CPUPPCState *env)
 }
 }
 
-/* Hypervisor virtualization interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
-/* LPCR will be clear when not supported so this will work */
-bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
-return PPC_INTERRUPT_HVIRT;
-}
-}
-
 /* External interrupt can ignore MSR:EE under some circumstances */
 if (env->pending_interrupts & PPC_INTERRUPT_EXT) {
 bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
@@ -1736,56 +1731,14 @@ static int p7_next_unmasked_interrupt(CPUPPCState *env)
 return PPC_INTERRUPT_EXT;
 }
 }
-if (FIELD_EX64(env->msr, MSR, CE)) {
-/* External critical interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
-return PPC_INTERRUPT_CEXT;
-}
-}
 if (async_deliver != 0) {
-/* Watchdog timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
-return PPC_INTERRUPT_WDT;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
-return PPC_INTERRUPT_CDOORBELL;
-}
-/* Fixed interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
-return PPC_INTERRUPT_FIT;
-}
-/* Programmable interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
-return PPC_INTERRUPT_PIT;
-}
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
 }
-if (env->pending_interrupts & PPC_INTERRUPT_DOORBELL) {
-return PPC_INTERRUPT_DOORBELL;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) {
-return PPC_INTERRUPT_HDOORBELL;
-}
 if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
 return PPC_INTERRUPT_PERFM;
 }
-/* Thermal interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
-return PPC_INTERRUPT_THERM;
-}
-/* EBB exception */
-if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
-/*
- * EBB exception must be taken in problem state and
- * with BESCR_GE set.
- */
-if (FIELD_EX64(env->msr, MSR, PR) &&
-(env->spr[SPR_BESCR] & BESCR_GE)) {
-return PPC_INTERRUPT_EBB;
-}
-}
 }
 
 return 0;
-- 
2.25.1




[PATCH v3 13/29] target/ppc: remove unused interrupts from p8_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER8 interrupt masking
method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970, and POWER5p;
- Debug Interrupt: removed in Power ISA v2.07;
- Hypervisor Virtualization: introduced in Power ISA v3.0;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Critical Doorbell: processor does not implement the "Embedded.Processor
  Control" category;
- Programmable Interval Timer: 40x-only;
- PPC_INTERRUPT_THERM: only raised for 970 and POWER5p;

Signed-off-by: Matheus Ferst 
---
v3:
 - Keep Hypervisor and Privileged Doorbell interrupts, the category for
   processor control instruction became "Embedded.Processor Control" or
   "Server" on Power ISA v2.07, so the interrupts are still necessary;
 - Fixed method name in subject.
---
 target/ppc/excp_helper.c | 51 ++--
 1 file changed, 7 insertions(+), 44 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 19d352a1b2..9bdc87aa61 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1679,24 +1679,21 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 }
 
 #if defined(TARGET_PPC64)
+#define P8_UNUSED_INTERRUPTS \
+(PPC_INTERRUPT_RESET | PPC_INTERRUPT_DEBUG | PPC_INTERRUPT_HVIRT |  \
+PPC_INTERRUPT_CEXT | PPC_INTERRUPT_WDT | PPC_INTERRUPT_CDOORBELL |  \
+PPC_INTERRUPT_FIT | PPC_INTERRUPT_PIT | PPC_INTERRUPT_THERM)
+
 static int p8_next_unmasked_interrupt(CPUPPCState *env)
 {
 bool async_deliver;
 
-/* External reset */
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
+assert((env->pending_interrupts & P8_UNUSED_INTERRUPTS) == 0);
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
 }
-#if 0 /* TODO */
-/* External debug exception */
-if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
-return PPC_INTERRUPT_DEBUG;
-}
-#endif
 
 /*
  * For interrupts that gate on MSR:EE, we need to do something a
@@ -1716,15 +1713,6 @@ static int p8_next_unmasked_interrupt(CPUPPCState *env)
 }
 }
 
-/* Hypervisor virtualization interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
-/* LPCR will be clear when not supported so this will work */
-bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
-return PPC_INTERRUPT_HVIRT;
-}
-}
-
 /* External interrupt can ignore MSR:EE under some circumstances */
 if (env->pending_interrupts & PPC_INTERRUPT_EXT) {
 bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
@@ -1736,28 +1724,7 @@ static int p8_next_unmasked_interrupt(CPUPPCState *env)
 return PPC_INTERRUPT_EXT;
 }
 }
-if (FIELD_EX64(env->msr, MSR, CE)) {
-/* External critical interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
-return PPC_INTERRUPT_CEXT;
-}
-}
 if (async_deliver != 0) {
-/* Watchdog timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
-return PPC_INTERRUPT_WDT;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
-return PPC_INTERRUPT_CDOORBELL;
-}
-/* Fixed interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
-return PPC_INTERRUPT_FIT;
-}
-/* Programmable interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
-return PPC_INTERRUPT_PIT;
-}
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
@@ -1771,10 +1738,6 @@ static int p8_next_unmasked_interrupt(CPUPPCState *env)
 if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
 return PPC_INTERRUPT_PERFM;
 }
-/* Thermal interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
-return PPC_INTERRUPT_THERM;
-}
 /* EBB exception */
 if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
 /*
-- 
2.25.1




[PATCH v3 14/29] target/ppc: create an interrupt deliver method for POWER8

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_deliver_interrupt; processor-specific
code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 113 +++
 1 file changed, 113 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 9bdc87aa61..1d1b26b8d8 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1978,6 +1978,116 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 }
 
 #if defined(TARGET_PPC64)
+static void p8_deliver_interrupt(CPUPPCState *env, int interrupt)
+{
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = env_cpu(env);
+
+switch (interrupt) {
+case PPC_INTERRUPT_RESET: /* External reset */
+env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
+powerpc_excp(cpu, POWERPC_EXCP_RESET);
+break;
+case PPC_INTERRUPT_MCK: /* Machine check exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
+powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
+break;
+#if 0 /* TODO */
+case PPC_INTERRUPT_DEBUG: /* External debug exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
+powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
+break;
+#endif
+
+case PPC_INTERRUPT_HDECR: /* Hypervisor decrementer exception */
+/* HDEC clears on delivery */
+env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
+powerpc_excp(cpu, POWERPC_EXCP_HDECR);
+break;
+case PPC_INTERRUPT_HVIRT: /* Hypervisor virtualization interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+break;
+
+case PPC_INTERRUPT_EXT:
+if (books_vhyp_promotes_external_to_hvirt(cpu)) {
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+}
+break;
+case PPC_INTERRUPT_CEXT: /* External critical interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
+break;
+
+case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
+powerpc_excp(cpu, POWERPC_EXCP_WDT);
+break;
+case PPC_INTERRUPT_CDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
+break;
+case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
+powerpc_excp(cpu, POWERPC_EXCP_FIT);
+break;
+case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
+env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
+powerpc_excp(cpu, POWERPC_EXCP_PIT);
+break;
+case PPC_INTERRUPT_DECR: /* Decrementer exception */
+if (ppc_decr_clear_on_delivery(env)) {
+env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
+}
+powerpc_excp(cpu, POWERPC_EXCP_DECR);
+break;
+case PPC_INTERRUPT_DOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
+if (is_book3s_arch2x(env)) {
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_DOORI);
+}
+break;
+case PPC_INTERRUPT_HDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR_HV);
+break;
+case PPC_INTERRUPT_PERFM:
+env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
+powerpc_excp(cpu, POWERPC_EXCP_PERFM);
+break;
+case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
+env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
+powerpc_excp(cpu, POWERPC_EXCP_THERM);
+break;
+case PPC_INTERRUPT_EBB: /* EBB exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
+if (env->spr[SPR_BESCR] & BESCR_PMEO) {
+powerpc_excp(cpu, POWERPC_EXCP_PERFM_EBB);
+} else if (env->spr[SPR_BESCR] & BESCR_EEO) {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL_EBB);
+}
+break;
+case 0:
+/*
+ * This is a bug ! It means that has_work took us out of halt without
+ * anything to deliver while in a PM state that requires getting
+ * out via a 0x100
+ *
+ * This means we will incorrectly execute past the power management
+ * instruction instead of triggering a reset.
+ *
+ * It generally means a discrepancy between the wakeup conditions in 
the
+ * processor has_work implementation and the logic in this function.
+ */
+assert(!env->resume_as_sreset);
+break;
+default:
+cpu_abort(cs, "Invalid PowerPC interrupt %d. Aborting\n", interrupt);
+}
+}
+
 static void p9_deliver_interrupt(CPUPPCState *env, int interrupt)
 {
 PowerPCCPU *cpu = env_archcpu(env);
@@ -2172,6 +2282,9 @@ static void 

[PATCH v3 18/29] target/ppc: add power-saving interrupt masking logic to p8_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Export p8_interrupt_powersave and use it in p8_next_unmasked_interrupt.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c|  2 +-
 target/ppc/excp_helper.c | 24 
 target/ppc/internal.h|  1 +
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 3772f82e51..4a44ba1733 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6133,7 +6133,7 @@ static bool ppc_pvr_match_power8(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return true;
 }
 
-static int p8_interrupt_powersave(CPUPPCState *env)
+int p8_interrupt_powersave(CPUPPCState *env)
 {
 if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
 (env->spr[SPR_LPCR] & LPCR_P8_PECE2)) {
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index aaf1c95087..18a16bf316 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1686,28 +1686,28 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 
 static int p8_next_unmasked_interrupt(CPUPPCState *env)
 {
-bool async_deliver;
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = CPU(cpu);
+/* Ignore MSR[EE] when coming out of some power management states */
+bool msr_ee = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
 
 assert((env->pending_interrupts & P8_UNUSED_INTERRUPTS) == 0);
 
+if (cs->halted) {
+/* LPCR[PECE] controls which interrupts can exit power-saving mode */
+return p8_interrupt_powersave(env);
+}
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
 }
 
-/*
- * For interrupts that gate on MSR:EE, we need to do something a
- * bit more subtle, as we need to let them through even when EE is
- * clear when coming out of some power management states (in order
- * for them to become a 0x100).
- */
-async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
-
 /* Hypervisor decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
 /* LPCR will be clear when not supported so this will work */
 bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+if ((msr_ee || !FIELD_EX64_HV(env->msr)) && hdice) {
 /* HDEC clears on delivery */
 return PPC_INTERRUPT_HDECR;
 }
@@ -1718,13 +1718,13 @@ static int p8_next_unmasked_interrupt(CPUPPCState *env)
 bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
 bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
 /* HEIC blocks delivery to the hypervisor */
-if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+if ((msr_ee && !(heic && FIELD_EX64_HV(env->msr) &&
 !FIELD_EX64(env->msr, MSR, PR))) ||
 (env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
 return PPC_INTERRUPT_EXT;
 }
 }
-if (async_deliver != 0) {
+if (msr_ee != 0) {
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 41e79adfdb..9069874adb 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -308,6 +308,7 @@ static inline int ger_pack_masks(int pmsk, int ymsk, int 
xmsk)
 
 #if defined(TARGET_PPC64)
 int p9_interrupt_powersave(CPUPPCState *env);
+int p8_interrupt_powersave(CPUPPCState *env);
 #endif
 
 #endif /* PPC_INTERNAL_H */
-- 
2.25.1




[PATCH v3 10/29] target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER9

2022-10-11 Thread Matheus Ferst
Move the interrupt masking logic out of cpu_has_work_POWER9 into a new
method, p9_interrupt_powersave, that only returns an interrupt if it can
wake the processor from power-saving mode.

Signed-off-by: Matheus Ferst 
---
 target/ppc/cpu_init.c | 126 +-
 1 file changed, 50 insertions(+), 76 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 304ebdc062..5fce293728 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6351,6 +6351,52 @@ static bool ppc_pvr_match_power9(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return false;
 }
 
+static int p9_interrupt_powersave(CPUPPCState *env)
+{
+/* External Exception */
+if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
+(env->spr[SPR_LPCR] & LPCR_EEE)) {
+bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
+if (!heic || !FIELD_EX64_HV(env->msr) ||
+FIELD_EX64(env->msr, MSR, PR)) {
+return PPC_INTERRUPT_EXT;
+}
+}
+/* Decrementer Exception */
+if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
+(env->spr[SPR_LPCR] & LPCR_DEE)) {
+return PPC_INTERRUPT_DECR;
+}
+/* Machine Check or Hypervisor Maintenance Exception */
+if (env->spr[SPR_LPCR] & LPCR_OEE) {
+if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
+return PPC_INTERRUPT_MCK;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_HMI) {
+return PPC_INTERRUPT_HMI;
+}
+}
+/* Privileged Doorbell Exception */
+if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
+(env->spr[SPR_LPCR] & LPCR_PDEE)) {
+return PPC_INTERRUPT_DOORBELL;
+}
+/* Hypervisor Doorbell Exception */
+if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
+(env->spr[SPR_LPCR] & LPCR_HDEE)) {
+return PPC_INTERRUPT_HDOORBELL;
+}
+/* Hypervisor virtualization exception */
+if ((env->pending_interrupts & PPC_INTERRUPT_HVIRT) &&
+(env->spr[SPR_LPCR] & LPCR_HVEE)) {
+return PPC_INTERRUPT_HVIRT;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+return 0;
+}
+
 static bool cpu_has_work_POWER9(CPUState *cs)
 {
 PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -6367,44 +6413,8 @@ static bool cpu_has_work_POWER9(CPUState *cs)
 if (!(psscr & PSSCR_EC)) {
 return true;
 }
-/* External Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_EEE)) {
-bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
-if (!heic || !FIELD_EX64_HV(env->msr) ||
-FIELD_EX64(env->msr, MSR, PR)) {
-return true;
-}
-}
-/* Decrementer Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_DEE)) {
-return true;
-}
-/* Machine Check or Hypervisor Maintenance Exception */
-if ((env->pending_interrupts & (PPC_INTERRUPT_MCK | PPC_INTERRUPT_HMI))
-&& (env->spr[SPR_LPCR] & LPCR_OEE)) {
-return true;
-}
-/* Privileged Doorbell Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_DOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_PDEE)) {
-return true;
-}
-/* Hypervisor Doorbell Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) &&
-(env->spr[SPR_LPCR] & LPCR_HDEE)) {
-return true;
-}
-/* Hypervisor virtualization exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_HVIRT) &&
-(env->spr[SPR_LPCR] & LPCR_HVEE)) {
-return true;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return true;
-}
-return false;
+
+return p9_interrupt_powersave(env) != 0;
 } else {
 return FIELD_EX64(env->msr, MSR, EE) &&
(cs->interrupt_request & CPU_INTERRUPT_HARD);
@@ -6600,44 +6610,8 @@ static bool cpu_has_work_POWER10(CPUState *cs)
 if (!(psscr & PSSCR_EC)) {
 return true;
 }
-/* External Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
-(env->spr[SPR_LPCR] & LPCR_EEE)) {
-bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
-if (!heic || !FIELD_EX64_HV(env->msr) ||
-FIELD_EX64(env->msr, MSR, PR)) {
-return true;
-}
-}
-/* Decrementer Exception */
-if ((env->pending_interrupts & PPC_INTERRUPT_DECR) &&
-(env->spr[SPR_LPCR] & LPCR_DEE)) {
-return true;
-}
-/* Machine Check or Hypervisor Maintenance Exception */
-if ((env->pending_interrupts & (PPC_INTERRUPT_MCK | PPC_INTERRUPT_HMI))
-

[PATCH v3 12/29] target/ppc: create an interrupt masking method for POWER8

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_next_unmasked_interrupt_generic;
processor-specific code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 114 +++
 1 file changed, 114 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index d103820afa..19d352a1b2 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1679,6 +1679,118 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 }
 
 #if defined(TARGET_PPC64)
+static int p8_next_unmasked_interrupt(CPUPPCState *env)
+{
+bool async_deliver;
+
+/* External reset */
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+/* Machine check exception */
+if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
+return PPC_INTERRUPT_MCK;
+}
+#if 0 /* TODO */
+/* External debug exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
+return PPC_INTERRUPT_DEBUG;
+}
+#endif
+
+/*
+ * For interrupts that gate on MSR:EE, we need to do something a
+ * bit more subtle, as we need to let them through even when EE is
+ * clear when coming out of some power management states (in order
+ * for them to become a 0x100).
+ */
+async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
+
+/* Hypervisor decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
+/* LPCR will be clear when not supported so this will work */
+bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+/* HDEC clears on delivery */
+return PPC_INTERRUPT_HDECR;
+}
+}
+
+/* Hypervisor virtualization interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
+/* LPCR will be clear when not supported so this will work */
+bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
+return PPC_INTERRUPT_HVIRT;
+}
+}
+
+/* External interrupt can ignore MSR:EE under some circumstances */
+if (env->pending_interrupts & PPC_INTERRUPT_EXT) {
+bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
+bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
+/* HEIC blocks delivery to the hypervisor */
+if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+!FIELD_EX64(env->msr, MSR, PR))) ||
+(env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
+return PPC_INTERRUPT_EXT;
+}
+}
+if (FIELD_EX64(env->msr, MSR, CE)) {
+/* External critical interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
+return PPC_INTERRUPT_CEXT;
+}
+}
+if (async_deliver != 0) {
+/* Watchdog timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
+return PPC_INTERRUPT_WDT;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
+return PPC_INTERRUPT_CDOORBELL;
+}
+/* Fixed interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
+return PPC_INTERRUPT_FIT;
+}
+/* Programmable interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
+return PPC_INTERRUPT_PIT;
+}
+/* Decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
+return PPC_INTERRUPT_DECR;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_DOORBELL) {
+return PPC_INTERRUPT_DOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) {
+return PPC_INTERRUPT_HDOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
+return PPC_INTERRUPT_PERFM;
+}
+/* Thermal interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
+return PPC_INTERRUPT_THERM;
+}
+/* EBB exception */
+if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
+/*
+ * EBB exception must be taken in problem state and
+ * with BESCR_GE set.
+ */
+if (FIELD_EX64(env->msr, MSR, PR) &&
+(env->spr[SPR_BESCR] & BESCR_GE)) {
+return PPC_INTERRUPT_EBB;
+}
+}
+}
+
+return 0;
+}
+
 #define P9_UNUSED_INTERRUPTS \
 (PPC_INTERRUPT_RESET | PPC_INTERRUPT_DEBUG | PPC_INTERRUPT_CEXT |   \
  PPC_INTERRUPT_WDT | PPC_INTERRUPT_CDOORBELL | PPC_INTERRUPT_FIT |  \
@@ -1891,6 +2003,8 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 {
 switch (env->excp_model) {
 #if defined(TARGET_PPC64)
+case 

[PATCH v3 11/29] target/ppc: add power-saving interrupt masking logic to p9_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Export p9_interrupt_powersave and use it in p9_next_unmasked_interrupt.

Signed-off-by: Matheus Ferst 
---
Putting the prototype in internal.h for a lack of better place. However,
we will un-export p9_interrupt_powersave in future patches, so it's only
temporary.
---
 target/ppc/cpu_init.c|  2 +-
 target/ppc/excp_helper.c | 46 
 target/ppc/internal.h|  4 
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 5fce293728..efdcf63282 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6351,7 +6351,7 @@ static bool ppc_pvr_match_power9(PowerPCCPUClass *pcc, 
uint32_t pvr, bool best)
 return false;
 }
 
-static int p9_interrupt_powersave(CPUPPCState *env)
+int p9_interrupt_powersave(CPUPPCState *env)
 {
 /* External Exception */
 if ((env->pending_interrupts & PPC_INTERRUPT_EXT) &&
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index fd9745c37e..d103820afa 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1686,28 +1686,39 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 
 static int p9_next_unmasked_interrupt(CPUPPCState *env)
 {
-bool async_deliver;
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = CPU(cpu);
+/* Ignore MSR[EE] when coming out of some power management states */
+bool msr_ee = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
 
 assert((env->pending_interrupts & P9_UNUSED_INTERRUPTS) == 0);
 
+if (cs->halted) {
+if (env->spr[SPR_PSSCR] & PSSCR_EC) {
+/*
+ * When PSSCR[EC] is set, LPCR[PECE] controls which interrupts can
+ * wakeup the processor
+ */
+return p9_interrupt_powersave(env);
+} else {
+/*
+ * When it's clear, any system-caused exception exits power-saving
+ * mode, even the ones that gate on MSR[EE].
+ */
+msr_ee = true;
+}
+}
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
 }
 
-/*
- * For interrupts that gate on MSR:EE, we need to do something a
- * bit more subtle, as we need to let them through even when EE is
- * clear when coming out of some power management states (in order
- * for them to become a 0x100).
- */
-async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
-
 /* Hypervisor decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
 /* LPCR will be clear when not supported so this will work */
 bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+if ((msr_ee || !FIELD_EX64_HV(env->msr)) && hdice) {
 /* HDEC clears on delivery */
 return PPC_INTERRUPT_HDECR;
 }
@@ -1717,7 +1728,7 @@ static int p9_next_unmasked_interrupt(CPUPPCState *env)
 if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
 /* LPCR will be clear when not supported so this will work */
 bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
-if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
+if ((msr_ee || !FIELD_EX64_HV(env->msr)) && hvice) {
 return PPC_INTERRUPT_HVIRT;
 }
 }
@@ -1727,13 +1738,13 @@ static int p9_next_unmasked_interrupt(CPUPPCState *env)
 bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
 bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
 /* HEIC blocks delivery to the hypervisor */
-if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+if ((msr_ee && !(heic && FIELD_EX64_HV(env->msr) &&
 !FIELD_EX64(env->msr, MSR, PR))) ||
 (env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
 return PPC_INTERRUPT_EXT;
 }
 }
-if (async_deliver != 0) {
+if (msr_ee != 0) {
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
@@ -1895,6 +1906,15 @@ static void p9_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 PowerPCCPU *cpu = env_archcpu(env);
 CPUState *cs = env_cpu(env);
 
+if (cs->halted && !(env->spr[SPR_PSSCR] & PSSCR_EC) &&
+!FIELD_EX64(env->msr, MSR, EE)) {
+/*
+ * A pending interrupt took us out of power-saving, but MSR[EE] says
+ * that we should return to NIP+4 instead of delivering it.
+ */
+return;
+}
+
 switch (interrupt) {
 case PPC_INTERRUPT_MCK: /* Machine check exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 337a362205..41e79adfdb 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -306,4 +306,8 @@ static inline int 

[PATCH v3 07/29] target/ppc: create an interrupt deliver method for POWER9/POWER10

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_deliver_interrupt, processor-specific
code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 118 +++
 1 file changed, 118 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index da9c928350..9ebc0a0d31 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1889,6 +1889,118 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 }
 }
 
+#if defined(TARGET_PPC64)
+static void p9_deliver_interrupt(CPUPPCState *env, int interrupt)
+{
+PowerPCCPU *cpu = env_archcpu(env);
+CPUState *cs = env_cpu(env);
+
+switch (interrupt) {
+case PPC_INTERRUPT_RESET: /* External reset */
+env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
+powerpc_excp(cpu, POWERPC_EXCP_RESET);
+break;
+case PPC_INTERRUPT_MCK: /* Machine check exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
+powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
+break;
+#if 0 /* TODO */
+case PPC_INTERRUPT_DEBUG: /* External debug exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
+powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
+break;
+#endif
+
+case PPC_INTERRUPT_HDECR: /* Hypervisor decrementer exception */
+/* HDEC clears on delivery */
+env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
+powerpc_excp(cpu, POWERPC_EXCP_HDECR);
+break;
+case PPC_INTERRUPT_HVIRT: /* Hypervisor virtualization interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+break;
+
+case PPC_INTERRUPT_EXT:
+if (books_vhyp_promotes_external_to_hvirt(cpu)) {
+powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+}
+break;
+case PPC_INTERRUPT_CEXT: /* External critical interrupt */
+powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
+break;
+
+case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
+powerpc_excp(cpu, POWERPC_EXCP_WDT);
+break;
+case PPC_INTERRUPT_CDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
+break;
+case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
+env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
+powerpc_excp(cpu, POWERPC_EXCP_FIT);
+break;
+case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
+env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
+powerpc_excp(cpu, POWERPC_EXCP_PIT);
+break;
+case PPC_INTERRUPT_DECR: /* Decrementer exception */
+if (ppc_decr_clear_on_delivery(env)) {
+env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
+}
+powerpc_excp(cpu, POWERPC_EXCP_DECR);
+break;
+case PPC_INTERRUPT_DOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
+if (is_book3s_arch2x(env)) {
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
+} else {
+powerpc_excp(cpu, POWERPC_EXCP_DOORI);
+}
+break;
+case PPC_INTERRUPT_HDOORBELL:
+env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR_HV);
+break;
+case PPC_INTERRUPT_PERFM:
+env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
+powerpc_excp(cpu, POWERPC_EXCP_PERFM);
+break;
+case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
+env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
+powerpc_excp(cpu, POWERPC_EXCP_THERM);
+break;
+case PPC_INTERRUPT_EBB: /* EBB exception */
+env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
+if (env->spr[SPR_BESCR] & BESCR_PMEO) {
+powerpc_excp(cpu, POWERPC_EXCP_PERFM_EBB);
+} else if (env->spr[SPR_BESCR] & BESCR_EEO) {
+powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL_EBB);
+}
+break;
+case 0:
+/*
+ * This is a bug ! It means that has_work took us out of halt without
+ * anything to deliver while in a PM state that requires getting
+ * out via a 0x100
+ *
+ * This means we will incorrectly execute past the power management
+ * instruction instead of triggering a reset.
+ *
+ * It generally means a discrepancy between the wakeup conditions in 
the
+ * processor has_work implementation and the logic in this function.
+ */
+assert(!env->resume_as_sreset);
+break;
+default:
+cpu_abort(cs, "Invalid PowerPC interrupt %d. Aborting\n", interrupt);
+}
+}
+#endif
+
 static void ppc_deliver_interrupt_generic(CPUPPCState *env, int interrupt)
 {
 PowerPCCPU *cpu = env_archcpu(env);
@@ -2002,6 +2114,12 @@ 

[PATCH v3 09/29] target/ppc: remove generic architecture checks from p9_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index fb946385cc..fd9745c37e 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1919,18 +1919,11 @@ static void p9_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 break;
 
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
-if (ppc_decr_clear_on_delivery(env)) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
-}
 powerpc_excp(cpu, POWERPC_EXCP_DECR);
 break;
 case PPC_INTERRUPT_DOORBELL:
 env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
-if (is_book3s_arch2x(env)) {
-powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
-} else {
-powerpc_excp(cpu, POWERPC_EXCP_DOORI);
-}
+powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
 break;
 case PPC_INTERRUPT_HDOORBELL:
 env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
-- 
2.25.1




[PATCH v3 06/29] target/ppc: remove unused interrupts from p9_next_unmasked_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER9 interrupt masking
method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970 and POWER5p;
- Debug Interrupt: removed in Power ISA v2.07;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Critical Doorbell Interrupt: removed in Power ISA v3.0;
- Programmable Interval Timer: 40x-only.

Signed-off-by: Matheus Ferst 
---
v3:
 - Fixed method name in subject.
---
 target/ppc/excp_helper.c | 42 +++-
 1 file changed, 7 insertions(+), 35 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 834181cdaf..da9c928350 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1679,24 +1679,21 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 }
 
 #if defined(TARGET_PPC64)
+#define P9_UNUSED_INTERRUPTS \
+(PPC_INTERRUPT_RESET | PPC_INTERRUPT_DEBUG | PPC_INTERRUPT_CEXT |   \
+ PPC_INTERRUPT_WDT | PPC_INTERRUPT_CDOORBELL | PPC_INTERRUPT_FIT |  \
+ PPC_INTERRUPT_PIT | PPC_INTERRUPT_THERM)
+
 static int p9_next_unmasked_interrupt(CPUPPCState *env)
 {
 bool async_deliver;
 
-/* External reset */
-if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-return PPC_INTERRUPT_RESET;
-}
+assert((env->pending_interrupts & P9_UNUSED_INTERRUPTS) == 0);
+
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
 return PPC_INTERRUPT_MCK;
 }
-#if 0 /* TODO */
-/* External debug exception */
-if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
-return PPC_INTERRUPT_DEBUG;
-}
-#endif
 
 /*
  * For interrupts that gate on MSR:EE, we need to do something a
@@ -1736,28 +1733,7 @@ static int p9_next_unmasked_interrupt(CPUPPCState *env)
 return PPC_INTERRUPT_EXT;
 }
 }
-if (FIELD_EX64(env->msr, MSR, CE)) {
-/* External critical interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
-return PPC_INTERRUPT_CEXT;
-}
-}
 if (async_deliver != 0) {
-/* Watchdog timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
-return PPC_INTERRUPT_WDT;
-}
-if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
-return PPC_INTERRUPT_CDOORBELL;
-}
-/* Fixed interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
-return PPC_INTERRUPT_FIT;
-}
-/* Programmable interval timer on embedded PowerPC */
-if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
-return PPC_INTERRUPT_PIT;
-}
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
 return PPC_INTERRUPT_DECR;
@@ -1771,10 +1747,6 @@ static int p9_next_unmasked_interrupt(CPUPPCState *env)
 if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
 return PPC_INTERRUPT_PERFM;
 }
-/* Thermal interrupt */
-if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
-return PPC_INTERRUPT_THERM;
-}
 /* EBB exception */
 if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
 /*
-- 
2.25.1




[PATCH v3 04/29] target/ppc: prepare to split interrupt masking and delivery by excp_model

2022-10-11 Thread Matheus Ferst
Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index f92b6c2b18..7d196d1581 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1678,7 +1678,7 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 powerpc_excp(cpu, cs->exception_index);
 }
 
-static int ppc_next_unmasked_interrupt(CPUPPCState *env)
+static int ppc_next_unmasked_interrupt_generic(CPUPPCState *env)
 {
 bool async_deliver;
 
@@ -1790,7 +1790,15 @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 return 0;
 }
 
-static void ppc_deliver_interrupt(CPUPPCState *env, int interrupt)
+static int ppc_next_unmasked_interrupt(CPUPPCState *env)
+{
+switch (env->excp_model) {
+default:
+return ppc_next_unmasked_interrupt_generic(env);
+}
+}
+
+static void ppc_deliver_interrupt_generic(CPUPPCState *env, int interrupt)
 {
 PowerPCCPU *cpu = env_archcpu(env);
 CPUState *cs = env_cpu(env);
@@ -1900,6 +1908,14 @@ static void ppc_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 }
 }
 
+static void ppc_deliver_interrupt(CPUPPCState *env, int interrupt)
+{
+switch (env->excp_model) {
+default:
+ppc_deliver_interrupt_generic(env, interrupt);
+}
+}
+
 void ppc_cpu_do_system_reset(CPUState *cs)
 {
 PowerPCCPU *cpu = POWERPC_CPU(cs);
-- 
2.25.1




[PATCH v3 08/29] target/ppc: remove unused interrupts from p9_deliver_interrupt

2022-10-11 Thread Matheus Ferst
Remove the following unused interrupts from the POWER9 interrupt
processing method:
- PPC_INTERRUPT_RESET: only raised for 6xx, 7xx, 970 and POWER5p;
- Debug Interrupt: removed in Power ISA v2.07;
- Critical Input, Watchdog Timer, and Fixed Interval Timer: only defined
  for embedded CPUs;
- Critical Doorbell Interrupt: removed in Power ISA v3.0;
- Programmable Interval Timer: 40x-only.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 33 -
 1 file changed, 33 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 9ebc0a0d31..fb946385cc 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1896,20 +1896,10 @@ static void p9_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 CPUState *cs = env_cpu(env);
 
 switch (interrupt) {
-case PPC_INTERRUPT_RESET: /* External reset */
-env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
-powerpc_excp(cpu, POWERPC_EXCP_RESET);
-break;
 case PPC_INTERRUPT_MCK: /* Machine check exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
 powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
 break;
-#if 0 /* TODO */
-case PPC_INTERRUPT_DEBUG: /* External debug exception */
-env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
-powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
-break;
-#endif
 
 case PPC_INTERRUPT_HDECR: /* Hypervisor decrementer exception */
 /* HDEC clears on delivery */
@@ -1927,26 +1917,7 @@ static void p9_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
 }
 break;
-case PPC_INTERRUPT_CEXT: /* External critical interrupt */
-powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
-break;
 
-case PPC_INTERRUPT_WDT: /* Watchdog timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
-powerpc_excp(cpu, POWERPC_EXCP_WDT);
-break;
-case PPC_INTERRUPT_CDOORBELL:
-env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
-powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
-break;
-case PPC_INTERRUPT_FIT: /* Fixed interval timer on embedded PowerPC */
-env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
-powerpc_excp(cpu, POWERPC_EXCP_FIT);
-break;
-case PPC_INTERRUPT_PIT: /* Programmable interval timer on embedded PowerPC 
*/
-env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
-powerpc_excp(cpu, POWERPC_EXCP_PIT);
-break;
 case PPC_INTERRUPT_DECR: /* Decrementer exception */
 if (ppc_decr_clear_on_delivery(env)) {
 env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
@@ -1969,10 +1940,6 @@ static void p9_deliver_interrupt(CPUPPCState *env, int 
interrupt)
 env->pending_interrupts &= ~PPC_INTERRUPT_PERFM;
 powerpc_excp(cpu, POWERPC_EXCP_PERFM);
 break;
-case PPC_INTERRUPT_THERM:  /* Thermal interrupt */
-env->pending_interrupts &= ~PPC_INTERRUPT_THERM;
-powerpc_excp(cpu, POWERPC_EXCP_THERM);
-break;
 case PPC_INTERRUPT_EBB: /* EBB exception */
 env->pending_interrupts &= ~PPC_INTERRUPT_EBB;
 if (env->spr[SPR_BESCR] & BESCR_PMEO) {
-- 
2.25.1




[PATCH v3 05/29] target/ppc: create an interrupt masking method for POWER9/POWER10

2022-10-11 Thread Matheus Ferst
The new method is identical to ppc_next_unmasked_interrupt_generic,
processor-specific code will be added/removed in the following patches.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 119 +++
 1 file changed, 119 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 7d196d1581..834181cdaf 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1678,6 +1678,120 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 powerpc_excp(cpu, cs->exception_index);
 }
 
+#if defined(TARGET_PPC64)
+static int p9_next_unmasked_interrupt(CPUPPCState *env)
+{
+bool async_deliver;
+
+/* External reset */
+if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
+return PPC_INTERRUPT_RESET;
+}
+/* Machine check exception */
+if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
+return PPC_INTERRUPT_MCK;
+}
+#if 0 /* TODO */
+/* External debug exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
+return PPC_INTERRUPT_DEBUG;
+}
+#endif
+
+/*
+ * For interrupts that gate on MSR:EE, we need to do something a
+ * bit more subtle, as we need to let them through even when EE is
+ * clear when coming out of some power management states (in order
+ * for them to become a 0x100).
+ */
+async_deliver = FIELD_EX64(env->msr, MSR, EE) || env->resume_as_sreset;
+
+/* Hypervisor decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_HDECR) {
+/* LPCR will be clear when not supported so this will work */
+bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
+/* HDEC clears on delivery */
+return PPC_INTERRUPT_HDECR;
+}
+}
+
+/* Hypervisor virtualization interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_HVIRT) {
+/* LPCR will be clear when not supported so this will work */
+bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
+if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
+return PPC_INTERRUPT_HVIRT;
+}
+}
+
+/* External interrupt can ignore MSR:EE under some circumstances */
+if (env->pending_interrupts & PPC_INTERRUPT_EXT) {
+bool lpes0 = !!(env->spr[SPR_LPCR] & LPCR_LPES0);
+bool heic = !!(env->spr[SPR_LPCR] & LPCR_HEIC);
+/* HEIC blocks delivery to the hypervisor */
+if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
+!FIELD_EX64(env->msr, MSR, PR))) ||
+(env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
+return PPC_INTERRUPT_EXT;
+}
+}
+if (FIELD_EX64(env->msr, MSR, CE)) {
+/* External critical interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
+return PPC_INTERRUPT_CEXT;
+}
+}
+if (async_deliver != 0) {
+/* Watchdog timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
+return PPC_INTERRUPT_WDT;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
+return PPC_INTERRUPT_CDOORBELL;
+}
+/* Fixed interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
+return PPC_INTERRUPT_FIT;
+}
+/* Programmable interval timer on embedded PowerPC */
+if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
+return PPC_INTERRUPT_PIT;
+}
+/* Decrementer exception */
+if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
+return PPC_INTERRUPT_DECR;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_DOORBELL) {
+return PPC_INTERRUPT_DOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_HDOORBELL) {
+return PPC_INTERRUPT_HDOORBELL;
+}
+if (env->pending_interrupts & PPC_INTERRUPT_PERFM) {
+return PPC_INTERRUPT_PERFM;
+}
+/* Thermal interrupt */
+if (env->pending_interrupts & PPC_INTERRUPT_THERM) {
+return PPC_INTERRUPT_THERM;
+}
+/* EBB exception */
+if (env->pending_interrupts & PPC_INTERRUPT_EBB) {
+/*
+ * EBB exception must be taken in problem state and
+ * with BESCR_GE set.
+ */
+if (FIELD_EX64(env->msr, MSR, PR) &&
+(env->spr[SPR_BESCR] & BESCR_GE)) {
+return PPC_INTERRUPT_EBB;
+}
+}
+}
+
+return 0;
+}
+#endif
+
 static int ppc_next_unmasked_interrupt_generic(CPUPPCState *env)
 {
 bool async_deliver;
@@ -1793,6 +1907,11 @@ static int 
ppc_next_unmasked_interrupt_generic(CPUPPCState *env)
 static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 {
 switch (env->excp_model) {
+#if 

[PATCH v3 03/29] target/ppc: split interrupt masking and delivery from ppc_hw_interrupt

2022-10-11 Thread Matheus Ferst
Split ppc_hw_interrupt into an interrupt masking method,
ppc_next_unmasked_interrupt, and an interrupt processing method,
ppc_deliver_interrupt.

Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 207 +--
 1 file changed, 131 insertions(+), 76 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index c3c30c5d1b..f92b6c2b18 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -1678,29 +1678,22 @@ void ppc_cpu_do_interrupt(CPUState *cs)
 powerpc_excp(cpu, cs->exception_index);
 }
 
-static void ppc_hw_interrupt(CPUPPCState *env)
+static int ppc_next_unmasked_interrupt(CPUPPCState *env)
 {
-PowerPCCPU *cpu = env_archcpu(env);
 bool async_deliver;
 
 /* External reset */
 if (env->pending_interrupts & PPC_INTERRUPT_RESET) {
-env->pending_interrupts &= ~PPC_INTERRUPT_RESET;
-powerpc_excp(cpu, POWERPC_EXCP_RESET);
-return;
+return PPC_INTERRUPT_RESET;
 }
 /* Machine check exception */
 if (env->pending_interrupts & PPC_INTERRUPT_MCK) {
-env->pending_interrupts &= ~PPC_INTERRUPT_MCK;
-powerpc_excp(cpu, POWERPC_EXCP_MCHECK);
-return;
+return PPC_INTERRUPT_MCK;
 }
 #if 0 /* TODO */
 /* External debug exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DEBUG) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DEBUG;
-powerpc_excp(cpu, POWERPC_EXCP_DEBUG);
-return;
+return PPC_INTERRUPT_DEBUG;
 }
 #endif
 
@@ -1718,9 +1711,7 @@ static void ppc_hw_interrupt(CPUPPCState *env)
 bool hdice = !!(env->spr[SPR_LPCR] & LPCR_HDICE);
 if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hdice) {
 /* HDEC clears on delivery */
-env->pending_interrupts &= ~PPC_INTERRUPT_HDECR;
-powerpc_excp(cpu, POWERPC_EXCP_HDECR);
-return;
+return PPC_INTERRUPT_HDECR;
 }
 }
 
@@ -1729,8 +1720,7 @@ static void ppc_hw_interrupt(CPUPPCState *env)
 /* LPCR will be clear when not supported so this will work */
 bool hvice = !!(env->spr[SPR_LPCR] & LPCR_HVICE);
 if ((async_deliver || !FIELD_EX64_HV(env->msr)) && hvice) {
-powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
-return;
+return PPC_INTERRUPT_HVIRT;
 }
 }
 
@@ -1742,77 +1732,47 @@ static void ppc_hw_interrupt(CPUPPCState *env)
 if ((async_deliver && !(heic && FIELD_EX64_HV(env->msr) &&
 !FIELD_EX64(env->msr, MSR, PR))) ||
 (env->has_hv_mode && !FIELD_EX64_HV(env->msr) && !lpes0)) {
-if (books_vhyp_promotes_external_to_hvirt(cpu)) {
-powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
-} else {
-powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
-}
-return;
+return PPC_INTERRUPT_EXT;
 }
 }
 if (FIELD_EX64(env->msr, MSR, CE)) {
 /* External critical interrupt */
 if (env->pending_interrupts & PPC_INTERRUPT_CEXT) {
-powerpc_excp(cpu, POWERPC_EXCP_CRITICAL);
-return;
+return PPC_INTERRUPT_CEXT;
 }
 }
 if (async_deliver != 0) {
 /* Watchdog timer on embedded PowerPC */
 if (env->pending_interrupts & PPC_INTERRUPT_WDT) {
-env->pending_interrupts &= ~PPC_INTERRUPT_WDT;
-powerpc_excp(cpu, POWERPC_EXCP_WDT);
-return;
+return PPC_INTERRUPT_WDT;
 }
 if (env->pending_interrupts & PPC_INTERRUPT_CDOORBELL) {
-env->pending_interrupts &= ~PPC_INTERRUPT_CDOORBELL;
-powerpc_excp(cpu, POWERPC_EXCP_DOORCI);
-return;
+return PPC_INTERRUPT_CDOORBELL;
 }
 /* Fixed interval timer on embedded PowerPC */
 if (env->pending_interrupts & PPC_INTERRUPT_FIT) {
-env->pending_interrupts &= ~PPC_INTERRUPT_FIT;
-powerpc_excp(cpu, POWERPC_EXCP_FIT);
-return;
+return PPC_INTERRUPT_FIT;
 }
 /* Programmable interval timer on embedded PowerPC */
 if (env->pending_interrupts & PPC_INTERRUPT_PIT) {
-env->pending_interrupts &= ~PPC_INTERRUPT_PIT;
-powerpc_excp(cpu, POWERPC_EXCP_PIT);
-return;
+return PPC_INTERRUPT_PIT;
 }
 /* Decrementer exception */
 if (env->pending_interrupts & PPC_INTERRUPT_DECR) {
-if (ppc_decr_clear_on_delivery(env)) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DECR;
-}
-powerpc_excp(cpu, POWERPC_EXCP_DECR);
-return;
+return PPC_INTERRUPT_DECR;
 }
 if (env->pending_interrupts & PPC_INTERRUPT_DOORBELL) {
-env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
-if (is_book3s_arch2x(env)) {
-powerpc_excp(cpu, POWERPC_EXCP_SDOOR);
-  

[PATCH v2 8/8] reset: do not re-randomize RNG seed on snapshot load

2022-10-11 Thread Jason A. Donenfeld
Snapshot loading is supposed to be deterministic, so we shouldn't
re-randomize the various seeds used.

Signed-off-by: Jason A. Donenfeld 
---
 hw/arm/boot.c  | 3 ++-
 hw/i386/x86.c  | 2 +-
 hw/mips/boston.c   | 2 +-
 hw/openrisc/boot.c | 2 +-
 hw/riscv/boot.c| 2 +-
 hw/rx/rx-gdbsim.c  | 2 +-
 6 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 6a6f4c92c2..511f7b22b1 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -683,7 +683,8 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info 
*binfo,
  * the DTB is copied again upon reset, even if addr points into RAM.
  */
 rom_add_blob_fixed_as("dtb", fdt, size, addr, as);
-qemu_register_reset(qemu_fdt_randomize_seeds, rom_ptr_for_as(as, addr, 
size));
+qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
+   rom_ptr_for_as(as, addr, size));
 
 g_free(fdt);
 
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 1148f70c03..bd50a064a3 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -,7 +,7 @@ void x86_load_linux(X86MachineState *x86ms,
 setup_data->type = cpu_to_le32(SETUP_RNG_SEED);
 setup_data->len = cpu_to_le32(RNG_SEED_LENGTH);
 qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH);
-qemu_register_reset(reset_rng_seed, setup_data);
+qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data);
 fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, 
NULL,
   setup_data, kernel, kernel_size, true);
 } else {
diff --git a/hw/mips/boston.c b/hw/mips/boston.c
index a560ce0324..cab63f43bf 100644
--- a/hw/mips/boston.c
+++ b/hw/mips/boston.c
@@ -811,7 +811,7 @@ static void boston_mach_init(MachineState *machine)
 /* Calculate real fdt size after filter */
 dt_size = fdt_totalsize(dtb_load_data);
 rom_add_blob_fixed("dtb", dtb_load_data, dt_size, dtb_paddr);
-qemu_register_reset(qemu_fdt_randomize_seeds,
+qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
 rom_ptr(dtb_paddr, dt_size));
 } else {
 /* Try to load file as FIT */
diff --git a/hw/openrisc/boot.c b/hw/openrisc/boot.c
index 8b9f11b6d8..007e80cd5a 100644
--- a/hw/openrisc/boot.c
+++ b/hw/openrisc/boot.c
@@ -112,7 +112,7 @@ uint32_t openrisc_load_fdt(void *fdt, hwaddr load_start,
 
 rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr,
  &address_space_memory);
-qemu_register_reset(qemu_fdt_randomize_seeds,
+qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
rom_ptr_for_as(&address_space_memory, fdt_addr, 
fdtsize));
 
 return fdt_addr;
diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
index aaecf21543..c389edb3cd 100644
--- a/hw/riscv/boot.c
+++ b/hw/riscv/boot.c
@@ -242,7 +242,7 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
mem_size, void *fdt)
 
 rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr,
  &address_space_memory);
-qemu_register_reset(qemu_fdt_randomize_seeds,
+qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
rom_ptr_for_as(&address_space_memory, fdt_addr, 
fdtsize));
 
 return fdt_addr;
diff --git a/hw/rx/rx-gdbsim.c b/hw/rx/rx-gdbsim.c
index 198d048964..47c17026c7 100644
--- a/hw/rx/rx-gdbsim.c
+++ b/hw/rx/rx-gdbsim.c
@@ -149,7 +149,7 @@ static void rx_gdbsim_init(MachineState *machine)
 dtb_offset = ROUND_DOWN(machine->ram_size - dtb_size, 16);
 rom_add_blob_fixed("dtb", dtb, dtb_size,
SDRAM_BASE + dtb_offset);
-qemu_register_reset(qemu_fdt_randomize_seeds,
+qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
 rom_ptr(SDRAM_BASE + dtb_offset, dtb_size));
 /* Set dtb address to R1 */
 RX_CPU(first_cpu)->env.regs[1] = SDRAM_BASE + dtb_offset;
-- 
2.37.3




[PATCH v2 5/8] rx: re-randomize rng-seed on reboot

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Since the FDT is in
the ROM region at this point, we add a hook right after the ROM has been
added, so that we have a pointer to that copy of the FDT.

Cc: Yoshinori Sato 
Signed-off-by: Jason A. Donenfeld 
---
 hw/rx/rx-gdbsim.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/rx/rx-gdbsim.c b/hw/rx/rx-gdbsim.c
index 8ffe1b8035..198d048964 100644
--- a/hw/rx/rx-gdbsim.c
+++ b/hw/rx/rx-gdbsim.c
@@ -25,6 +25,7 @@
 #include "hw/rx/rx62n.h"
 #include "sysemu/qtest.h"
 #include "sysemu/device_tree.h"
+#include "sysemu/reset.h"
 #include "hw/boards.h"
 #include "qom/object.h"
 
@@ -148,6 +149,8 @@ static void rx_gdbsim_init(MachineState *machine)
 dtb_offset = ROUND_DOWN(machine->ram_size - dtb_size, 16);
 rom_add_blob_fixed("dtb", dtb, dtb_size,
SDRAM_BASE + dtb_offset);
+qemu_register_reset(qemu_fdt_randomize_seeds,
+rom_ptr(SDRAM_BASE + dtb_offset, dtb_size));
 /* Set dtb address to R1 */
 RX_CPU(first_cpu)->env.regs[1] = SDRAM_BASE + dtb_offset;
 }
-- 
2.37.3




[PATCH v2 2/8] arm: re-randomize rng-seed on reboot

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Since the FDT is in
the ROM region at this point, we add a hook right after the ROM has been
added, so that we have a pointer to that copy of the FDT.

Cc: Peter Maydell 
Cc: qemu-...@nongnu.org
Signed-off-by: Jason A. Donenfeld 
---
 hw/arm/boot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index ada2717f76..6a6f4c92c2 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -683,6 +683,7 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info 
*binfo,
  * the DTB is copied again upon reset, even if addr points into RAM.
  */
 rom_add_blob_fixed_as("dtb", fdt, size, addr, as);
+qemu_register_reset(qemu_fdt_randomize_seeds, rom_ptr_for_as(as, addr, 
size));
 
 g_free(fdt);
 
-- 
2.37.3




[PATCH v3 01/29] target/ppc: define PPC_INTERRUPT_* values directly

2022-10-11 Thread Matheus Ferst
This enum defines the bit positions in env->pending_interrupts for each
interrupt. However, except for the comparison in kvmppc_set_interrupt,
the values are always used as (1 << PPC_INTERRUPT_*). Define them
directly like that to save some clutter. No functional change intended.

Reviewed-by: David Gibson 
Signed-off-by: Matheus Ferst 
---
 hw/ppc/ppc.c | 10 +++---
 hw/ppc/trace-events  |  2 +-
 target/ppc/cpu.h | 40 +++---
 target/ppc/cpu_init.c| 56 +++---
 target/ppc/excp_helper.c | 74 
 target/ppc/misc_helper.c |  6 ++--
 6 files changed, 94 insertions(+), 94 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 690f448cb9..77e611e81c 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -40,7 +40,7 @@
 static void cpu_ppc_tb_stop (CPUPPCState *env);
 static void cpu_ppc_tb_start (CPUPPCState *env);
 
-void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level)
+void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
 {
 CPUState *cs = CPU(cpu);
 CPUPPCState *env = &cpu->env;
@@ -56,21 +56,21 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level)
 old_pending = env->pending_interrupts;
 
 if (level) {
-env->pending_interrupts |= 1 << n_IRQ;
+env->pending_interrupts |= irq;
 cpu_interrupt(cs, CPU_INTERRUPT_HARD);
 } else {
-env->pending_interrupts &= ~(1 << n_IRQ);
+env->pending_interrupts &= ~irq;
 if (env->pending_interrupts == 0) {
 cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
 }
 }
 
 if (old_pending != env->pending_interrupts) {
-kvmppc_set_interrupt(cpu, n_IRQ, level);
+kvmppc_set_interrupt(cpu, irq, level);
 }
 
 
-trace_ppc_irq_set_exit(env, n_IRQ, level, env->pending_interrupts,
+trace_ppc_irq_set_exit(env, irq, level, env->pending_interrupts,
CPU(cpu)->interrupt_request);
 
 if (locked) {
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index a07d5aca0f..956938ebcd 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -127,7 +127,7 @@ ppc40x_set_tb_clk(uint32_t value) "new frequency %" PRIu32
 ppc40x_timers_init(uint32_t value) "frequency %" PRIu32
 
 ppc_irq_set(void *env, uint32_t pin, uint32_t level) "env [%p] pin %d level %d"
-ppc_irq_set_exit(void *env, uint32_t n_IRQ, uint32_t level, uint32_t pending, 
uint32_t request) "env [%p] n_IRQ %d level %d => pending 0x%08" PRIx32 " req 
0x%08" PRIx32
+ppc_irq_set_exit(void *env, uint32_t irq, uint32_t level, uint32_t pending, 
uint32_t request) "env [%p] irq 0x%05" PRIx32 " level %d => pending 0x%08" 
PRIx32 " req 0x%08" PRIx32
 ppc_irq_set_state(const char *name, uint32_t level) "\"%s\" level %d"
 ppc_irq_reset(const char *name) "%s"
 ppc_irq_cpu(const char *action) "%s"
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index cca6c4e51c..2433756973 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2416,27 +2416,27 @@ enum {
 /* Hardware exceptions definitions */
 enum {
 /* External hardware exception sources */
-PPC_INTERRUPT_RESET = 0,  /* Reset exception  */
-PPC_INTERRUPT_WAKEUP, /* Wakeup exception */
-PPC_INTERRUPT_MCK,/* Machine check exception  */
-PPC_INTERRUPT_EXT,/* External interrupt   */
-PPC_INTERRUPT_SMI,/* System management interrupt  */
-PPC_INTERRUPT_CEXT,   /* Critical external interrupt  */
-PPC_INTERRUPT_DEBUG,  /* External debug exception */
-PPC_INTERRUPT_THERM,  /* Thermal exception*/
+PPC_INTERRUPT_RESET = 0x00001,  /* Reset exception             */
+PPC_INTERRUPT_WAKEUP= 0x00002,  /* Wakeup exception            */
+PPC_INTERRUPT_MCK   = 0x00004,  /* Machine check exception     */
+PPC_INTERRUPT_EXT   = 0x00008,  /* External interrupt          */
+PPC_INTERRUPT_SMI   = 0x00010,  /* System management interrupt */
+PPC_INTERRUPT_CEXT  = 0x00020,  /* Critical external interrupt */
+PPC_INTERRUPT_DEBUG = 0x00040,  /* External debug exception    */
+PPC_INTERRUPT_THERM = 0x00080,  /* Thermal exception           */
 /* Internal hardware exception sources */
-PPC_INTERRUPT_DECR,   /* Decrementer exception*/
-PPC_INTERRUPT_HDECR,  /* Hypervisor decrementer exception */
-PPC_INTERRUPT_PIT,/* Programmable interval timer interrupt */
-PPC_INTERRUPT_FIT,/* Fixed interval timer interrupt   */
-PPC_INTERRUPT_WDT,/* Watchdog timer interrupt */
-PPC_INTERRUPT_CDOORBELL,  /* Critical doorbell interrupt  */
-PPC_INTERRUPT_DOORBELL,   /* Doorbell interrupt   */
-

[PATCH v3 02/29] target/ppc: always use ppc_set_irq to set env->pending_interrupts

2022-10-11 Thread Matheus Ferst
Use ppc_set_irq to raise/clear interrupts to ensure CPU_INTERRUPT_HARD
will be set/reset accordingly.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Matheus Ferst 
---
 target/ppc/excp_helper.c | 17 +++--
 target/ppc/misc_helper.c |  9 ++---
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 3f8ff9bcf3..c3c30c5d1b 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -23,6 +23,7 @@
 #include "exec/exec-all.h"
 #include "internal.h"
 #include "helper_regs.h"
+#include "hw/ppc/ppc.h"
 
 #include "trace.h"
 
@@ -2080,7 +2081,6 @@ void helper_rfebb(CPUPPCState *env, target_ulong s)
 static void do_ebb(CPUPPCState *env, int ebb_excp)
 {
 PowerPCCPU *cpu = env_archcpu(env);
-CPUState *cs = CPU(cpu);
 
 /*
  * FSCR_EBB and FSCR_IC_EBB are the same bits used with
@@ -2098,8 +2098,7 @@ static void do_ebb(CPUPPCState *env, int ebb_excp)
 if (FIELD_EX64(env->msr, MSR, PR)) {
 powerpc_excp(cpu, ebb_excp);
 } else {
-env->pending_interrupts |= PPC_INTERRUPT_EBB;
-cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+ppc_set_irq(cpu, PPC_INTERRUPT_EBB, 1);
 }
 }
 
@@ -2292,7 +2291,7 @@ void helper_msgclr(CPUPPCState *env, target_ulong rb)
 return;
 }
 
-env->pending_interrupts &= ~irq;
+ppc_set_irq(env_archcpu(env), irq, 0);
 }
 
 void helper_msgsnd(target_ulong rb)
@@ -2311,8 +2310,7 @@ void helper_msgsnd(target_ulong rb)
 CPUPPCState *cenv = &cpu->env;
 
 if ((rb & DBELL_BRDCAST) || (cenv->spr[SPR_BOOKE_PIR] == pir)) {
-cenv->pending_interrupts |= irq;
-cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+ppc_set_irq(cpu, irq, 1);
 }
 }
 qemu_mutex_unlock_iothread();
@@ -2336,7 +2334,7 @@ void helper_book3s_msgclr(CPUPPCState *env, target_ulong 
rb)
 return;
 }
 
-env->pending_interrupts &= ~PPC_INTERRUPT_HDOORBELL;
+ppc_set_irq(env_archcpu(env), PPC_INTERRUPT_HDOORBELL, 0);
 }
 
 static void book3s_msgsnd_common(int pir, int irq)
@@ -2350,8 +2348,7 @@ static void book3s_msgsnd_common(int pir, int irq)
 
 /* TODO: broadcast message to all threads of the same  processor */
 if (cenv->spr_cb[SPR_PIR].default_value == pir) {
-cenv->pending_interrupts |= irq;
-cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+ppc_set_irq(cpu, irq, 1);
 }
 }
 qemu_mutex_unlock_iothread();
@@ -2377,7 +2374,7 @@ void helper_book3s_msgclrp(CPUPPCState *env, target_ulong 
rb)
 return;
 }
 
-env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
+ppc_set_irq(env_archcpu(env), PPC_INTERRUPT_DOORBELL, 0);
 }
 
 /*
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index 05e35572bc..a9bc1522e2 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -25,6 +25,7 @@
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "mmu-book3s-v3.h"
+#include "hw/ppc/ppc.h"
 
 #include "helper_regs.h"
 
@@ -173,7 +174,6 @@ target_ulong helper_load_dpdes(CPUPPCState *env)
 void helper_store_dpdes(CPUPPCState *env, target_ulong val)
 {
 PowerPCCPU *cpu = env_archcpu(env);
-CPUState *cs = CPU(cpu);
 
 helper_hfscr_facility_check(env, HFSCR_MSGP, "store DPDES", HFSCR_IC_MSGP);
 
@@ -184,12 +184,7 @@ void helper_store_dpdes(CPUPPCState *env, target_ulong val)
 return;
 }
 
-if (val & 0x1) {
-env->pending_interrupts |= PPC_INTERRUPT_DOORBELL;
-cpu_interrupt(cs, CPU_INTERRUPT_HARD);
-} else {
-env->pending_interrupts &= ~PPC_INTERRUPT_DOORBELL;
-}
+ppc_set_irq(cpu, PPC_INTERRUPT_DOORBELL, val & 0x1);
 }
 #endif /* defined(TARGET_PPC64) */
 
-- 
2.25.1




[PATCH v3 00/29] PowerPC interrupt rework

2022-10-11 Thread Matheus Ferst
Link to v2: https://lists.gnu.org/archive/html/qemu-ppc/2022-09/msg00556.html
This series is also available as a git branch: 
https://github.com/PPC64/qemu/tree/ferst-interrupt-fix-v3
Patches without review: 3-27

This new version rebases the patch series on the current master and
fixes some problems pointed out by Fabiano on v2.

Matheus Ferst (29):
  target/ppc: define PPC_INTERRUPT_* values directly
  target/ppc: always use ppc_set_irq to set env->pending_interrupts
  target/ppc: split interrupt masking and delivery from ppc_hw_interrupt
  target/ppc: prepare to split interrupt masking and delivery by excp_model
  target/ppc: create an interrupt masking method for POWER9/POWER10
  target/ppc: remove unused interrupts from p9_next_unmasked_interrupt
  target/ppc: create an interrupt deliver method for POWER9/POWER10
  target/ppc: remove unused interrupts from p9_deliver_interrupt
  target/ppc: remove generic architecture checks from p9_deliver_interrupt
  target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER9
  target/ppc: add power-saving interrupt masking logic to 
p9_next_unmasked_interrupt
  target/ppc: create an interrupt masking method for POWER8
  target/ppc: remove unused interrupts from p8_next_unmasked_interrupt
  target/ppc: create an interrupt deliver method for POWER8
  target/ppc: remove unused interrupts from p8_deliver_interrupt
  target/ppc: remove generic architecture checks from p8_deliver_interrupt
  target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER8
  target/ppc: add power-saving interrupt masking logic to 
p8_next_unmasked_interrupt
  target/ppc: create an interrupt masking method for POWER7
  target/ppc: remove unused interrupts from p7_next_unmasked_interrupt
  target/ppc: create an interrupt deliver method for POWER7
  target/ppc: remove unused interrupts from p7_deliver_interrupt
  target/ppc: remove generic architecture checks from p7_deliver_interrupt
  target/ppc: move power-saving interrupt masking out of cpu_has_work_POWER7
  target/ppc: add power-saving interrupt masking logic to 
p7_next_unmasked_interrupt
  target/ppc: remove ppc_store_lpcr from CONFIG_USER_ONLY builds
  target/ppc: introduce ppc_maybe_interrupt
  target/ppc: unify cpu->has_work based on cs->interrupt_request
  target/ppc: move the p*_interrupt_powersave methods to excp_helper.c

 hw/ppc/pnv_core.c|   1 +
 hw/ppc/ppc.c |  17 +-
 hw/ppc/spapr_hcall.c |   6 +
 hw/ppc/spapr_rtas.c  |   2 +-
 hw/ppc/trace-events  |   2 +-
 target/ppc/cpu.c |   4 +
 target/ppc/cpu.h |  43 +-
 target/ppc/cpu_init.c| 212 +-
 target/ppc/excp_helper.c | 887 ++-
 target/ppc/helper.h  |   1 +
 target/ppc/helper_regs.c |   2 +
 target/ppc/misc_helper.c |  11 +-
 target/ppc/translate.c   |   2 +
 13 files changed, 833 insertions(+), 357 deletions(-)

-- 
2.25.1




[PATCH v2 6/8] mips: re-randomize rng-seed on reboot

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Since the FDT is in
the ROM region at this point, we add a hook right after the ROM has been
added, so that we have a pointer to that copy of the FDT.

Cc: Aleksandar Rikalo 
Cc: Paul Burton 
Cc: Philippe Mathieu-Daudé 
Signed-off-by: Jason A. Donenfeld 
---
 hw/mips/boston.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/mips/boston.c b/hw/mips/boston.c
index d2ab9da1a0..a560ce0324 100644
--- a/hw/mips/boston.c
+++ b/hw/mips/boston.c
@@ -41,6 +41,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/qtest.h"
 #include "sysemu/runstate.h"
+#include "sysemu/reset.h"
 
 #include <libfdt.h>
 #include "qom/object.h"
@@ -810,6 +811,8 @@ static void boston_mach_init(MachineState *machine)
 /* Calculate real fdt size after filter */
 dt_size = fdt_totalsize(dtb_load_data);
 rom_add_blob_fixed("dtb", dtb_load_data, dt_size, dtb_paddr);
+qemu_register_reset(qemu_fdt_randomize_seeds,
+rom_ptr(dtb_paddr, dt_size));
 } else {
 /* Try to load file as FIT */
 fit_err = load_fit(&boston_fit_loader, machine->kernel_filename, s);
-- 
2.37.3




[PATCH v2 1/8] device-tree: add re-randomization helper function

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Several
architectures require this functionality, so export a function for
injecting a new seed into the given FDT.

Cc: Alistair Francis 
Cc: David Gibson 
Signed-off-by: Jason A. Donenfeld 
---
 include/sysemu/device_tree.h |  9 +
 softmmu/device_tree.c| 21 +
 2 files changed, 30 insertions(+)

diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h
index ef060a9759..d552f324b6 100644
--- a/include/sysemu/device_tree.h
+++ b/include/sysemu/device_tree.h
@@ -196,6 +196,15 @@ int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
 qdt_tmp); \
 })
 
+
+/**
+ * qemu_fdt_randomize_seeds:
+ * @fdt: device tree blob
+ *
+ * Re-randomize all "rng-seed" properties with new seeds.
+ */
+void qemu_fdt_randomize_seeds(void *fdt);
+
 #define FDT_PCI_RANGE_RELOCATABLE  0x8000
 #define FDT_PCI_RANGE_PREFETCHABLE 0x4000
 #define FDT_PCI_RANGE_ALIASED  0x2000
diff --git a/softmmu/device_tree.c b/softmmu/device_tree.c
index 6ca3fad285..d986c7b7b3 100644
--- a/softmmu/device_tree.c
+++ b/softmmu/device_tree.c
@@ -22,6 +22,7 @@
 #include "qemu/option.h"
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
+#include "qemu/guest-random.h"
 #include "sysemu/device_tree.h"
 #include "hw/loader.h"
 #include "hw/boards.h"
@@ -643,3 +644,23 @@ out:
 g_free(propcells);
 return ret;
 }
+
+void qemu_fdt_randomize_seeds(void *fdt)
+{
+int noffset, poffset, len;
+const char *name;
+uint8_t *data;
+
+for (noffset = fdt_next_node(fdt, 0, NULL);
+ noffset >= 0;
+ noffset = fdt_next_node(fdt, noffset, NULL)) {
+for (poffset = fdt_first_property_offset(fdt, noffset);
+ poffset >= 0;
+ poffset = fdt_next_property_offset(fdt, poffset)) {
+data = (uint8_t *)fdt_getprop_by_offset(fdt, poffset, &name, &len);
+if (!data || strcmp(name, "rng-seed"))
+continue;
+qemu_guest_getrandom_nofail(data, len);
+}
+}
+}
-- 
2.37.3




[PATCH v2 3/8] riscv: re-randomize rng-seed on reboot

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Since the FDT is in
the ROM region at this point, we add a hook right after the ROM has been
added, so that we have a pointer to that copy of the FDT.

Cc: Palmer Dabbelt 
Cc: Alistair Francis 
Cc: Bin Meng 
Cc: qemu-ri...@nongnu.org
Signed-off-by: Jason A. Donenfeld 
---
 hw/riscv/boot.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
index 1ae7596873..aaecf21543 100644
--- a/hw/riscv/boot.c
+++ b/hw/riscv/boot.c
@@ -30,6 +30,7 @@
 #include "sysemu/device_tree.h"
 #include "sysemu/qtest.h"
 #include "sysemu/kvm.h"
+#include "sysemu/reset.h"
 
 #include <libfdt.h>
 
@@ -241,6 +242,8 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
mem_size, void *fdt)
 
 rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr,
                       &address_space_memory);
+qemu_register_reset(qemu_fdt_randomize_seeds,
+                    rom_ptr_for_as(&address_space_memory, fdt_addr, fdtsize));
 
 return fdt_addr;
 }
-- 
2.37.3




[PATCH v2 7/8] reset: allow registering handlers that aren't called by snapshot loading

2022-10-11 Thread Jason A. Donenfeld
Snapshot loading only expects to call deterministic handlers, not
non-deterministic ones. So introduce a way of registering handlers that
won't be called when reseting for snapshots.

Signed-off-by: Jason A. Donenfeld 
---
 hw/arm/aspeed.c|  4 ++--
 hw/arm/mps2-tz.c   |  4 ++--
 hw/core/reset.c| 15 ++-
 hw/hppa/machine.c  |  4 ++--
 hw/i386/microvm.c  |  4 ++--
 hw/i386/pc.c   |  6 +++---
 hw/ppc/pegasos2.c  |  4 ++--
 hw/ppc/pnv.c   |  4 ++--
 hw/ppc/spapr.c |  4 ++--
 hw/s390x/s390-virtio-ccw.c |  4 ++--
 include/hw/boards.h|  2 +-
 include/sysemu/reset.h |  5 -
 migration/savevm.c |  2 +-
 qapi/run-state.json|  4 +++-
 softmmu/runstate.c |  4 ++--
 15 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index bc3ecdb619..69cadb1c37 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -1349,12 +1349,12 @@ static void 
aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data)
 aspeed_soc_num_cpus(amc->soc_name);
 }
 
-static void fby35_reset(MachineState *state)
+static void fby35_reset(MachineState *state, ShutdownCause reason)
 {
 AspeedMachineState *bmc = ASPEED_MACHINE(state);
 AspeedGPIOState *gpio = &bmc->soc.gpio;
 
-qemu_devices_reset();
+qemu_devices_reset(reason);
 
 /* Board ID: 7 (Class-1, 4 slots) */
 object_property_set_bool(OBJECT(gpio), "gpioV4", true, _fatal);
diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c
index 394192b9b2..284c09c91d 100644
--- a/hw/arm/mps2-tz.c
+++ b/hw/arm/mps2-tz.c
@@ -1239,7 +1239,7 @@ static void mps2_set_remap(Object *obj, const char 
*value, Error **errp)
 }
 }
 
-static void mps2_machine_reset(MachineState *machine)
+static void mps2_machine_reset(MachineState *machine, ShutdownCause reason)
 {
 MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine);
 
@@ -1249,7 +1249,7 @@ static void mps2_machine_reset(MachineState *machine)
  * reset see the correct mapping.
  */
 remap_memory(mms, mms->remap);
-qemu_devices_reset();
+qemu_devices_reset(reason);
 }
 
 static void mps2tz_class_init(ObjectClass *oc, void *data)
diff --git a/hw/core/reset.c b/hw/core/reset.c
index 36be82c491..bcf323d6dd 100644
--- a/hw/core/reset.c
+++ b/hw/core/reset.c
@@ -33,6 +33,7 @@ typedef struct QEMUResetEntry {
 QTAILQ_ENTRY(QEMUResetEntry) entry;
 QEMUResetHandler *func;
 void *opaque;
+bool skip_on_snapshot_load;
 } QEMUResetEntry;
 
 static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers =
@@ -47,6 +48,16 @@ void qemu_register_reset(QEMUResetHandler *func, void 
*opaque)
 QTAILQ_INSERT_TAIL(_handlers, re, entry);
 }
 
+void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque)
+{
+QEMUResetEntry *re = g_new0(QEMUResetEntry, 1);
+
+re->func = func;
+re->opaque = opaque;
+re->skip_on_snapshot_load = true;
+QTAILQ_INSERT_TAIL(&reset_handlers, re, entry);
+}
+
 void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
 {
 QEMUResetEntry *re;
@@ -60,12 +71,14 @@ void qemu_unregister_reset(QEMUResetHandler *func, void 
*opaque)
 }
 }
 
-void qemu_devices_reset(void)
+void qemu_devices_reset(ShutdownCause reason)
 {
 QEMUResetEntry *re, *nre;
 
 /* reset all devices */
 QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
+if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && 
re->skip_on_snapshot_load)
+continue;
 re->func(re->opaque);
 }
 }
diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c
index e53d5f0fa7..19ea7c2c66 100644
--- a/hw/hppa/machine.c
+++ b/hw/hppa/machine.c
@@ -411,12 +411,12 @@ static void machine_hppa_init(MachineState *machine)
 cpu[0]->env.gr[19] = FW_CFG_IO_BASE;
 }
 
-static void hppa_machine_reset(MachineState *ms)
+static void hppa_machine_reset(MachineState *ms, ShutdownCause reason)
 {
 unsigned int smp_cpus = ms->smp.cpus;
 int i;
 
-qemu_devices_reset();
+qemu_devices_reset(reason);
 
 /* Start all CPUs at the firmware entry point.
  *  Monarch CPU will initialize firmware, secondary CPUs
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 7fe8cce03e..860cfa00f5 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -467,7 +467,7 @@ static void microvm_machine_state_init(MachineState 
*machine)
 microvm_devices_init(mms);
 }
 
-static void microvm_machine_reset(MachineState *machine)
+static void microvm_machine_reset(MachineState *machine, ShutdownCause reason)
 {
 MicrovmMachineState *mms = MICROVM_MACHINE(machine);
 CPUState *cs;
@@ -480,7 +480,7 @@ static void microvm_machine_reset(MachineState *machine)
 mms->kernel_cmdline_fixed = true;
 }
 
-qemu_devices_reset();
+qemu_devices_reset(reason);
 
 CPU_FOREACH(cs) {
 cpu = X86_CPU(cs);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 566accf7e6..66a0245a65 100644
--- a/hw/i386/pc.c
+++ 

[PATCH v2 0/8] rerandomize RNG seeds on reboot and handle record

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng seed that QEMU passes should be
re-randomized, so that the new boot gets a new seed. This series wires
that up for FDT.

Then, since the record subsystem makes use of reset as well, we
add a new reset cause for record, so that we can avoid
re-randomizing in these cases.

Jason A. Donenfeld (8):
  device-tree: add re-randomization helper function
  arm: re-randomize rng-seed on reboot
  riscv: re-randomize rng-seed on reboot
  openrisc: re-randomize rng-seed on reboot
  rx: re-randomize rng-seed on reboot
  mips: re-randomize rng-seed on reboot
  reset: allow registering handlers that aren't called by snapshot
loading
  reset: do not re-randomize RNG seed on snapshot load

 hw/arm/aspeed.c  |  4 ++--
 hw/arm/boot.c|  2 ++
 hw/arm/mps2-tz.c |  4 ++--
 hw/core/reset.c  | 15 ++-
 hw/hppa/machine.c|  4 ++--
 hw/i386/microvm.c|  4 ++--
 hw/i386/pc.c |  6 +++---
 hw/i386/x86.c|  2 +-
 hw/mips/boston.c |  3 +++
 hw/openrisc/boot.c   |  3 +++
 hw/ppc/pegasos2.c|  4 ++--
 hw/ppc/pnv.c |  4 ++--
 hw/ppc/spapr.c   |  4 ++--
 hw/riscv/boot.c  |  3 +++
 hw/rx/rx-gdbsim.c|  3 +++
 hw/s390x/s390-virtio-ccw.c   |  4 ++--
 include/hw/boards.h  |  2 +-
 include/sysemu/device_tree.h |  9 +
 include/sysemu/reset.h   |  5 -
 migration/savevm.c   |  2 +-
 qapi/run-state.json  |  4 +++-
 softmmu/device_tree.c| 21 +
 softmmu/runstate.c   |  4 ++--
 23 files changed, 89 insertions(+), 27 deletions(-)

-- 
2.37.3




Re: [PATCH 1/6] device-tree: add re-randomization helper function

2022-10-11 Thread Jason A. Donenfeld
On Tue, Oct 11, 2022 at 2:06 PM Jason A. Donenfeld  wrote:
>
> On Tue, Oct 11, 2022 at 09:46:01AM +0300, Pavel Dovgalyuk wrote:
> > On 10.10.2022 18:32, Peter Maydell wrote:
> > > On Mon, 10 Oct 2022 at 16:21, Jason A. Donenfeld  wrote:
> > >>
> > >> On Mon, Oct 10, 2022 at 11:54:50AM +0100, Peter Maydell wrote:
> > >>> The error is essentially the record-and-replay subsystem saying "the
> > >>> replay just asked for a random number at point when the recording
> > >>> did not ask for one, and so there's no 'this is what the number was'
> > >>> info in the record".
> > >>>
> > >>> I have had a quick look, and I think the reason for this is that
> > >>> load_snapshot() ("reset the VM state to the snapshot state stored in the
> > >>> disk image or migration stream") does a system reset. The replay
> > >>> process involves a lot of "load state from a snapshot and play
> > >>> forwards from there" operations. It doesn't expect that load_snapshot()
> > >>> would result in something reading random data, but now that we are
> > >>> calling qemu_guest_getrandom() in a reset hook, that happens.
> > >>
> > >> Hmm... so this seems like a bug in the replay code then? Shouldn't that
> > >> reset handler get hit during both passes, so the entry should be in
> > >> each?
> > >
> > > No, because record is just
> > > "reset the system, record all the way to the end stop",
> > > but replay is
> > > "set the system to the point we want to start at by using
> > > load_snapshot, play from there", and depending on the actions
> > > you do in the debugger like reverse-continue we might repeatedly
> > > do "reload that snapshot (implying a system reset) and play from there"
> > > multiple times.
> >
> > The idea of the patches is fdt randomization during reset, right?
> > But reset is used not only for real reboot, but also for restoring the
> > snapshots.
> > In the latter case it is like "just clear the hw registers to simplify
> > the initialization".
> > Therefore no other virtual hardware tried to read external data yet. And
> > random numbers are external to the machine, they come from the outer world.
> >
> > It means that this is completely new reset case and new solution should
> > be found for it.
>
> Do you have any proposals for that?

Okay I've actually read your message like 6 times now and think I may
have come up with something. Initial testing indicates it works well.
I'll send a new series shortly.

Jason



[PATCH v2 4/8] openrisc: re-randomize rng-seed on reboot

2022-10-11 Thread Jason A. Donenfeld
When the system reboots, the rng-seed that the FDT has should be
re-randomized, so that the new boot gets a new seed. Since the FDT is in
the ROM region at this point, we add a hook right after the ROM has been
added, so that we have a pointer to that copy of the FDT.

Cc: Stafford Horne 
Signed-off-by: Jason A. Donenfeld 
---
 hw/openrisc/boot.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/openrisc/boot.c b/hw/openrisc/boot.c
index 128ccbcba2..8b9f11b6d8 100644
--- a/hw/openrisc/boot.c
+++ b/hw/openrisc/boot.c
@@ -14,6 +14,7 @@
 #include "hw/openrisc/boot.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/qtest.h"
+#include "sysemu/reset.h"
 
 #include <libfdt.h>
 
@@ -111,6 +112,8 @@ uint32_t openrisc_load_fdt(void *fdt, hwaddr load_start,
 
 rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr,
                       &address_space_memory);
+qemu_register_reset(qemu_fdt_randomize_seeds,
+                    rom_ptr_for_as(&address_space_memory, fdt_addr, fdtsize));
 
 return fdt_addr;
 }
-- 
2.37.3




Re: [PATCH 1/6] device-tree: add re-randomization helper function

2022-10-11 Thread Jason A. Donenfeld
On Tue, Oct 11, 2022 at 09:46:01AM +0300, Pavel Dovgalyuk wrote:
> On 10.10.2022 18:32, Peter Maydell wrote:
> > On Mon, 10 Oct 2022 at 16:21, Jason A. Donenfeld  wrote:
> >>
> >> On Mon, Oct 10, 2022 at 11:54:50AM +0100, Peter Maydell wrote:
> >>> The error is essentially the record-and-replay subsystem saying "the
> >>> replay just asked for a random number at point when the recording
> >>> did not ask for one, and so there's no 'this is what the number was'
> >>> info in the record".
> >>>
> >>> I have had a quick look, and I think the reason for this is that
> >>> load_snapshot() ("reset the VM state to the snapshot state stored in the
> >>> disk image or migration stream") does a system reset. The replay
> >>> process involves a lot of "load state from a snapshot and play
> >>> forwards from there" operations. It doesn't expect that load_snapshot()
> >>> would result in something reading random data, but now that we are
> >>> calling qemu_guest_getrandom() in a reset hook, that happens.
> >>
> >> Hmm... so this seems like a bug in the replay code then? Shouldn't that
> >> reset handler get hit during both passes, so the entry should be in
> >> each?
> > 
> > No, because record is just
> > "reset the system, record all the way to the end stop",
> > but replay is
> > "set the system to the point we want to start at by using
> > load_snapshot, play from there", and depending on the actions
> > you do in the debugger like reverse-continue we might repeatedly
> > do "reload that snapshot (implying a system reset) and play from there"
> > multiple times.
> 
> The idea of the patches is fdt randomization during reset, right?
> But reset is used not only for real reboot, but also for restoring the 
> snapshots.
> In the latter case it is like "just clear the hw registers to simplify 
> the initialization".
> Therefore no other virtual hardware tried to read external data yet. And 
> random numbers are external to the machine, they come from the outer world.
> 
> It means that this is completely new reset case and new solution should 
> be found for it.

Do you have any proposals for that?

Jason



[PATCH v11 17/17] net: stream: add QAPI events to report connection state

2022-10-11 Thread Laurent Vivier
The netdev reports NETDEV_STREAM_CONNECTED event when the backend
is connected, and NETDEV_STREAM_EOC when it is disconnected.

The NETDEV_STREAM_CONNECTED event includes the URI of the destination
address.

Signed-off-by: Laurent Vivier 
Acked-by: Michael S. Tsirkin 
---
 net/stream.c  | 11 +--
 qapi/net.json | 46 ++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/net/stream.c b/net/stream.c
index 0293e38e5b57..821ae3265356 100644
--- a/net/stream.c
+++ b/net/stream.c
@@ -38,6 +38,7 @@
 #include "io/channel.h"
 #include "io/channel-socket.h"
 #include "io/net-listener.h"
+#include "qapi/qapi-events-net.h"
 
 typedef struct NetStreamState {
 NetClientState nc;
@@ -168,6 +169,8 @@ static gboolean net_stream_send(QIOChannel *ioc,
 s->nc.link_down = true;
 qemu_set_info_str(&s->nc, "");
 
+qapi_event_send_netdev_stream_eoc(s->nc.name);
+
 return G_SOURCE_REMOVE;
 }
 buf = buf1;
@@ -243,9 +246,10 @@ static void net_stream_listen(QIONetListener *listener,
 g_assert(addr != NULL);
 uri = socket_uri(addr);
 qemu_set_info_str(&s->nc, uri);
-g_free(uri);
 qapi_free_SocketAddress(addr);
 
+qapi_event_send_netdev_stream_connected(s->nc.name, uri);
+g_free(uri);
 }
 
 static void net_stream_server_listening(QIOTask *task, gpointer opaque)
@@ -317,12 +321,12 @@ static void net_stream_client_connected(QIOTask *task, 
gpointer opaque)
 g_assert(addr != NULL);
 uri = socket_uri(addr);
 qemu_set_info_str(&s->nc, uri);
-g_free(uri);
 
 ret = qemu_socket_try_set_nonblock(sioc->fd);
 if (addr->type == SOCKET_ADDRESS_TYPE_FD && ret < 0) {
 qemu_set_info_str(&s->nc, "can't use file descriptor %s (errno %d)",
   addr->u.fd.str, -ret);
+g_free(uri);
 qapi_free_SocketAddress(addr);
 goto error;
 }
@@ -338,6 +342,9 @@ static void net_stream_client_connected(QIOTask *task, 
gpointer opaque)
 s, NULL);
 s->nc.link_down = false;
 
+qapi_event_send_netdev_stream_connected(s->nc.name, uri);
+g_free(uri);
+
 return;
 error:
 object_unref(OBJECT(s->ioc));
diff --git a/qapi/net.json b/qapi/net.json
index 6a1a49749294..69f83bceff3f 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -895,3 +895,49 @@
 ##
 { 'event': 'FAILOVER_NEGOTIATED',
   'data': {'device-id': 'str'} }
+
+##
+# @NETDEV_STREAM_CONNECTED:
+#
+# Emitted when the netdev stream backend is connected
+#
+# @netdev-id: QEMU netdev id that is connected
+# @uri: The Uniform Resource Identifier identifying the destination address
+#
+# Since: 7.2
+#
+# Example:
+#
+# <- { 'event': 'NETDEV_STREAM_CONNECTED',
+#  'data': {'uri': 'tcp:::1:1234', 'netdev-id': 'netdev0'},
+#  'timestamp': {'seconds': 1663330564, 'microseconds': 804317} }
+#
+# or
+#
+# <- { 'event': 'NETDEV_STREAM_CONNECTED',
+#  'data': {'uri': 'unix:/tmp/qemu0', 'netdev-id': 'netdev0'},
+#  'timestamp': {'seconds': 1663330564, 'microseconds': 804317} }
+#
+##
+{ 'event': 'NETDEV_STREAM_CONNECTED',
+  'data': { 'netdev-id': 'str',
+'uri': 'str' } }
+
+##
+# @NETDEV_STREAM_EOC:
+#
+# Emitted when the netdev stream backend is disconnected
+#
+# @netdev-id: QEMU netdev id that is disconnected
+#
+# Since: 7.2
+#
+# Example:
+#
+# <- { 'event': 'NETDEV_STREAM_EOC',
+#  'data': {'netdev-id': 'netdev0'},
+#  'timestamp': {'seconds': 1663330937, 'microseconds': 526695} }
+#
+##
+{ 'event': 'NETDEV_STREAM_EOC',
+  'data': { 'netdev-id': 'str' } }
-- 
2.37.3




[PATCH v11 15/17] net: stream: move to QIO to enable additional parameters

2022-10-11 Thread Laurent Vivier
Use QIOChannel, QIOChannelSocket and QIONetListener.
This allows net/stream to use all the available parameters provided by
SocketAddress.

Signed-off-by: Laurent Vivier 
Acked-by: Michael S. Tsirkin 
---
 meson   |   2 +-
 net/stream.c| 493 +---
 qemu-options.hx |   4 +-
 3 files changed, 180 insertions(+), 319 deletions(-)

diff --git a/meson b/meson
index 3a9b285a55b9..12f9f04ba0de 16
--- a/meson
+++ b/meson
@@ -1 +1 @@
-Subproject commit 3a9b285a55b91b53b2acda987192274352ecb5be
+Subproject commit 12f9f04ba0decfda425dbbf9a501084c153a2d18
diff --git a/net/stream.c b/net/stream.c
index d26c79d035fc..0293e38e5b57 100644
--- a/net/stream.c
+++ b/net/stream.c
@@ -35,48 +35,36 @@
 #include "qemu/iov.h"
 #include "qemu/main-loop.h"
 #include "qemu/cutils.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "io/net-listener.h"
 
 typedef struct NetStreamState {
 NetClientState nc;
-int listen_fd;
-int fd;
+QIOChannel *listen_ioc;
+QIONetListener *listener;
+QIOChannel *ioc;
+guint ioc_read_tag;
+guint ioc_write_tag;
 SocketReadState rs;
 unsigned int send_index;  /* number of bytes sent*/
-bool read_poll;   /* waiting to receive data? */
-bool write_poll;  /* waiting to transmit data? */
 } NetStreamState;
 
-static void net_stream_send(void *opaque);
-static void net_stream_accept(void *opaque);
-static void net_stream_writable(void *opaque);
+static void net_stream_listen(QIONetListener *listener,
+  QIOChannelSocket *cioc,
+  void *opaque);
 
-static void net_stream_update_fd_handler(NetStreamState *s)
+static gboolean net_stream_writable(QIOChannel *ioc,
+GIOCondition condition,
+gpointer data)
 {
-qemu_set_fd_handler(s->fd,
-s->read_poll ? net_stream_send : NULL,
-s->write_poll ? net_stream_writable : NULL,
-s);
-}
-
-static void net_stream_read_poll(NetStreamState *s, bool enable)
-{
-s->read_poll = enable;
-net_stream_update_fd_handler(s);
-}
-
-static void net_stream_write_poll(NetStreamState *s, bool enable)
-{
-s->write_poll = enable;
-net_stream_update_fd_handler(s);
-}
-
-static void net_stream_writable(void *opaque)
-{
-NetStreamState *s = opaque;
+NetStreamState *s = data;
 
-net_stream_write_poll(s, false);
+s->ioc_write_tag = 0;
 
 qemu_flush_queued_packets(&s->nc);
+
+return G_SOURCE_REMOVE;
 }
 
 static ssize_t net_stream_receive(NetClientState *nc, const uint8_t *buf,
@@ -93,13 +81,15 @@ static ssize_t net_stream_receive(NetClientState *nc, const 
uint8_t *buf,
 .iov_len  = size,
 },
 };
+struct iovec local_iov[2];
+unsigned int nlocal_iov;
 size_t remaining;
 ssize_t ret;
 
 remaining = iov_size(iov, 2) - s->send_index;
-ret = iov_send(s->fd, iov, 2, s->send_index, remaining);
-
-if (ret == -1 && errno == EAGAIN) {
+nlocal_iov = iov_copy(local_iov, 2, iov, 2, s->send_index, remaining);
+ret = qio_channel_writev(s->ioc, local_iov, nlocal_iov, NULL);
+if (ret == QIO_CHANNEL_ERR_BLOCK) {
 ret = 0; /* handled further down */
 }
 if (ret == -1) {
@@ -108,19 +98,25 @@ static ssize_t net_stream_receive(NetClientState *nc, 
const uint8_t *buf,
 }
 if (ret < (ssize_t)remaining) {
 s->send_index += ret;
-net_stream_write_poll(s, true);
+s->ioc_write_tag = qio_channel_add_watch(s->ioc, G_IO_OUT,
+ net_stream_writable, s, NULL);
 return 0;
 }
 s->send_index = 0;
 return size;
 }
 
+static gboolean net_stream_send(QIOChannel *ioc,
+GIOCondition condition,
+gpointer data);
+
 static void net_stream_send_completed(NetClientState *nc, ssize_t len)
 {
 NetStreamState *s = DO_UPCAST(NetStreamState, nc, nc);
 
-if (!s->read_poll) {
-net_stream_read_poll(s, true);
+if (!s->ioc_read_tag) {
+s->ioc_read_tag = qio_channel_add_watch(s->ioc, G_IO_IN,
+net_stream_send, s, NULL);
 }
 }
 
@@ -131,19 +127,24 @@ static void net_stream_rs_finalize(SocketReadState *rs)
 if (qemu_send_packet_async(&s->nc, rs->buf,
rs->packet_len,
net_stream_send_completed) == 0) {
-net_stream_read_poll(s, false);
+if (s->ioc_read_tag) {
+g_source_remove(s->ioc_read_tag);
+s->ioc_read_tag = 0;
+}
 }
 }
 
-static void net_stream_send(void *opaque)
+static gboolean net_stream_send(QIOChannel *ioc,
+GIOCondition condition,
+gpointer data)
 {
-NetStreamState 

[PATCH v11 13/17] qemu-sockets: move and rename SocketAddress_to_str()

2022-10-11 Thread Laurent Vivier
Rename SocketAddress_to_str() to socket_uri() and move it to
util/qemu-sockets.c close to socket_parse().

socket_uri() generates a string from a SocketAddress while
socket_parse() generates a SocketAddress from a string.

Signed-off-by: Laurent Vivier 
Reviewed-by: David Gibson 
Reviewed-by: Dr. David Alan Gilbert 
Acked-by: Michael S. Tsirkin 
---
 include/qemu/sockets.h |  2 +-
 monitor/hmp-cmds.c | 23 +--
 util/qemu-sockets.c| 20 
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index db4bedb6fa20..214058d8e307 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -58,6 +58,7 @@ NetworkAddressFamily inet_netfamily(int family);
 int unix_listen(const char *path, Error **errp);
 int unix_connect(const char *path, Error **errp);
 
+char *socket_uri(SocketAddress *addr);
 SocketAddress *socket_parse(const char *str, Error **errp);
 int socket_connect(SocketAddress *addr, Error **errp);
 int socket_listen(SocketAddress *addr, int num, Error **errp);
@@ -141,5 +142,4 @@ SocketAddress *socket_address_flatten(SocketAddressLegacy 
*addr);
  * Return 0 on success.
  */
 int socket_address_parse_named_fd(SocketAddress *addr, Error **errp);
-
 #endif /* QEMU_SOCKETS_H */
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index f90eea8d01c6..edf7068a9224 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -197,27 +197,6 @@ void hmp_info_mice(Monitor *mon, const QDict *qdict)
 qapi_free_MouseInfoList(mice_list);
 }
 
-static char *SocketAddress_to_str(SocketAddress *addr)
-{
-switch (addr->type) {
-case SOCKET_ADDRESS_TYPE_INET:
-return g_strdup_printf("tcp:%s:%s",
-   addr->u.inet.host,
-   addr->u.inet.port);
-case SOCKET_ADDRESS_TYPE_UNIX:
-return g_strdup_printf("unix:%s",
-   addr->u.q_unix.path);
-case SOCKET_ADDRESS_TYPE_FD:
-return g_strdup_printf("fd:%s", addr->u.fd.str);
-case SOCKET_ADDRESS_TYPE_VSOCK:
-return g_strdup_printf("tcp:%s:%s",
-   addr->u.vsock.cid,
-   addr->u.vsock.port);
-default:
-return g_strdup("unknown address type");
-}
-}
-
 void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 {
 MigrationInfo *info;
@@ -380,7 +359,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, "socket address: [\n");
 
 for (addr = info->socket_address; addr; addr = addr->next) {
-char *s = SocketAddress_to_str(addr->value);
+char *s = socket_uri(addr->value);
 monitor_printf(mon, "\t%s\n", s);
 g_free(s);
 }
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 83f4bd6fd211..9f6f655fd526 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -1077,6 +1077,26 @@ int unix_connect(const char *path, Error **errp)
 return sock;
 }
 
+char *socket_uri(SocketAddress *addr)
+{
+switch (addr->type) {
+case SOCKET_ADDRESS_TYPE_INET:
+return g_strdup_printf("tcp:%s:%s",
+   addr->u.inet.host,
+   addr->u.inet.port);
+case SOCKET_ADDRESS_TYPE_UNIX:
+return g_strdup_printf("unix:%s",
+   addr->u.q_unix.path);
+case SOCKET_ADDRESS_TYPE_FD:
+return g_strdup_printf("fd:%s", addr->u.fd.str);
+case SOCKET_ADDRESS_TYPE_VSOCK:
+return g_strdup_printf("tcp:%s:%s",
+   addr->u.vsock.cid,
+   addr->u.vsock.port);
+default:
+return g_strdup("unknown address type");
+}
+}
 
 SocketAddress *socket_parse(const char *str, Error **errp)
 {
-- 
2.37.3




[PATCH v11 16/17] tests/qtest: netdev: test stream and dgram backends

2022-10-11 Thread Laurent Vivier
Signed-off-by: Laurent Vivier 
Acked-by: Michael S. Tsirkin 
---
 tests/qtest/meson.build |   1 +
 tests/qtest/netdev-socket.c | 417 
 2 files changed, 418 insertions(+)
 create mode 100644 tests/qtest/netdev-socket.c

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 455f1bbb7e52..464774e7b630 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -27,6 +27,7 @@ qtests_generic = [
   'test-hmp',
   'qos-test',
   'readconfig-test',
+  'netdev-socket',
 ]
 if config_host.has_key('CONFIG_MODULES')
   qtests_generic += [ 'modules-test' ]
diff --git a/tests/qtest/netdev-socket.c b/tests/qtest/netdev-socket.c
new file mode 100644
index ..4ea66b4c6988
--- /dev/null
+++ b/tests/qtest/netdev-socket.c
@@ -0,0 +1,417 @@
+/*
+ * QTest testcase for netdev stream and dgram
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+
+#define CONNECTION_TIMEOUT5
+
+#define EXPECT_STATE(q, e, t) \
+do {  \
+char *resp = qtest_hmp(q, "info network");\
+if (t) {  \
+strrchr(resp, t)[0] = 0;  \
+} \
+g_test_timer_start(); \
+while (g_test_timer_elapsed() < CONNECTION_TIMEOUT) { \
+if (strcmp(resp, e) == 0) {   \
+break;\
+} \
+g_free(resp); \
+resp = qtest_hmp(q, "info network");  \
+if (t) {  \
+strrchr(resp, t)[0] = 0;  \
+} \
+} \
+g_assert_cmpstr(resp, ==, e); \
+g_free(resp); \
+} while (0)
+
+static int inet_get_free_port_socket(int sock)
+{
+struct sockaddr_in addr;
+socklen_t len;
+
+memset(&addr, 0, sizeof(addr));
+addr.sin_family = AF_INET;
+addr.sin_addr.s_addr = INADDR_ANY;
+addr.sin_port = 0;
+if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+return -1;
+}
+
+len = sizeof(addr);
+if (getsockname(sock, (struct sockaddr *)&addr, &len) < 0) {
+return -1;
+}
+
+return ntohs(addr.sin_port);
+}
+
+static int inet_get_free_port_multiple(int nb, int *port)
+{
+int sock[nb];
+int i;
+
+for (i = 0; i < nb; i++) {
+sock[i] = socket(AF_INET, SOCK_STREAM, 0);
+if (sock[i] < 0) {
+break;
+}
+port[i] = inet_get_free_port_socket(sock[i]);
+}
+
+nb = i;
+for (i = 0; i < nb; i++) {
+closesocket(sock[i]);
+}
+
+return nb;
+}
+
+static int inet_get_free_port(void)
+{
+int nb, port;
+
+nb = inet_get_free_port_multiple(1, &port);
+g_assert_cmpint(nb, ==, 1);
+
+return port;
+}
+
+static void test_stream_inet_ipv4(void)
+{
+QTestState *qts0, *qts1;
+char *expect;
+int port;
+
+port = inet_get_free_port();
+qts0 = qtest_initf("-nodefaults "
+   "-netdev stream,id=st0,addr.type=inet,"
+   "addr.ipv4=on,addr.ipv6=off,"
+   "addr.host=localhost,addr.port=%d", port);
+
+EXPECT_STATE(qts0, "st0: index=0,type=stream,\r\n", 0);
+
+qts1 = qtest_initf("-nodefaults "
+   "-netdev stream,server=false,id=st0,addr.type=inet,"
+   "addr.ipv4=on,addr.ipv6=off,"
+   "addr.host=localhost,addr.port=%d", port);
+
+expect = g_strdup_printf("st0: index=0,type=stream,tcp:127.0.0.1:%d\r\n",
+ port);
+EXPECT_STATE(qts1, expect, 0);
+g_free(expect);
+
+/* the port is unknown, check only the address */
+EXPECT_STATE(qts0, "st0: index=0,type=stream,tcp:127.0.0.1", ':');
+
+qtest_quit(qts1);
+qtest_quit(qts0);
+}
+
+static void test_stream_inet_ipv6(void)
+{
+QTestState *qts0, *qts1;
+char *expect;
+int port;
+
+port = inet_get_free_port();
+qts0 = qtest_initf("-nodefaults "
+   "-netdev stream,id=st0,addr.type=inet,"
+   "addr.ipv4=off,addr.ipv6=on,"
+   "addr.host=localhost,addr.port=%d", port);
+
+EXPECT_STATE(qts0, "st0: index=0,type=stream,\r\n", 0);
+
+qts1 = qtest_initf("-nodefaults "
+   "-netdev stream,server=false,id=st0,addr.type=inet,"
+   "addr.ipv4=off,addr.ipv6=on,"
+   "addr.host=localhost,addr.port=%d", port);
+
+expect = g_strdup_printf("st0: 

  1   2   3   >