Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Cédric Le Goater

On 5/7/24 04:09, Duan, Zhenzhong wrote:




-Original Message-
From: Cédric Le Goater 
Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and
its wrapper return bool

On 5/6/24 10:33, Zhenzhong Duan wrote:

Make VFIOIOMMUClass::attach_device() and its wrapper function
vfio_attach_device() return bool.

This is to follow the coding standard of returning bool when 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
   include/hw/vfio/vfio-common.h |  4 ++--
   include/hw/vfio/vfio-container-base.h |  4 ++--
   hw/vfio/ap.c  |  6 ++
   hw/vfio/ccw.c |  6 ++
   hw/vfio/common.c  |  4 ++--
   hw/vfio/container.c   | 14 +++---
   hw/vfio/iommufd.c | 11 +--
   hw/vfio/pci.c |  8 +++-
   hw/vfio/platform.c|  7 +++
   9 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..a7b6fc8f46 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
   void vfio_region_finalize(VFIORegion *region);
   void vfio_reset_handler(void *opaque);
   struct vfio_device_info *vfio_get_device_info(int fd);
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp);
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp);
   void vfio_detach_device(VFIODevice *vbasedev);

   int vfio_kvm_device_add_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 3582d5f97a..c839cfd9cb 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
   int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
-int (*attach_device)(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
+bool (*attach_device)(const char *name, VFIODevice *vbasedev,
+  AddressSpace *as, Error **errp);
   void (*detach_device)(VFIODevice *vbasedev);
   /* migration feature */
   int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 7c4caa5938..d50600b702 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -156,7 +156,6 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
   static void vfio_ap_realize(DeviceState *dev, Error **errp)
   {
   ERRP_GUARD();
-int ret;
   Error *err = NULL;
   VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
    VFIODevice *vbasedev = &vapdev->vdev;
@@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
   return;
   }

-    ret = vfio_attach_device(vbasedev->name, vbasedev,
-                             &address_space_memory, errp);
-    if (ret) {
+    if (!vfio_attach_device(vbasedev->name, vbasedev,
+                            &address_space_memory, errp)) {
   goto error;
   }

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..782bd4bed7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
   S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
    VFIODevice *vbasedev = &vcdev->vdev;
   Error *err = NULL;
-int ret;

   /* Call the class init function for subchannel. */
   if (cdc->realize) {
@@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
   return;
   }

-    ret = vfio_attach_device(cdev->mdevid, vbasedev,
-                             &address_space_memory, errp);
-    if (ret) {
+    if (!vfio_attach_device(cdev->mdevid, vbasedev,
+                            &address_space_memory, errp)) {
   goto out_attach_dev_err;
   }

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..890d30910e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1492,8 +1492,8 @@ retry:
   return info;
   }

-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp)
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp)
   {
    const VFIOIOMMUClass *ops =
        VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));

I think vfio_attach_device() can be cleaned up a little further:

    ret = ops->attach_device(name, vbasedev, as, errp);
    if (ret < 0) {
        return ret;
    }


I don't understand this.
I have both ops->attach_device() and vfio_attach_device() return 
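
For reference, a minimal sketch of the kind of cleanup being discussed,
assuming the wrapper can simply forward the bool result of
ops->attach_device() (illustrative only; the real function also selects
the iommufd backend when one is configured):

/* Illustrative sketch only, not the actual QEMU code. */
bool vfio_attach_device(char *name, VFIODevice *vbasedev,
                        AddressSpace *as, Error **errp)
{
    const VFIOIOMMUClass *ops =
        VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));

    assert(ops);

    /* No local 'ret' needed: forward the bool and let errp carry details. */
    return ops->attach_device(name, vbasedev, as, errp);
}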

[PATCH v2 0/3] Upgrade ACPI SPCR table to support SPCR table version 4 format

2024-05-06 Thread Sia Jee Heng
Update the SPCR table to accommodate the SPCR Table version 4 [1].
The SPCR table has been modified to adhere to the version 4 format [2].

Meanwhile, the virt SPCR golden reference files have been updated to
accommodate the SPCR Table version 4.

This patch series depends on Sunil's patch series [3], where Bios-Table-Test
is now supported by both ARM and RISC-V.

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931
[3]: 
https://lore.kernel.org/all/20240315130519.2378765-1-suni...@ventanamicro.com/

Changes in v2:
- Utilizes a three-patch approach to modify the ACPI pre-built binary
  files required by the Bios-Table-Test.
- Rebases and incorporates changes to support both ARM and RISC-V ACPI
  pre-built binary files.

Sia Jee Heng (3):
  qtest: allow SPCR acpi table changes
  hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4
format
  tests/qtest/bios-tables-test: Update virt SPCR golden references

 hw/acpi/aml-build.c   |  14 +++---
 hw/arm/virt-acpi-build.c  |  10 --
 hw/riscv/virt-acpi-build.c|  12 +---
 include/hw/acpi/acpi-defs.h   |   7 +--
 include/hw/acpi/aml-build.h   |   2 +-
 tests/data/acpi/virt/aarch64/SPCR | Bin 80 -> 90 bytes
 tests/data/acpi/virt/riscv64/SPCR | Bin 80 -> 90 bytes
 7 files changed, 34 insertions(+), 11 deletions(-)

-- 
2.34.1
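
For context on the size change visible in the SPCR golden files later in
this series (80 -> 90 bytes), here is a rough size check of the version-4
layout; the field offsets are assumptions taken from the iasl dump of the
updated golden file, not from the specification text:

/*
 * Rough size check for the SPCR v4 layout used in this series.
 * Assumed offsets, from the iasl dump of the updated golden file:
 *   UartClkFreq           at 0x4C (4 bytes)
 *   PreciseBaudrate       at 0x50 (4 bytes)
 *   NameSpaceStringLength at 0x54 (2 bytes)
 *   NameSpaceStringOffset at 0x56 (2 bytes)
 *   NamespaceString       at 0x58 ("." plus NUL terminator = 2 bytes)
 */
#include <assert.h>

int main(void)
{
    const char name[] = ".";              /* default namespace string */
    unsigned ns_offset = 0x58;            /* 88 decimal, as set in patch 2/3 */
    unsigned table_len = ns_offset + sizeof(name);

    assert(sizeof(name) == 2);            /* '.' + '\0' */
    assert(table_len == 0x5A);            /* 90 bytes, matching the new golden file */
    return 0;
}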




[PATCH v2 3/3] tests/qtest/bios-tables-test: Update virt SPCR golden references

2024-05-06 Thread Sia Jee Heng
Update the virt SPCR golden reference files to accommodate the
SPCR Table version 4 [1], utilizing the iasl binary compiled from the
latest ACPICA repository. The SPCR table has been modified to
adhere to the version 4 format [2].

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931

Diffs from iasl:

 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20240322 (64-bit version)
  * Copyright (c) 2000 - 2023 Intel Corporation
  *
- * Disassembly of tests/data/acpi/virt/aarch64/SPCR
+ * Disassembly of /tmp/aml-DVTAN2
  *
  * ACPI Data Table [SPCR]
  *
  * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue (in 
hex)
  */

 [000h  004h]   Signature : "SPCR"[Serial Port Console 
Redirection Table]
-[004h 0004 004h]Table Length : 0050
-[008h 0008 001h]Revision : 02
-[009h 0009 001h]Checksum : B1
+[004h 0004 004h]Table Length : 005A
+[008h 0008 001h]Revision : 04
+[009h 0009 001h]Checksum : 1D
 [00Ah 0010 006h]  Oem ID : "BOCHS "
 [010h 0016 008h]Oem Table ID : "BXPC"
 [018h 0024 004h]Oem Revision : 0001
 [01Ch 0028 004h] Asl Compiler ID : "BXPC"
 [020h 0032 004h]   Asl Compiler Revision : 0001

 [024h 0036 001h]  Interface Type : 03
 [025h 0037 003h]Reserved : 00

 [028h 0040 00Ch]Serial Port Register : [Generic Address Structure]
 [028h 0040 001h]Space ID : 00 [SystemMemory]
 [029h 0041 001h]   Bit Width : 20
 [02Ah 0042 001h]  Bit Offset : 00
 [02Bh 0043 001h]Encoded Access Width : 03 [DWord Access:32]
 [02Ch 0044 008h] Address : 0900

 [035h 0053 001h] PCAT-compatible IRQ : 00
 [036h 0054 004h]   Interrupt : 0021
 [03Ah 0058 001h]   Baud Rate : 03
 [03Bh 0059 001h]  Parity : 00
 [03Ch 0060 001h]   Stop Bits : 01
 [03Dh 0061 001h]Flow Control : 02
 [03Eh 0062 001h]   Terminal Type : 00
 [03Fh 0063 001h]Language : 00
 [040h 0064 002h]   PCI Device ID : 
 [042h 0066 002h]   PCI Vendor ID : 
 [044h 0068 001h] PCI Bus : 00
 [045h 0069 001h]  PCI Device : 00
 [046h 0070 001h]PCI Function : 00
 [047h 0071 004h]   PCI Flags : 
 [04Bh 0075 001h] PCI Segment : 00
 [04Ch 0076 004h] Uart Clock Freq : 
-/ ACPI table terminates in the middle of a data structure! (dump table)
-CurrentOffset: 50, TableLength: 50 ***/
\ No newline at end of file
+[050h 0080 004h]   Precise Baud rate : 
+[054h 0084 002h]   NameSpaceStringLength : 0002
+[056h 0086 002h]   NameSpaceStringOffset : 0058
+[058h 0088 002h] NamespaceString : "."
+
+Raw Table Data: Length 90 (0x5A)
+
+: 53 50 43 52 5A 00 00 00 04 1D 42 4F 43 48 53 20  // SPCRZ.BOCHS
+0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43  // BXPCBXPC
+0020: 01 00 00 00 03 00 00 00 00 20 00 03 00 00 00 09  // . ..
+0030: 00 00 00 00 08 00 21 00 00 00 03 00 01 02 00 00  // ..!.
+0040: FF FF FF FF 00 00 00 00 00 00 00 00 00 00 00 00  // 
+0050: 00 00 00 00 02 00 58 00 2E 00// ..X...

 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20200925 (64-bit version)
  * Copyright (c) 2000 - 2020 Intel Corporation
  *
- * Disassembly of tests/data/acpi/virt/riscv64/SPCR, Mon May  6 05:34:22 2024
+ * Disassembly of /tmp/aml-27E8M2, Mon May  6 05:34:22 2024
  *
  * ACPI Data Table [SPCR]
  *
  * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue
  */

 [000h    4]Signature : "SPCR"[Serial Port Console 
Redirection table]
-[004h 0004   4] Table Length : 0050
-[008h 0008   1] Revision : 02
-[009h 0009   1] Checksum : B9
+[004h 0004   4] Table Length : 005A
+[008h 0008   1] Revision : 04
+[009h 0009   1] Checksum : 25
 [00Ah 0010   6]   Oem ID : "BOCHS "
 [010h 0016   8] Oem Table ID : "BXPC"
 [018h 0024   4] Oem Revision : 0001
 [01Ch 0028   4]  Asl Compiler ID : "BXPC"
 [020h 0032   4]Asl Compiler Revision : 0001

 [024h 0036   1]   Interface Type : 00
 [025h 0037   3] Reserved : 00

 [028h 0040  12] Serial Port Register : [Generic Address Structure]
 [028h 0040   1] 

[PATCH v2 1/3] qtest: allow SPCR acpi table changes

2024-05-06 Thread Sia Jee Heng
Signed-off-by: Sia Jee Heng 
---
 tests/qtest/bios-tables-test-allowed-diff.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..3f12ca546b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,3 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/virt/riscv64/SPCR",
+"tests/data/acpi/virt/aarch64/SPCR",
-- 
2.34.1




[PATCH v2 2/3] hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4 format

2024-05-06 Thread Sia Jee Heng
Update the SPCR table to accommodate the SPCR Table version 4 [1].
The SPCR table has been modified to adhere to the version 4 format [2].

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931

Signed-off-by: Sia Jee Heng 
---
 hw/acpi/aml-build.c | 14 +++---
 hw/arm/virt-acpi-build.c| 10 --
 hw/riscv/virt-acpi-build.c  | 12 +---
 include/hw/acpi/acpi-defs.h |  7 +--
 include/hw/acpi/aml-build.h |  2 +-
 5 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 6d4517cfbe..7c43573eef 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1996,7 +1996,7 @@ static void build_processor_hierarchy_node(GArray *tbl, 
uint32_t flags,
 
 void build_spcr(GArray *table_data, BIOSLinker *linker,
 const AcpiSpcrData *f, const uint8_t rev,
-const char *oem_id, const char *oem_table_id)
+const char *oem_id, const char *oem_table_id, const char *name)
 {
 AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id,
 .oem_table_id = oem_table_id };
@@ -2042,8 +2042,16 @@ void build_spcr(GArray *table_data, BIOSLinker *linker,
 build_append_int_noprefix(table_data, f->pci_flags, 4);
 /* PCI Segment */
 build_append_int_noprefix(table_data, f->pci_segment, 1);
-/* Reserved */
-build_append_int_noprefix(table_data, 0, 4);
+/* UartClkFreq */
+build_append_int_noprefix(table_data, f->uart_clk_freq, 4);
+/* PreciseBaudrate */
+build_append_int_noprefix(table_data, f->precise_baudrate, 4);
+/* NameSpaceStringLength */
+build_append_int_noprefix(table_data, f->namespace_string_length, 2);
+/* NameSpaceStringOffset */
+build_append_int_noprefix(table_data, f->namespace_string_offset, 2);
+/* NamespaceString[] */
+g_array_append_vals(table_data, name, f->namespace_string_length);
 
 acpi_table_end(linker, &table);
 }
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 6a1bde61ce..cb345e8659 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -428,11 +428,12 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 
 /*
  * Serial Port Console Redirection Table (SPCR)
- * Rev: 1.07
+ * Rev: 1.10
  */
 static void
 spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 {
+const char name[] = ".";
 AcpiSpcrData serial = {
 .interface_type = 3,   /* ARM PL011 UART */
 .base_addr.id = AML_AS_SYSTEM_MEMORY,
@@ -456,9 +457,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 .pci_function = 0,
 .pci_flags = 0,
 .pci_segment = 0,
+.uart_clk_freq = 0,
+.precise_baudrate = 0,
+.namespace_string_length = sizeof(name),
+.namespace_string_offset = 88,
 };
 
-    build_spcr(table_data, linker, &serial, 2, vms->oem_id, vms->oem_table_id);
+    build_spcr(table_data, linker, &serial, 4, vms->oem_id, vms->oem_table_id,
+               name);
 }
 
 /*
diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
index 0925528160..5fa3942491 100644
--- a/hw/riscv/virt-acpi-build.c
+++ b/hw/riscv/virt-acpi-build.c
@@ -176,14 +176,15 @@ acpi_dsdt_add_uart(Aml *scope, const MemMapEntry 
*uart_memmap,
 
 /*
  * Serial Port Console Redirection Table (SPCR)
- * Rev: 1.07
+ * Rev: 1.10
  */
 
 static void
 spcr_setup(GArray *table_data, BIOSLinker *linker, RISCVVirtState *s)
 {
+const char name[] = ".";
 AcpiSpcrData serial = {
-.interface_type = 0,   /* 16550 compatible */
+.interface_type = 0x12,   /* 16550 compatible */
 .base_addr.id = AML_AS_SYSTEM_MEMORY,
 .base_addr.width = 32,
 .base_addr.offset = 0,
@@ -205,9 +206,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, 
RISCVVirtState *s)
 .pci_function = 0,
 .pci_flags = 0,
 .pci_segment = 0,
+.uart_clk_freq = 0,
+.precise_baudrate = 0,
+.namespace_string_length = sizeof(name),
+.namespace_string_offset = 88,
 };
 
-    build_spcr(table_data, linker, &serial, 2, s->oem_id, s->oem_table_id);
+    build_spcr(table_data, linker, &serial, 4, s->oem_id, s->oem_table_id,
+               name);
 }
 
 /* RHCT Node[N] starts at offset 56 */
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 0e6e82b339..2e6e341998 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -112,7 +112,6 @@ typedef struct AcpiSpcrData {
 uint8_t flow_control;
 uint8_t terminal_type;
 uint8_t language;
-uint8_t reserved1;
 uint16_t pci_device_id;/* Must be 0x if not PCI device */
 uint16_t pci_vendor_id;/* Must be 0x if not PCI device */
 uint8_t pci_bus;
@@ -120,7 +119,11 @@ typedef struct AcpiSpcrData {
 

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Jinpu Wang
Hi Peter, hi Daniel,
On Mon, May 6, 2024 at 5:29 PM Peter Xu  wrote:
>
> On Mon, May 06, 2024 at 12:08:43PM +0200, Jinpu Wang wrote:
> > Hi Peter, hi Daniel,
>
> Hi, Jinpu,
>
> Thanks for sharing this test results.  Sounds like a great news.
>
> What's your plan next?  Would it then be worthwhile / possible moving QEMU
> into that direction?  Would that greatly simplify rdma code as Dan
> mentioned?
I'm not very familiar with QEMU migration yet, but from the test
results I think it's a possible direction; we would just need to be
based on a fairly recent rdma-core release, such as v33, with proper
'fork' support.

Maybe Dan or you could give more detail about what you have in mind
for using rsocket as a replacement for the future.
We will also look into the implementation details in the meantime.

Thx!
J

>
> Thanks,
>
> >
> > On Fri, May 3, 2024 at 4:33 PM Peter Xu  wrote:
> > >
> > > On Fri, May 03, 2024 at 08:40:03AM +0200, Jinpu Wang wrote:
> > > > I had a brief check in the rsocket changelog, there seems some
> > > > improvement over time,
> > > >  might be worth revisiting this. due to socket abstraction, we can't
> > > > use some feature like
> > > >  ODP, it won't be a small and easy task.
> > >
> > > It'll be good to know whether Dan's suggestion would work first, without
> > > rewritting everything yet so far.  Not sure whether some perf test could
> > > help with the rsocket APIs even without QEMU's involvements (or looking 
> > > for
> > > test data supporting / invalidate such conversions).
> > >
> > I did a quick test with iperf on 100 G environment and 40 G
> > environment, in summary rsocket works pretty well.
> >
> > iperf tests between 2 hosts with 40 G (IB),
> > first  a few test with different num. of threads on top of ipoib
> > interface, later with preload rsocket on top of same ipoib interface.
> >
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  3] local 10.43.3.146 port 55602 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0001 sec  2.85 GBytes  2.44 Gbits/sec
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 2
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  4] local 10.43.3.146 port 39640 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 39626 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0012 sec  2.85 GBytes  2.45 Gbits/sec
> > [  4] 0.-10.0026 sec  2.86 GBytes  2.45 Gbits/sec
> > [SUM] 0.-10.0026 sec  5.71 GBytes  4.90 Gbits/sec
> > [ CT] final connect times (min/avg/max/stdev) =
> > 0.281/0.300/0.318/0.318 ms (tot/err) = 2/0
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 4
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  4] local 10.43.3.146 port 46956 connected with 10.43.3.145 port 52000
> > [  6] local 10.43.3.146 port 46978 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 46944 connected with 10.43.3.145 port 52000
> > [  5] local 10.43.3.146 port 46962 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0017 sec  2.85 GBytes  2.45 Gbits/sec
> > [  4] 0.-10.0015 sec  2.85 GBytes  2.45 Gbits/sec
> > [  5] 0.-10.0026 sec  2.85 GBytes  2.45 Gbits/sec
> > [  6] 0.-10.0005 sec  2.85 GBytes  2.45 Gbits/sec
> > [SUM] 0.-10.0005 sec  11.4 GBytes  9.80 Gbits/sec
> > [ CT] final connect times (min/avg/max/stdev) =
> > 0.274/0.312/0.360/0.212 ms (tot/err) = 4/0
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 8
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  7] local 10.43.3.146 port 35062 connected with 10.43.3.145 port 52000
> > [  6] local 10.43.3.146 port 35058 connected with 10.43.3.145 port 52000
> > [  8] local 10.43.3.146 port 35066 connected with 10.43.3.145 port 52000
> > [  9] local 10.43.3.146 port 35074 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 35038 connected with 10.43.3.145 port 52000
> > [ 12] local 10.43.3.146 port 35088 connected with 10.43.3.145 port 52000
> > [  5] local 10.43.3.146 port 35048 connected with 10.43.3.145 port 52000
> > [  4] local 10.43.3.146 port 35050 connected with 
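
As background for the preloaded-rsocket iperf runs quoted above: an
LD_PRELOAD shim works for unmodified tools because librdmacm's rsockets
expose a drop-in, BSD-socket-style API. A minimal client sketch
(hypothetical address and port, error handling trimmed; build with
-lrdmacm), just to show the shape of the API:

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <rdma/rsocket.h>

int main(void)
{
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_port   = htons(52000),      /* hypothetical port, as in the runs above */
    };
    inet_pton(AF_INET, "10.43.3.145", &dst.sin_addr);

    /* rsocket()/rconnect()/rsend() mirror socket()/connect()/send() */
    int fd = rsocket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0 || rconnect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
        perror("rsockets connect");
        return 1;
    }
    const char msg[] = "hello over RDMA";
    rsend(fd, msg, sizeof(msg), 0);
    rclose(fd);
    return 0;
}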

Re: [PATCH 1/2] ppc/pnv: Begin a more complete ADU LPC model for POWER9/10

2024-05-06 Thread Nicholas Piggin
On Fri May 3, 2024 at 3:44 PM AEST, Cédric Le Goater wrote:
> On 5/3/24 06:51, Nicholas Piggin wrote:
> > On Thu May 2, 2024 at 6:47 PM AEST, Cédric Le Goater wrote:
> >> On 5/1/24 14:39, Nicholas Piggin wrote:
> >>> On Wed Apr 17, 2024 at 9:25 PM AEST, Cédric Le Goater wrote:
>  Hello Nick,
> 
>  On 4/17/24 13:02, Nicholas Piggin wrote:
> > This implements a framework for an ADU unit model.
> >
> > The ADU unit actually implements XSCOM, which is the bridge between MMIO
> > and PIB. However it also includes control and status registers and other
> > functions that are exposed as PIB (xscom) registers.
> >
> > To keep things simple, pnv_xscom.c remains the XSCOM bridge
> > implementation, and pnv_adu.c implements the ADU registers and other
> > functions.
> >
> > So far, just the ADU no-op registers in the pnv_xscom.c default handler
> > are moved over to the adu model.
> >
> > Signed-off-by: Nicholas Piggin 
> > ---
> > include/hw/ppc/pnv_adu.h   |  34 
> > include/hw/ppc/pnv_chip.h  |   3 +
> > include/hw/ppc/pnv_xscom.h |   6 ++
> > hw/ppc/pnv.c   |  16 ++
> > hw/ppc/pnv_adu.c   | 111 
> > +
> > hw/ppc/pnv_xscom.c |   9 ---
> > hw/ppc/meson.build |   1 +
> > hw/ppc/trace-events|   4 ++
> > 8 files changed, 175 insertions(+), 9 deletions(-)
> > create mode 100644 include/hw/ppc/pnv_adu.h
> > create mode 100644 hw/ppc/pnv_adu.c
> >
> > diff --git a/include/hw/ppc/pnv_adu.h b/include/hw/ppc/pnv_adu.h
> > new file mode 100644
> > index 00..9dc91857a9
> > --- /dev/null
> > +++ b/include/hw/ppc/pnv_adu.h
> > @@ -0,0 +1,34 @@
> > +/*
> > + * QEMU PowerPC PowerNV Emulation of some ADU behaviour
> > + *
> > + * Copyright (c) 2024, IBM Corporation.
> > + *
> > + * SPDX-License-Identifier: LGPL-2.1-or-later
> 
> 
>  Did you mean GPL-2.0-or-later ?
> >>>
> >>> Hey Cedric,
> >>>
> >>> Thanks for reviewing, I've been away so sorry for the late reply.
> >>>
> >>> It just came from one of the headers I copied which was LGPL. But
> >>> there's really nothing much in it and could find a GPL header to
> >>> copy. Is GPL-2.0-or-later preferred?
> >>
> >> I would since all pnv models are GPL.
> > 
> > Some of pnv is actually LGPL. 
>
> I was grepping for 'LGPL' and not 'Lesser' ... Indeed you are right.
> Most files miss an SPDX-License-Identifier tag also.
>
> > That's okay I'll change to GPL.
>
> LGPL is more relaxed if the code needs to be used in libraries, but
> I am not sure it applies to the PNV models. What would you prefer ?

GPL seems to be more common and I don't see a need for LGPL here,
so maybe GPL?

We could probably switch all LGPL pnv over to GPL if we wanted to.
I think LGPL permits such relicensing. Will leave this discussion
for another time though.

Thanks,
Nick



Re: [PATCH] spapr: Migrate ail-mode-3 spapr cap

2024-05-06 Thread Harsh Prateek Bora




On 5/6/24 17:26, Nicholas Piggin wrote:

This cap did not add the migration code when it was introduced. This
results in migration failure when changing the default using the
command line.

Cc: qemu-sta...@nongnu.org
Fixes: ccc5a4c5e10 ("spapr: Add SPAPR_CAP_AIL_MODE_3 for AIL mode 3 support for 
H_SET_MODE hcall")
Signed-off-by: Nicholas Piggin 


Reviewed-by: Harsh Prateek Bora 


---
  include/hw/ppc/spapr.h | 1 +
  hw/ppc/spapr.c | 1 +
  hw/ppc/spapr_caps.c| 1 +
  3 files changed, 3 insertions(+)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 4aaf23d28f..f6de3e9972 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -1004,6 +1004,7 @@ extern const VMStateDescription 
vmstate_spapr_cap_large_decr;
  extern const VMStateDescription vmstate_spapr_cap_ccf_assist;
  extern const VMStateDescription vmstate_spapr_cap_fwnmi;
  extern const VMStateDescription vmstate_spapr_cap_rpt_invalidate;
+extern const VMStateDescription vmstate_spapr_cap_ail_mode_3;
  extern const VMStateDescription vmstate_spapr_wdt;
  
  static inline uint8_t spapr_get_cap(SpaprMachineState *spapr, int cap)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d2d1e310a3..065f58ec93 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2169,6 +2169,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_fwnmi,
         &vmstate_spapr_fwnmi,
         &vmstate_spapr_cap_rpt_invalidate,
+        &vmstate_spapr_cap_ail_mode_3,
         &vmstate_spapr_cap_nested_papr,
  NULL
  }
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 0a15415a1d..2f74923560 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -974,6 +974,7 @@ SPAPR_CAP_MIG_STATE(large_decr, 
SPAPR_CAP_LARGE_DECREMENTER);
  SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST);
  SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI);
  SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE);
+SPAPR_CAP_MIG_STATE(ail_mode_3, SPAPR_CAP_AIL_MODE_3);
  
  void spapr_caps_init(SpaprMachineState *spapr)

  {




[PATCH v2 1/6] hw/loongarch: Refine acpi srat table for numa memory

2024-05-06 Thread Bibo Mao
On the LoongArch virt machine platform, there are limitations on the
memory map information: the minimum memory size is 256M, and the
minimum memory size for numa node0 is also 256M. With the qemu numa
qtest, the memory size of numa node0 can be 128M.

The minimum memory size limitations for both total memory and numa
node0 are removed for ACPI SRAT table creation.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/acpi-build.c | 58 +++
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c
index e5ab1080af..d0247d93ee 100644
--- a/hw/loongarch/acpi-build.c
+++ b/hw/loongarch/acpi-build.c
@@ -165,8 +165,9 @@ static void
 build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 {
 int i, arch_id, node_id;
-uint64_t mem_len, mem_base;
-int nb_numa_nodes = machine->numa_state->num_nodes;
+hwaddr len, base, gap;
+NodeInfo *numa_info;
+int nodes, nb_numa_nodes = machine->numa_state->num_nodes;
 LoongArchMachineState *lams = LOONGARCH_MACHINE(machine);
 MachineClass *mc = MACHINE_GET_CLASS(lams);
 const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine);
@@ -195,35 +196,44 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
 build_append_int_noprefix(table_data, 0, 4); /* Reserved */
 }
 
-/* Node0 */
-build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE,
-  0, MEM_AFFINITY_ENABLED);
-mem_base = VIRT_HIGHMEM_BASE;
-if (!nb_numa_nodes) {
-mem_len = machine->ram_size - VIRT_LOWMEM_SIZE;
-} else {
-mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+numa_info = machine->numa_state->nodes;
+nodes = nb_numa_nodes;
+if (!nodes) {
+nodes = 1;
 }
-if (mem_len)
-build_srat_memory(table_data, mem_base, mem_len, 0, 
MEM_AFFINITY_ENABLED);
-
-/* Node1 - Nodemax */
-if (nb_numa_nodes) {
-mem_base += mem_len;
-for (i = 1; i < nb_numa_nodes; ++i) {
-if (machine->numa_state->nodes[i].node_mem > 0) {
-build_srat_memory(table_data, mem_base,
-  machine->numa_state->nodes[i].node_mem, i,
-  MEM_AFFINITY_ENABLED);
-mem_base += machine->numa_state->nodes[i].node_mem;
-}
+
+for (i = 0; i < nodes; i++) {
+if (nb_numa_nodes) {
+len = numa_info[i].node_mem;
+} else {
+len = machine->ram_size;
+}
+
+/*
+ * memory for the node is split into two parts
+ *   lowram:  [base, +gap)
+ *   highram: [VIRT_HIGHMEM_BASE, +(len - gap))
+ */
+if (len >= gap) {
+build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED);
+len -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = machine->ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (len) {
+build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED);
+base += len;
+gap  -= len;
 }
 }
 
 if (machine->device_memory) {
 build_srat_memory(table_data, machine->device_memory->base,
                      memory_region_size(&machine->device_memory->mr),
-  nb_numa_nodes - 1,
+  nodes - 1,
   MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
 }
 
-- 
2.39.3
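
To make the low/high split described in the commit message concrete, here
is a small standalone sketch of the address arithmetic. The constants are
assumptions taken from the virt machine (256MiB of low RAM at 0, high RAM
continuing at 0x90000000); this is an illustration of the scheme, not the
QEMU code itself:

#include <stdio.h>
#include <stdint.h>

#define VIRT_LOWMEM_BASE   0x0ULL
#define VIRT_LOWMEM_SIZE   (256ULL << 20)
#define VIRT_HIGHMEM_BASE  0x90000000ULL

static void split_node(int node, uint64_t node_mem,
                       uint64_t *base, uint64_t *gap)
{
    uint64_t len = node_mem;

    if (len >= *gap) {
        /* first part fills what is left of the low window */
        printf("node%d: base=0x%llx size=0x%llx\n", node,
               (unsigned long long)*base, (unsigned long long)*gap);
        len -= *gap;
        *base = VIRT_HIGHMEM_BASE;
        *gap = UINT64_MAX;      /* high window treated as unbounded here */
    }
    if (len) {
        printf("node%d: base=0x%llx size=0x%llx\n", node,
               (unsigned long long)*base, (unsigned long long)len);
        *base += len;
        *gap -= len;
    }
}

int main(void)
{
    /* e.g. two nodes of 128MiB and 1GiB */
    uint64_t base = VIRT_LOWMEM_BASE, gap = VIRT_LOWMEM_SIZE;
    split_node(0, 128ULL << 20, &base, &gap);
    split_node(1, 1ULL << 30, &base, &gap);
    return 0;
}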




[PATCH v2 6/6] tests/qtest: Add numa test for loongarch system

2024-05-06 Thread Bibo Mao
Add a numa test case for the loongarch system; it passes when run
with "make check-qtest" after the following patch
is applied.
https://lore.kernel.org/all/20240319022606.2994565-1-maob...@loongson.cn/

Signed-off-by: Bibo Mao 
---
 tests/qtest/meson.build | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 6f2f594ace..a72436df65 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -256,6 +256,8 @@ qtests_s390x = \
 qtests_riscv32 = \
   (config_all_devices.has_key('CONFIG_SIFIVE_E_AON') ? 
['sifive-e-aon-watchdog-test'] : [])
 
+qtests_loongarch64 = ['numa-test']
+
 qos_test_ss = ss.source_set()
 qos_test_ss.add(
   'ac97-test.c',
-- 
2.39.3




[PATCH v2 4/6] hw/loongarch: Refine system dram memory region

2024-05-06 Thread Bibo Mao
For the system DRAM memory region, it is not necessary to use numa node
information. There are only a low memory region and a high memory region.

Remove the numa node information from the DRAM memory region here; this
reduces the number of memory regions on the LoongArch virt machine.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 55 ++---
 1 file changed, 17 insertions(+), 38 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index aca8290795..5abfe0a9d7 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -974,18 +974,13 @@ static void loongarch_init(MachineState *machine)
 {
 LoongArchCPU *lacpu;
 const char *cpu_model = machine->cpu_type;
-ram_addr_t offset = 0;
-ram_addr_t ram_size = machine->ram_size;
-uint64_t highram_size = 0, phyAddr = 0;
 MemoryRegion *address_space_mem = get_system_memory();
 LoongArchMachineState *lams = LOONGARCH_MACHINE(machine);
-int nb_numa_nodes = machine->numa_state->num_nodes;
-NodeInfo *numa_info = machine->numa_state->nodes;
 int i;
+hwaddr base, size, ram_size = machine->ram_size;
 const CPUArchIdList *possible_cpus;
 MachineClass *mc = MACHINE_GET_CLASS(machine);
 CPUState *cpu;
-char *ramName = NULL;
 
 if (!cpu_model) {
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
@@ -1020,41 +1015,27 @@ static void loongarch_init(MachineState *machine)
 fdt_add_memory_nodes(machine);
 fw_cfg_add_memory(machine);
 
-/* Node0 memory */
-    memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
-                             machine->ram, offset, VIRT_LOWMEM_SIZE);
-    memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
-
-offset += VIRT_LOWMEM_SIZE;
-if (nb_numa_nodes > 0) {
-assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE);
-highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE;
-} else {
-highram_size = ram_size - VIRT_LOWMEM_SIZE;
+size = ram_size;
+base = VIRT_LOWMEM_BASE;
+if (size > VIRT_LOWMEM_SIZE) {
+size = VIRT_LOWMEM_SIZE;
 }
-phyAddr = VIRT_HIGHMEM_BASE;
-    memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
-                             machine->ram, offset, highram_size);
-    memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
-
-/* Node1 - Nodemax memory */
-offset += highram_size;
-phyAddr += highram_size;
-
-for (i = 1; i < nb_numa_nodes; i++) {
-MemoryRegion *nodemem = g_new(MemoryRegion, 1);
-ramName = g_strdup_printf("loongarch.node%d.ram", i);
-memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
- offset,  numa_info[i].node_mem);
-memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-offset += numa_info[i].node_mem;
-phyAddr += numa_info[i].node_mem;
+
+    memory_region_init_alias(&lams->lowmem, NULL, "loongarch.lowram",
+                             machine->ram, base, size);
+    memory_region_add_subregion(address_space_mem, base, &lams->lowmem);
+base += size;
+if (ram_size - size) {
+base = VIRT_HIGHMEM_BASE;
+        memory_region_init_alias(&lams->highmem, NULL, "loongarch.highram",
+                                 machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size);
+        memory_region_add_subregion(address_space_mem, base, &lams->highmem);
+base += ram_size - size;
 }
 
 /* initialize device memory address space */
 if (machine->ram_size < machine->maxram_size) {
 ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
-hwaddr device_mem_base;
 
 if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) {
 error_report("unsupported amount of memory slots: %"PRIu64,
@@ -1068,9 +1049,7 @@ static void loongarch_init(MachineState *machine)
  "%d bytes", TARGET_PAGE_SIZE);
 exit(EXIT_FAILURE);
 }
-/* device memory base is the top of high memory address. */
-device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB);
-machine_memory_devices_init(machine, device_mem_base, device_mem_size);
+machine_memory_devices_init(machine, base, device_mem_size);
 }
 
 /* load the BIOS image. */
-- 
2.39.3




[PATCH v2 3/6] hw/loongarch: Refine fwcfg memory map

2024-05-06 Thread Bibo Mao
The fwcfg memory map table is used by the UEFI BIOS. UEFI uses the first
entry of the fwcfg memory map as the first memory HOB; the second memory
HOB is used once the first one is exhausted.

The fwcfg memory map table does not care about numa nodes. In general,
however, the first memory HOB is part of numa node0, so UEFI runtime
memory allocated from the first memory HOB is located on numa node0.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 60 ++---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index db76bc94f8..aca8290795 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -914,6 +914,62 @@ static const MemoryRegionOps loongarch_qemu_ops = {
 },
 };
 
+static void fw_cfg_add_memory(MachineState *ms)
+{
+hwaddr base, size, ram_size, gap;
+int nb_numa_nodes, nodes;
+NodeInfo *numa_info;
+
+ram_size = ms->ram_size;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+numa_info = ms->numa_state->nodes;
+if (!nodes) {
+nodes = 1;
+}
+
+/* add fw_cfg memory map of node0 */
+if (nb_numa_nodes) {
+size = numa_info[0].node_mem;
+} else {
+size = ram_size;
+}
+
+if (size >= gap) {
+memmap_add_entry(base, gap, 1);
+size -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (size) {
+memmap_add_entry(base, size, 1);
+base += size;
+}
+
+if (nodes < 2) {
+return;
+}
+
+/* add fw_cfg memory map of other nodes */
+size = ram_size - numa_info[0].node_mem;
+gap  = VIRT_LOWMEM_BASE + VIRT_LOWMEM_SIZE;
+if (base < gap && (base + size) > gap) {
+/*
+ * memory map for the remaining nodes is split into two parts
+ *   lowram:  [base, +(gap - base))
+ *   highram: [VIRT_HIGHMEM_BASE, +(size - (gap - base)))
+ */
+memmap_add_entry(base, gap - base, 1);
+size -= gap - base;
+base = VIRT_HIGHMEM_BASE;
+}
+
+    if (size) {
+        memmap_add_entry(base, size, 1);
+    }
+}
+
 static void loongarch_init(MachineState *machine)
 {
 LoongArchCPU *lacpu;
@@ -962,9 +1018,9 @@ static void loongarch_init(MachineState *machine)
 fdt_add_cpu_nodes(lams);
 
 fdt_add_memory_nodes(machine);
+fw_cfg_add_memory(machine);
 
 /* Node0 memory */
-memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
     memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
                              machine->ram, offset, VIRT_LOWMEM_SIZE);
     memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
@@ -977,7 +1033,6 @@ static void loongarch_init(MachineState *machine)
 highram_size = ram_size - VIRT_LOWMEM_SIZE;
 }
 phyAddr = VIRT_HIGHMEM_BASE;
-memmap_add_entry(phyAddr, highram_size, 1);
     memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
                              machine->ram, offset, highram_size);
     memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
@@ -992,7 +1047,6 @@ static void loongarch_init(MachineState *machine)
 memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
 offset += numa_info[i].node_mem;
 phyAddr += numa_info[i].node_mem;
 }
-- 
2.39.3




[PATCH v2 5/6] hw/loongarch: Remove minimum and default memory size

2024-05-06 Thread Bibo Mao
Some qtest test cases such as numa use the default memory size of the
generic machine class, which is 128M by default.

Here the generic default memory size is used, and the minimum memory
size check, originally 1G, is also removed.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 5abfe0a9d7..36477172ea 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -986,10 +986,6 @@ static void loongarch_init(MachineState *machine)
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
 }
 
-if (ram_size < 1 * GiB) {
-error_report("ram_size must be greater than 1G.");
-exit(1);
-}
 create_fdt(lams);
 
 /* Create IOCSR space */
@@ -1284,7 +1280,6 @@ static void loongarch_class_init(ObjectClass *oc, void 
*data)
 
 mc->desc = "Loongson-3A5000 LS7A1000 machine";
 mc->init = loongarch_init;
-mc->default_ram_size = 1 * GiB;
 mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464");
 mc->default_ram_id = "loongarch.ram";
 mc->max_cpus = LOONGARCH_MAX_CPUS;
-- 
2.39.3




[PATCH v2 0/6] hw/loongarch: Refine numa memory map

2024-05-06 Thread Bibo Mao
On the LoongArch virt machine platform, there are limitations on the
memory map information: the minimum memory size is 256M, and the
minimum memory size for numa node0 is also 256M. With the qemu numa
qtest, the memory size of numa node0 can be 128M.

The minimum memory size limitations for both total memory and numa
node0 are removed here, covering the acpi srat table, the fadt memory
map table and the fw_cfg memory map table.

Also remove numa node information from the memory regions; there are
only a low memory region and a high memory region.

---
v1 ... v2:
  1. Refresh the patch based on the latest qemu version, solve the
confliction issue.
  2. Add numa test case for loongarch system.
---
Bibo Mao (6):
  hw/loongarch: Refine acpi srat table for numa memory
  hw/loongarch: Refine fadt memory table for numa memory
  hw/loongarch: Refine fwcfg memory map
  hw/loongarch: Refine system dram memory region
  hw/loongarch: Remove minimum and default memory size
  tests/qtest: Add numa test for loongarch system

 hw/loongarch/acpi-build.c |  58 +++--
 hw/loongarch/virt.c   | 167 +++---
 tests/qtest/meson.build   |   2 +
 3 files changed, 154 insertions(+), 73 deletions(-)


base-commit: 248f6f62df073a3b4158fd0093863ab885feabb5
-- 
2.39.3




[PATCH v2 2/6] hw/loongarch: Refine fadt memory table for numa memory

2024-05-06 Thread Bibo Mao
On the LoongArch virt machine platform, there are limitations on the
memory map information: the minimum memory size is 256M, and the
minimum memory size for numa node0 is also 256M. With the qemu numa
qtest, the memory size of numa node0 can be 128M.

The minimum memory size limitations for both total memory and numa
node0 are removed for fadt numa memory table creation.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 47 ++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index c0999878df..db76bc94f8 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -473,6 +473,48 @@ static void fdt_add_memory_node(MachineState *ms,
 g_free(nodename);
 }
 
+static void fdt_add_memory_nodes(MachineState *ms)
+{
+hwaddr base, size, ram_size, gap;
+int i, nb_numa_nodes, nodes;
+NodeInfo *numa_info;
+
+ram_size = ms->ram_size;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+numa_info = ms->numa_state->nodes;
+if (!nodes) {
+nodes = 1;
+}
+
+for (i = 0; i < nodes; i++) {
+if (nb_numa_nodes) {
+size = numa_info[i].node_mem;
+} else {
+size = ram_size;
+}
+
+/*
+ * memory for the node is split into two parts
+ *   lowram:  [base, +gap)
+ *   highram: [VIRT_HIGHMEM_BASE, +(len - gap))
+ */
+if (size >= gap) {
+fdt_add_memory_node(ms, base, gap, i);
+size -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (size) {
+fdt_add_memory_node(ms, base, size, i);
+base += size;
+gap -= size;
+}
+}
+}
+
 static void virt_build_smbios(LoongArchMachineState *lams)
 {
 MachineState *ms = MACHINE(lams);
@@ -919,9 +961,10 @@ static void loongarch_init(MachineState *machine)
 }
 fdt_add_cpu_nodes(lams);
 
+fdt_add_memory_nodes(machine);
+
 /* Node0 memory */
 memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
-fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0);
     memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
                              machine->ram, offset, VIRT_LOWMEM_SIZE);
     memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
@@ -935,7 +978,6 @@ static void loongarch_init(MachineState *machine)
 }
 phyAddr = VIRT_HIGHMEM_BASE;
 memmap_add_entry(phyAddr, highram_size, 1);
-fdt_add_memory_node(machine, phyAddr, highram_size, 0);
     memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
                              machine->ram, offset, highram_size);
     memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
@@ -951,7 +993,6 @@ static void loongarch_init(MachineState *machine)
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
 memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
-fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i);
 offset += numa_info[i].node_mem;
 phyAddr += numa_info[i].node_mem;
 }
-- 
2.39.3




Re: [PATCH v3 1/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Zhijian Li (Fujitsu)


on 5/7/2024 10:50 AM, Masato Imai wrote:
> When the KVM acceleration parameter is not set, executing calc_dirty_rate
> with the -r or -b option results in a segmentation fault due to accessing
> a null kvm_state pointer in the kvm_dirty_ring_enabled function. This
> commit adds a null check for kvm_state to prevent segmentation faults.
>
> Signed-off-by: Masato Imai 

LGTM,
Tested-by: Li Zhijian 


> ---
>   accel/kvm/kvm-all.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index c0be9f5eed..544293be8a 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
>   
>   bool kvm_dirty_ring_enabled(void)
>   {
> -return kvm_state->kvm_dirty_ring_size ? true : false;
> +return kvm_state && kvm_state->kvm_dirty_ring_size;
>   }
>   
>   static void query_stats_cb(StatsResultList **result, StatsTarget target,


[PATCH v3 1/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Masato Imai
When the KVM acceleration parameter is not set, executing calc_dirty_rate
with the -r or -b option results in a segmentation fault due to accessing
a null kvm_state pointer in the kvm_dirty_ring_enabled function. This
commit adds a null check for kvm_state to prevent segmentation faults.

Signed-off-by: Masato Imai 
---
 accel/kvm/kvm-all.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c0be9f5eed..544293be8a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
 
 bool kvm_dirty_ring_enabled(void)
 {
-return kvm_state->kvm_dirty_ring_size ? true : false;
+return kvm_state && kvm_state->kvm_dirty_ring_size;
 }
 
 static void query_stats_cb(StatsResultList **result, StatsTarget target,
-- 
2.34.1




[PATCH v3 0/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Masato Imai
Changes from v2:
- avoid segfault in kvm/accel instead of migration/dirtyrate

v2: 
https://lore.kernel.org/qemu-devel/20240423091306.754432-1-...@sfc.wide.ad.jp

Masato Imai (1):
  accel/kvm: Fix segmentation fault

 accel/kvm/kvm-all.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.34.1




RE: [PATCH v3 00/19] Add a host IOMMU device abstraction to check with vIOMMU

2024-05-06 Thread Duan, Zhenzhong



>-Original Message-
>From: Jason Gunthorpe 
>Subject: Re: [PATCH v3 00/19] Add a host IOMMU device abstraction to
>check with vIOMMU
>
>On Mon, May 06, 2024 at 02:30:47AM +, Duan, Zhenzhong wrote:
>
>> I'm not clear how useful multiple iommufd instances support are.
>> One possible benefit is for security? It may bring a slightly fine-grained
>> isolation in kernel.
>
>No. I don't think there is any usecase, it is only harmful.

OK, so we need to limit QEMU to only one iommufd instance.

In the cdev series, we support a mix of legacy and iommufd backends, and
multiple iommufd backend instances, for flexibility.
We need to decide whether to apply this limitation only to the nesting
series or globally (including cdev).
May I ask what harm this may cause?

Thanks
Zhenzhong



[PATCH] hw/loongarch/virt: Fix memory leak

2024-05-06 Thread Song Gao
The char pointer 'ramName' points to a block of memory that is
never freed. Use 'g_autofree' to free it automatically.

Resolves: Coverity CID 1544773

Fixes: 0cf1478d6 ("hw/loongarch: Add numa support")
Signed-off-by: Song Gao 
---
 hw/loongarch/virt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index c0999878df..ea5100be6d 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -887,7 +887,6 @@ static void loongarch_init(MachineState *machine)
 const CPUArchIdList *possible_cpus;
 MachineClass *mc = MACHINE_GET_CLASS(machine);
 CPUState *cpu;
-char *ramName = NULL;
 
 if (!cpu_model) {
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
@@ -946,7 +945,7 @@ static void loongarch_init(MachineState *machine)
 
 for (i = 1; i < nb_numa_nodes; i++) {
 MemoryRegion *nodemem = g_new(MemoryRegion, 1);
-ramName = g_strdup_printf("loongarch.node%d.ram", i);
+g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i);
 memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-- 
2.25.1
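
As a side note on the idiom used in the fix: g_autofree attaches GLib's
cleanup attribute to the pointer, so the string is freed when the variable
goes out of scope. A tiny illustration (not QEMU code, just the GLib
mechanism on its own):

/* Small illustration of g_autofree scoping (GLib); not QEMU code. */
#include <glib.h>

static void demo(int i)
{
    g_autofree char *name = g_strdup_printf("loongarch.node%d.ram", i);

    g_print("%s\n", name);
}   /* 'name' is g_free()d automatically here, even on early return */

int main(void)
{
    demo(1);
    return 0;
}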




RE: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Duan, Zhenzhong


>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and
>its wrapper return bool
>
>On 5/6/24 10:33, Zhenzhong Duan wrote:
>> Make VFIOIOMMUClass::attach_device() and its wrapper function
>> vfio_attach_device() return bool.
>>
>> This is to follow the coding standard of returning bool when 'Error **'
>> is used to pass errors.
>>
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/hw/vfio/vfio-common.h |  4 ++--
>>   include/hw/vfio/vfio-container-base.h |  4 ++--
>>   hw/vfio/ap.c  |  6 ++
>>   hw/vfio/ccw.c |  6 ++
>>   hw/vfio/common.c  |  4 ++--
>>   hw/vfio/container.c   | 14 +++---
>>   hw/vfio/iommufd.c | 11 +--
>>   hw/vfio/pci.c |  8 +++-
>>   hw/vfio/platform.c|  7 +++
>>   9 files changed, 28 insertions(+), 36 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index b9da6c08ef..a7b6fc8f46 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
>>   void vfio_region_finalize(VFIORegion *region);
>>   void vfio_reset_handler(void *opaque);
>>   struct vfio_device_info *vfio_get_device_info(int fd);
>> -int vfio_attach_device(char *name, VFIODevice *vbasedev,
>> -   AddressSpace *as, Error **errp);
>> +bool vfio_attach_device(char *name, VFIODevice *vbasedev,
>> +AddressSpace *as, Error **errp);
>>   void vfio_detach_device(VFIODevice *vbasedev);
>>
>>   int vfio_kvm_device_add_fd(int fd, Error **errp);
>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-
>container-base.h
>> index 3582d5f97a..c839cfd9cb 100644
>> --- a/include/hw/vfio/vfio-container-base.h
>> +++ b/include/hw/vfio/vfio-container-base.h
>> @@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
>>   int (*dma_unmap)(const VFIOContainerBase *bcontainer,
>>hwaddr iova, ram_addr_t size,
>>IOMMUTLBEntry *iotlb);
>> -int (*attach_device)(const char *name, VFIODevice *vbasedev,
>> - AddressSpace *as, Error **errp);
>> +bool (*attach_device)(const char *name, VFIODevice *vbasedev,
>> +  AddressSpace *as, Error **errp);
>>   void (*detach_device)(VFIODevice *vbasedev);
>>   /* migration feature */
>>   int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
>> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
>> index 7c4caa5938..d50600b702 100644
>> --- a/hw/vfio/ap.c
>> +++ b/hw/vfio/ap.c
>> @@ -156,7 +156,6 @@ static void
>vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
>>   static void vfio_ap_realize(DeviceState *dev, Error **errp)
>>   {
>>   ERRP_GUARD();
>> -int ret;
>>   Error *err = NULL;
>>   VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
>>   VFIODevice *vbasedev = &vapdev->vdev;
>> @@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error
>**errp)
>>   return;
>>   }
>>
>> -    ret = vfio_attach_device(vbasedev->name, vbasedev,
>> -                             &address_space_memory, errp);
>> -    if (ret) {
>> +    if (!vfio_attach_device(vbasedev->name, vbasedev,
>> +                            &address_space_memory, errp)) {
>>   goto error;
>>   }
>>
>> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
>> index 90e4a53437..782bd4bed7 100644
>> --- a/hw/vfio/ccw.c
>> +++ b/hw/vfio/ccw.c
>> @@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev,
>Error **errp)
>>   S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
>>   VFIODevice *vbasedev = &vcdev->vdev;
>>   Error *err = NULL;
>> -int ret;
>>
>>   /* Call the class init function for subchannel. */
>>   if (cdc->realize) {
>> @@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev,
>Error **errp)
>>   return;
>>   }
>>
>> -    ret = vfio_attach_device(cdev->mdevid, vbasedev,
>> -                             &address_space_memory, errp);
>> -    if (ret) {
>> +    if (!vfio_attach_device(cdev->mdevid, vbasedev,
>> +                            &address_space_memory, errp)) {
>>   goto out_attach_dev_err;
>>   }
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 8f9cbdc026..890d30910e 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -1492,8 +1492,8 @@ retry:
>>   return info;
>>   }
>>
>> -int vfio_attach_device(char *name, VFIODevice *vbasedev,
>> -   AddressSpace *as, Error **errp)
>> +bool vfio_attach_device(char *name, VFIODevice *vbasedev,
>> +AddressSpace *as, Error **errp)
>>   {
>>   const VFIOIOMMUClass *ops =
>>
>VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
>
>
>I think 

RE: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Gonglei (Arei)
Hello,

> -Original Message-
> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Monday, May 6, 2024 11:18 PM
> To: Gonglei (Arei) 
> Cc: Daniel P. Berrangé ; Markus Armbruster
> ; Michael Galaxy ; Yu Zhang
> ; Zhijian Li (Fujitsu) ; Jinpu Wang
> ; Elmar Gerdes ;
> qemu-devel@nongnu.org; Yuval Shaia ; Kevin Wolf
> ; Prasanna Kumar Kalever
> ; Cornelia Huck ;
> Michael Roth ; Prasanna Kumar Kalever
> ; integrat...@gluster.org; Paolo Bonzini
> ; qemu-bl...@nongnu.org; de...@lists.libvirt.org;
> Hanna Reitz ; Michael S. Tsirkin ;
> Thomas Huth ; Eric Blake ; Song
> Gao ; Marc-André Lureau
> ; Alex Bennée ;
> Wainer dos Santos Moschetta ; Beraldo Leal
> ; Pannengyuan ;
> Xiexiangyou 
> Subject: Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
> 
> On Mon, May 06, 2024 at 02:06:28AM +, Gonglei (Arei) wrote:
> > Hi, Peter
> 
> Hey, Lei,
> 
> Happy to see you around again after years.
> 
Haha, me too.

> > RDMA features high bandwidth, low latency (in non-blocking lossless
> > network), and direct remote memory access by bypassing the CPU (As you
> > know, CPU resources are expensive for cloud vendors, which is one of
> > the reasons why we introduced offload cards.), which TCP does not have.
> 
> It's another cost to use offload cards, v.s. preparing more cpu resources?
> 
A converged software and hardware offload architecture is the way to go for
all cloud vendors (considering the combined benefits in performance, cost,
security, and innovation speed); it's not just a matter of adding the
resources of a DPU card.

> > In some scenarios where fast live migration is needed (extremely short
> > interruption duration and migration duration) is very useful. To this
> > end, we have also developed RDMA support for multifd.
> 
> Will any of you upstream that work?  I'm curious how intrusive would it be
> when adding it to multifd, if it can keep only 5 exported functions like what
> rdma.h does right now it'll be pretty nice.  We also want to make sure it 
> works
> with arbitrary sized loads and buffers, e.g. vfio is considering to add IO 
> loads to
> multifd channels too.
> 

In fact, we sent the patchset to the community in 2021. Pls see:
https://lore.kernel.org/all/20210203185906.GT2950@work-vm/T/


> One thing to note that the question here is not about a pure performance
> comparison between rdma and nics only.  It's about help us make a decision
> on whether to drop rdma, iow, even if rdma performs well, the community still
> has the right to drop it if nobody can actively work and maintain it.
> It's just that if nics can perform as good it's more a reason to drop, unless
> companies can help to provide good support and work together.
> 

We are happy to provide the necessary review and maintenance work for RDMA
if the community needs it.

CC'ing Chuan Zheng.


Regards,
-Gonglei



Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation

2024-05-06 Thread Sean Christopherson
On Mon, May 06, 2024, Mickaël Salaün wrote:
> On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote:
> > > ---
> > > 
> > > Changes since v1:
> > > * New patch. Making user space aware of Heki properties was requested by
> > >   Sean Christopherson.
> > 
> > No, I suggested having userspace _control_ the pinning[*], not merely be 
> > notified
> > of pinning.
> > 
> >  : IMO, manipulation of protections, both for memory (this patch) and CPU 
> > state
> >  : (control registers in the next patch) should come from userspace.  I 
> > have no
> >  : objection to KVM providing plumbing if necessary, but I think userspace 
> > needs to
> >  : to have full control over the actual state.
> >  : 
> >  : One of the things that caused Intel's control register pinning series to 
> > stall
> >  : out was how to handle edge cases like kexec() and reboot.  Deferring to 
> > userspace
> >  : means the kernel doesn't need to define policy, e.g. when to unprotect 
> > memory,
> >  : and avoids questions like "should userspace be able to overwrite pinned 
> > control
> >  : registers".
> >  : 
> >  : And like the confidential VM use case, keeping userspace in the loop is 
> > a big
> >  : beneifit, e.g. the guest can't circumvent protections by coercing 
> > userspace into
> >  : writing to protected memory.
> > 
> > I stand by that suggestion, because I don't see a sane way to handle things 
> > like
> > kexec() and reboot without having a _much_ more sophisticated policy than 
> > would
> > ever be acceptable in KVM.
> > 
> > I think that can be done without KVM having any awareness of CR pinning 
> > whatsoever.
> > E.g. userspace just needs to ability to intercept CR writes and inject 
> > #GPs.  Off
> > the cuff, I suspect the uAPI could look very similar to MSR filtering.  
> > E.g. I bet
> > userspace could enforce MSR pinning without any new KVM uAPI at all.
> > 
> > [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com
> 
> OK, I had concern about the control not directly coming from the guest,
> especially in the case of pKVM and confidential computing, but I get you

Hardware-based CoCo is completely out of scope, because KVM has zero visibility
into the guest (well, SNP technically allows trapping CR0/CR4, but KVM really
shouldn't intercept CR0/CR4 for SNP guests).

And more importantly, _KVM_ doesn't define any policies for CoCo VMs.  KVM might
help enforce policies that are defined by hardware/firmware, but KVM doesn't
define any of its own.

If pKVM on x86 comes along, then KVM will likely get in the business of defining
policy, but until that happens, KVM needs to stay firmly out of the picture.

> point.  It should indeed be quite similar to the MSR filtering on the
> userspace side, except that we need another interface for the guest to
> request such change (i.e. self-protection).
> 
> Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but
> forward the request to userspace with a VM exit instead?  That would
> also enable userspace to get the request and directly configure the CR
> pinning with the same VM exit.

No?  Maybe?  I strongly suspect that full support will need a richer set of APIs
than a single hypercall.  E.g. to handle kexec(), suspend+resume, emulated SMM,
and so on and so forth.  And that's just for CR pinning.

And hypercalls are hampered by the fact that VMCALL/VMMCALL don't allow for
delegation or restriction, i.e. there's no way for the guest to communicate to
the hypervisor that a less privileged component is allowed to perform some
action, nor is there a way for the guest to say some chunk of CPL0 code *isn't*
allowed to request transition.  Delegation and restriction all have to be done
out-of-band.

It'd probably be more annoying to set up initially, but I think a synthetic
device with an MMIO-based interface would be more powerful and flexible in the
long run.  Then userspace can evolve without needing to wait for KVM to catch up.

Actually, potential bad/crazy idea.  Why does the _host_ need to define policy?
Linux already knows what assets it wants to (un)protect and when.  What's missing
is a way for the guest kernel to effectively deprivilege and re-authenticate
itself as needed.  We've been tossing around the idea of paired VMs+vCPUs to
support VTLs and SEV's VMPLs, what if we usurped/piggybacked those ideas, with a
bit of pKVM mixed in?

Borrowing VTL terminology, where VTL0 is the least privileged, userspace launches
the VM at VTL0.  At some point, the guest triggers the deprivileging sequence and
userspace creates VTL1.  Userspace also provides a way for VTL0 to restrict access
to its memory, e.g. to effectively make the page tables for the kernel's direct map
writable only from VTL1, to make kernel text RO (or XO), etc.  And VTL0 could then
also completely remove its access to code that changes CR0/CR4.

It would obviously require a _lot_ more upfront work, e.g. to isolate the kernel
text that modifies CR0/CR4 so that it can be removed 

Re: [PULL 3/5] hw/loongarch: Add numa support

2024-05-06 Thread gaosong

在 2024/5/3 下午8:50, Peter Maydell 写道:

On Fri, 16 Jun 2023 at 11:03, Song Gao  wrote:

From: Tianrui Zhao 

1. Implement some functions for LoongArch numa support;
2. Implement fdt_add_memory_node() for fdt;
3. build_srat() fills node_id and adds build numa memory.

Reviewed-by: Song Gao 
Signed-off-by: Tianrui Zhao 
Signed-off-by: Song Gao 
Message-Id: <20230613122613.2471743-1-zhaotian...@loongson.cn>

Hi; Coverity has pointed out a memory leak in this commit
(CID 1544773):


@@ -799,17 +823,43 @@ static void loongarch_init(MachineState *machine)
  machine->possible_cpus->cpus[i].cpu = OBJECT(cpu);
  }
  fdt_add_cpu_nodes(lams);
-/* Add memory region */
-memory_region_init_alias(&lams->lowmem, NULL, "loongarch.lowram",
- machine->ram, 0, 256 * MiB);
-memory_region_add_subregion(address_space_mem, offset, &lams->lowmem);
-offset += 256 * MiB;
-memmap_add_entry(0, 256 * MiB, 1);
-highram_size = ram_size - 256 * MiB;
-memory_region_init_alias(&lams->highmem, NULL, "loongarch.highmem",
- machine->ram, offset, highram_size);
-memory_region_add_subregion(address_space_mem, 0x90000000, &lams->highmem);
-memmap_add_entry(0x90000000, highram_size, 1);
+
+/* Node0 memory */
+memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
+fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0);
+memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
+ machine->ram, offset, VIRT_LOWMEM_SIZE);
+memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
+
+offset += VIRT_LOWMEM_SIZE;
+if (nb_numa_nodes > 0) {
+assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE);
+highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE;
+} else {
+highram_size = ram_size - VIRT_LOWMEM_SIZE;
+}
+phyAddr = VIRT_HIGHMEM_BASE;
+memmap_add_entry(phyAddr, highram_size, 1);
+fdt_add_memory_node(machine, phyAddr, highram_size, 0);
+memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
+  machine->ram, offset, highram_size);
+memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
+
+/* Node1 - Nodemax memory */
+offset += highram_size;
+phyAddr += highram_size;
+
+for (i = 1; i < nb_numa_nodes; i++) {
+MemoryRegion *nodemem = g_new(MemoryRegion, 1);
+ramName = g_strdup_printf("loongarch.node%d.ram", i);
+memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
+ offset,  numa_info[i].node_mem);
+memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
+memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
+fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i);
+offset += numa_info[i].node_mem;
+phyAddr += numa_info[i].node_mem;

In this loop, we allocate memory via g_strdup_printf(),
but never free it. The nicest fix for this is to use the
g_autofree mechanism so that the memory is automatically
freed when execution reaches the end of the block:
g_autofree ramName = g_strdup_printf("", ...);
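
For illustration, the loop body with the suggested g_autofree change might
look like this (a sketch based on the hunk quoted above, not necessarily the
final committed fix):

    for (i = 1; i < nb_numa_nodes; i++) {
        MemoryRegion *nodemem = g_new(MemoryRegion, 1);
        /* Freed automatically when this iteration's scope ends. */
        g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i);

        memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
                                 offset, numa_info[i].node_mem);
        memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
        memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
        fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i);
        offset += numa_info[i].node_mem;
        phyAddr += numa_info[i].node_mem;
    }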

Thank you.   I will fix it.

Thanks
Song Gao

thanks
-- PMM





Re: [PATCH v3 1/5] hw/loongarch: Rename LOONGARCH_MACHINE with VIRT_MACHINE

2024-05-06 Thread maobibo




On 2024/5/6 下午2:09, maobibo wrote:



On 2024/5/6 下午12:24, Thomas Huth wrote:

On 06/05/2024 05.02, Bibo Mao wrote:

On LoongArch systems there is only the virt machine type for now; the name
LOONGARCH_MACHINE is confusing, so rename it to VIRT_MACHINE. Machine names
for other real hardware boards can be added in the future.

Signed-off-by: Bibo Mao 
---

...
@@ -1245,7 +1244,7 @@ static void loongarch_class_init(ObjectClass 
*oc, void *data)

  static const TypeInfo loongarch_machine_types[] = {
  {
-    .name   = TYPE_LOONGARCH_MACHINE,
+    .name   = TYPE_VIRT_MACHINE,
  .parent = TYPE_MACHINE,
  .instance_size  = sizeof(LoongArchMachineState),
  .class_init = loongarch_class_init,
diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h
index 4e14bf6060..5ea2f0370d 100644
--- a/include/hw/loongarch/virt.h
+++ b/include/hw/loongarch/virt.h
@@ -73,8 +73,8 @@ struct LoongArchMachineState {
  struct loongarch_boot_info bootinfo;
  };
-#define TYPE_LOONGARCH_MACHINE  MACHINE_TYPE_NAME("virt")
-OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE)
+#define TYPE_VIRT_MACHINE  MACHINE_TYPE_NAME("virt")
+OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, VIRT_MACHINE)
  bool loongarch_is_acpi_enabled(LoongArchMachineState *lams);
  void loongarch_acpi_setup(LoongArchMachineState *lams);
  #endif


  Hi,

there are currently some efforts going on to make it possible to link a
single QEMU binary that contains all targets. Since we already have a
TYPE_VIRT_MACHINE for other targets, I wonder whether it might be better
to use LOONGARCH_VIRT_MACHINE rather than just VIRT_MACHINE here?
Philippe, could you comment on this?


It is great if there is one QEMU binary which supports different 
targets. And LOONGARCH_VIRT_MACHINE is ok for me.

Hi Thomas, Philippe,

Does the machine name "virt" need to be changed if LOONGARCH_VIRT_MACHINE is
used? There will be compatibility issues if the "virt" machine type is no
longer supposed to be used.


However, the CPU type "max" is not widely used now. For a single QEMU binary
that supports different targets, could we select the architecture from the
CPU type rather than the machine type?


Regards
Bibo Mao



Regards
Bibo Mao


  Thomas







RE: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways

2024-05-06 Thread Xingtao Yao (Fujitsu)



> -Original Message-
> From: Jonathan Cameron 
> Sent: Tuesday, April 30, 2024 10:43 PM
> To: Yao, Xingtao/姚 幸涛 
> Cc: fan...@samsung.com; qemu-devel@nongnu.org
> Subject: Re: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
> 
> On Wed, 24 Apr 2024 01:36:56 +
> "Xingtao Yao (Fujitsu)"  wrote:
> 
> > ping.
> >
> > > -Original Message-
> > > From: Yao Xingtao 
> > > Sent: Sunday, April 7, 2024 11:07 AM
> > > To: jonathan.came...@huawei.com; fan...@samsung.com
> > > Cc: qemu-devel@nongnu.org; Yao, Xingtao/姚 幸涛
> 
> > > Subject: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
> > >
> > > Since the kernel does not check the interleave capability, a
> > > 3-way, 6-way, 12-way or 16-way region can be created normally.
> > >
> > > Applications can access the memory of a 16-way region normally because
> > > QEMU can convert HPA to DPA correctly for power-of-2 interleave ways.
> > > Once the kernel implements the check, this kind of region will no
> > > longer be created.
> > >
> > > For non-power-of-2 interleave ways, applications cannot access the
> > > memory normally and may hit unexpected behaviors, such as a
> > > segmentation fault.
> > >
> > > So implementing this feature is needed.
> > >
> > > Link:
> > >
> https://lore.kernel.org/linux-cxl/3e84b919-7631-d1db-3e1d-33000f3f3868@fujits
> > > u.com/
> > > Signed-off-by: Yao Xingtao 
> > > ---
> > >  hw/mem/cxl_type3.c | 18 ++
> > >  1 file changed, 14 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> > > index b0a7e9f11b..d6ef784e96 100644
> > > --- a/hw/mem/cxl_type3.c
> > > +++ b/hw/mem/cxl_type3.c
> > > @@ -805,10 +805,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d,
> hwaddr
> > > host_addr, uint64_t *dpa)
> > >  continue;
> > >  }
> > >
> > > -*dpa = dpa_base +
> > > -((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > - ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
> > > -  >> iw));
> > > +if (iw < 8) {
> > > +*dpa = dpa_base +
> > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > + ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
> > > +  >> iw));
> > > +} else {
> > > +*dpa = dpa_base +
> > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > + ((((MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset)
> > > +   >> (ig + iw)) / 3) << (ig + 8)));
> > > +}
> > >
> > >  return true;
> > >  }
> > > @@ -906,6 +913,9 @@ static void ct3d_reset(DeviceState *dev)
> > >  uint32_t *write_msk =
> ct3d->cxl_cstate.crb.cache_mem_regs_write_mask;
> > >
> > >  cxl_component_register_init_common(reg_state, write_msk,
> > > CXL2_TYPE3_DEVICE);
> > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 1);
> > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 1);
> > > +
> 
> Why here rather than in hdm_reg_init_common()?
> It's constant data and is currently being set to 0 in there.

according to the CXL specification (8.2.4.20.1 CXL HDM Decoder Capability
Register (Offset 00h)), this feature is only applicable to CXL.mem; upstream
switch ports and CXL host bridges shall hardwire these bits to 0.

So I think it would be more appropriate to set these bits here.

> 
> > >  cxl_device_register_init_t3(ct3d);
> > >
> > >  /*
> > > --
> > > 2.37.3
> >
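
As an aside, the 3-way decode that the patch expresses with MAKE_64BIT_MASK()
and a divide by 3 is easier to see as plain round-robin arithmetic.  A small
self-contained sketch with example parameters (256-byte granularity, 3 ways;
illustrative only, not the device emulation code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t gran = 256, ways = 3;

        for (uint64_t hpa_offset = 0; hpa_offset < 8 * gran; hpa_offset += gran) {
            uint64_t chunk = hpa_offset / gran;           /* interleave chunk index */
            uint64_t target = chunk % ways;               /* device owning the chunk */
            uint64_t dpa_offset = (chunk / ways) * gran   /* chunks packed per device */
                                  + (hpa_offset % gran);  /* offset within the chunk */

            printf("hpa 0x%" PRIx64 " -> target %" PRIu64 ", dpa 0x%" PRIx64 "\n",
                   hpa_offset, target, dpa_offset);
        }
        return 0;
    }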




RE: [PATCH V8 6/8] physmem: Add helper function to destroy CPU AddressSpace

2024-05-06 Thread Salil Mehta via
>  From: Peter Maydell 
>  Sent: Monday, May 6, 2024 10:29 AM
>  To: Salil Mehta 
>  
>  On Mon, 6 May 2024 at 10:06, Salil Mehta 
>  wrote:
>  >
>  > Hi Peter,
>  >
>  > Thanks for the review.
>  >
>  > >  From: Peter Maydell
>  > >
>  > >  When do we need to destroy a single address space in this way that
>  > > means we need to keep a count of how many ASes the CPU currently has?
>  > > The commit message talks about the case when we unrealize the whole CPU
>  > > object, but in that situation you can just throw away all the ASes at
>  > > once (eg by calling some cpu_destroy_address_spaces() function from
>  > > cpu_common_unrealizefn()).
>  >
>  >
>  > Yes, maybe, we can destroy all at once from the common leg as well. I'd
>  > prefer this to be done from the arch-specific function for ARM to
>  > maintain the clarity & symmetry of the initialization and
>  > un-initialization legs. For now, all of this address space destruction
>  > happens in the context of arm_cpu_unrealizefn().
>  >
>  > It's a kind of trade-off between a little more code and clarity, but I'm
>  > open to further suggestions.
>  >
>  >
>  > >
>  > >  Also, if we're leaking stuff here by failing to destroy it, is that
>  > > a problem for  existing CPU types like x86 that we can already hotplug?
>  >
>  > No we are not. We are taking care of these in the ARM arch specific
>  > legs within functions arm_cpu_(un)realizefn().
>  
>  How can you be taking care of *x86* CPU types in the Arm unrealize?


Sorry, yes, I failed to reply to that clearly. There was indeed a leak with x86,
reported by Philippe/David last year. In fact, Philippe floated a patch for it
last year. I thought it had already been fixed as part of cpu_common_unrealize(),
but I just checked and realized that the change proposed below still isn't part
of mainline:

https://lore.kernel.org/qemu-devel/20230918160257.30127-9-phi...@linaro.org/

I can definitely add a common CPU AddressSpace destruction leg as part of this
patch, in case the arch-specific CPU unrealize does not clean up its CPU
AddressSpaces.
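
For what it's worth, the common-leg cleanup Peter suggests could be as small
as the sketch below.  This is hypothetical code, not part of this series; it
assumes the per-index destroy helper this series introduces and the existing
num_ases bookkeeping in CPUState:

    /* Hypothetical: tear down every AddressSpace a CPU registered, called
     * from cpu_common_unrealizefn(). */
    static void cpu_destroy_address_spaces(CPUState *cpu)
    {
        for (int i = 0; i < cpu->num_ases; i++) {
            cpu_address_space_destroy(cpu, i);
        }
        cpu->num_ases = 0;
    }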


Thanks
Salil.

>  
>  thanks
>  -- PMM


Re: [PATCH V1 06/26] migration: precreate vmstate for exec

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Provide migration_precreate_save for saving precreate vmstate across exec.
> Create a memfd, save its value in the environment, and serialize state
> to it.  Reverse the process in migration_precreate_load.
>
> Signed-off-by: Steve Sistare 
> ---
>  include/migration/misc.h |   5 ++
>  migration/meson.build|   1 +
>  migration/precreate.c| 139 
> +++
>  3 files changed, 145 insertions(+)
>  create mode 100644 migration/precreate.c
>
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index c9e200f..cf30351 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -56,6 +56,11 @@ AnnounceParameters *migrate_announce_params(void);
>  
>  void dump_vmstate_json_to_file(FILE *out_fp);
>  
> +/* migration/precreate.c */
> +int migration_precreate_save(Error **errp);
> +void migration_precreate_unsave(void);
> +int migration_precreate_load(Error **errp);
> +
>  /* migration/migration.c */
>  void migration_object_init(void);
>  void migration_shutdown(void);
> diff --git a/migration/meson.build b/migration/meson.build
> index f76b1ba..50e7cb2 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -26,6 +26,7 @@ system_ss.add(files(
>'ram-compress.c',
>'options.c',
>'postcopy-ram.c',
> +  'precreate.c',
>'savevm.c',
>'socket.c',
>'tls.c',
> diff --git a/migration/precreate.c b/migration/precreate.c
> new file mode 100644
> index 000..0bf5e1f
> --- /dev/null
> +++ b/migration/precreate.c
> @@ -0,0 +1,139 @@
> +/*
> + * Copyright (c) 2022, 2024 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> +#include "qemu/memfd.h"
> +#include "qapi/error.h"
> +#include "io/channel-file.h"
> +#include "migration/misc.h"
> +#include "migration/qemu-file.h"
> +#include "migration/savevm.h"
> +
> +#define PRECREATE_STATE_NAME "QEMU_PRECREATE_STATE"
> +
> +static QEMUFile *qemu_file_new_fd_input(int fd, const char *name)
> +{
> +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd);
> +QIOChannel *ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(ioc, name);
> +return qemu_file_new_input(ioc);
> +}
> +
> +static QEMUFile *qemu_file_new_fd_output(int fd, const char *name)
> +{
> +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd);
> +QIOChannel *ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(ioc, name);
> +return qemu_file_new_output(ioc);
> +}
> +
> +static int memfd_create_named(const char *name, Error **errp)
> +{
> +int mfd;
> +char val[16];
> +
> +mfd = memfd_create(name, 0);
> +if (mfd < 0) {
> +error_setg_errno(errp, errno, "memfd_create failed");
> +return -1;
> +}
> +
> +/* Remember mfd in environment for post-exec load */
> +qemu_clear_cloexec(mfd);
> +snprintf(val, sizeof(val), "%d", mfd);
> +g_setenv(name, val, 1);
> +
> +return mfd;
> +}
> +
> +static int memfd_find_named(const char *name, int *mfd_p, Error **errp)
> +{
> +const char *val = g_getenv(name);
> +
> +if (!val) {
> +*mfd_p = -1;
> +return 0;   /* No memfd was created, not an error */
> +}
> +g_unsetenv(name);
> +if (qemu_strtoi(val, NULL, 10, mfd_p)) {
> +error_setg(errp, "Bad %s env value %s", PRECREATE_STATE_NAME, val);
> +return -1;
> +}
> +lseek(*mfd_p, 0, SEEK_SET);
> +return 0;
> +}
> +
> +static void memfd_delete_named(const char *name)
> +{
> +int mfd;
> +const char *val = g_getenv(name);
> +
> +if (val) {
> +g_unsetenv(name);
> +if (!qemu_strtoi(val, NULL, 10, &mfd)) {
> +close(mfd);
> +}
> +}
> +}
> +
> +static QEMUFile *qemu_file_new_memfd_output(const char *name, Error **errp)
> +{
> +int mfd = memfd_create_named(name, errp);
> +
> +if (mfd < 0) {
> +return NULL;
> +}
> +
> +return qemu_file_new_fd_output(mfd, name);
> +}
> +
> +static QEMUFile *qemu_file_new_memfd_input(const char *name, Error **errp)
> +{
> +int ret, mfd;
> +
> +ret = memfd_find_named(name, &mfd, errp);
> +if (ret || mfd < 0) {
> +return NULL;
> +}
> +
> +return qemu_file_new_fd_input(mfd, name);
> +}
> +
> +int migration_precreate_save(Error **errp)
> +{
> +QEMUFile *f = qemu_file_new_memfd_output(PRECREATE_STATE_NAME, errp);
> +
> +if (!f) {
> +return -1;
> +} else if (qemu_savevm_precreate_save(f, errp)) {
> +memfd_delete_named(PRECREATE_STATE_NAME);
> +return -1;
> +} else {
> +/* Do not close f, as mfd must remain open. */
> +return 0;
> +}
> +}
> +
> +void migration_precreate_unsave(void)
> +{
> +memfd_delete_named(PRECREATE_STATE_NAME);
> +}
> +
> +int migration_precreate_load(Error **errp)
> +{
> +   

Re: [PATCH V1 01/26] oslib: qemu_clear_cloexec

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

+cc dgilbert, marcandre

> Define qemu_clear_cloexec, analogous to qemu_set_cloexec.
>
> Signed-off-by: Steve Sistare 
> Reviewed-by: Dr. David Alan Gilbert 
> Reviewed-by: Marc-André Lureau 

A v1 patch with two reviews already, from people from another company
and they're not in CC. Looks suspicious. =)

Here's a fresh one, hopefully it won't spend another 4 years in the
drawer:

Reviewed-by: Fabiano Rosas 

> ---
>  include/qemu/osdep.h | 9 +
>  util/oslib-posix.c   | 9 +
>  util/oslib-win32.c   | 4 
>  3 files changed, 22 insertions(+)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index c7053cd..b58f312 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -660,6 +660,15 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t 
> count)
>  
>  void qemu_set_cloexec(int fd);
>  
> +/*
> + * Clear FD_CLOEXEC for a descriptor.
> + *
> + * The caller must guarantee that no other fork+exec's occur before the
> + * exec that is intended to inherit this descriptor, eg by suspending CPUs
> + * and blocking monitor commands.
> + */
> +void qemu_clear_cloexec(int fd);
> +
>  /* Return a dynamically allocated directory path that is appropriate for 
> storing
>   * local state.
>   *
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index e764416..614c3e5 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -272,6 +272,15 @@ int qemu_socketpair(int domain, int type, int protocol, 
> int sv[2])
>  return ret;
>  }
>  
> +void qemu_clear_cloexec(int fd)
> +{
> +int f;
> +f = fcntl(fd, F_GETFD);
> +assert(f != -1);
> +f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC);
> +assert(f != -1);
> +}
> +
>  char *
>  qemu_get_local_state_dir(void)
>  {
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index b623830..c3e969a 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -222,6 +222,10 @@ void qemu_set_cloexec(int fd)
>  {
>  }
>  
> +void qemu_clear_cloexec(int fd)
> +{
> +}
> +
>  int qemu_get_thread_id(void)
>  {
>  return GetCurrentThreadId();



Re: [PATCH V1 03/26] migration: SAVEVM_FOREACH

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Define an abstraction SAVEVM_FOREACH to loop over all savevm state
> handlers, and replace QTAILQ_FOREACH.  Define variants for ALL so
> we can loop over all handlers vs a subset of handlers in a subsequent
> patch, but at this time there is no distinction between the two.
> No functional change.
>
> Signed-off-by: Steve Sistare 
> ---
>  migration/savevm.c | 55 
> +++---
>  1 file changed, 32 insertions(+), 23 deletions(-)
>
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 4509482..6829ba3 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -237,6 +237,15 @@ static SaveState savevm_state = {
>  .global_section_id = 0,
>  };
>  
> +#define SAVEVM_FOREACH(se, entry)\
> +QTAILQ_FOREACH(se, &savevm_state.handlers, entry)\
> +
> +#define SAVEVM_FOREACH_ALL(se, entry)\
> +QTAILQ_FOREACH(se, &savevm_state.handlers, entry)

This feels worse than SAVEVM_FOREACH_NOT_PRECREATED. We'll have to keep
coming back to the definition to figure out which FOREACH is the real
deal.

> +
> +#define SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se)   \
> +QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se)
> +
>  static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id);
>  
>  static bool should_validate_capability(int capability)
> @@ -674,7 +683,7 @@ static uint32_t calculate_new_instance_id(const char 
> *idstr)
>  SaveStateEntry *se;
>  uint32_t instance_id = 0;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH_ALL(se, entry) {

In this patch we can't have both instances...

>  if (strcmp(idstr, se->idstr) == 0
>  && instance_id <= se->instance_id) {
>  instance_id = se->instance_id + 1;
> @@ -690,7 +699,7 @@ static int calculate_compat_instance_id(const char *idstr)
>  SaveStateEntry *se;
>  int instance_id = 0;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {

...otherwise one of the two changes will go undocumented because the
actual reason for it will only be described in the next patch.

>  if (!se->compat) {
>  continue;
>  }
> @@ -816,7 +825,7 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, 
> void *opaque)
>  }
>  pstrcat(id, sizeof(id), idstr);
>  
> -QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
> +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) {
>  if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
>  savevm_state_handler_remove(se);
>  g_free(se->compat);
> @@ -939,7 +948,7 @@ void vmstate_unregister(VMStateIf *obj, const 
> VMStateDescription *vmsd,
>  {
>  SaveStateEntry *se, *new_se;
>  
> -QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
> +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) {
>  if (se->vmsd == vmsd && se->opaque == opaque) {
>  savevm_state_handler_remove(se);
>  g_free(se->compat);
> @@ -1223,7 +1232,7 @@ bool qemu_savevm_state_blocked(Error **errp)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->unmigratable) {
>  error_setg(errp, "State blocked by non-migratable device '%s'",
> se->idstr);
> @@ -1237,7 +1246,7 @@ void qemu_savevm_non_migratable_list(strList **reasons)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->unmigratable) {
>  QAPI_LIST_PREPEND(*reasons,
>g_strdup_printf("non-migratable device: %s",
> @@ -1276,7 +1285,7 @@ bool qemu_savevm_state_guest_unplug_pending(void)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->dev_unplug_pending &&
>  se->vmsd->dev_unplug_pending(se->opaque)) {
>  return true;
> @@ -1291,7 +1300,7 @@ int qemu_savevm_state_prepare(Error **errp)
>  SaveStateEntry *se;
>  int ret;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (!se->ops || !se->ops->save_prepare) {
>  continue;
>  }
> @@ -1321,7 +1330,7 @@ int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
>  json_writer_start_array(ms->vmdesc, "devices");
>  
>  trace_savevm_state_setup();
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->early_setup) {
>  ret = vmstate_save(f, se, ms->vmdesc, errp);
>  if (ret) {
> @@ -1365,7 +1374,7 @@ int qemu_savevm_state_resume_prepare(MigrationState *s)
>  
>  

Re: [PATCH v2 04/33] accel/tcg: Reorg translator_ld*

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

Reorg translator_access into translator_ld, with a more
memcpy-ish interface.  If both pages are in ram, do not
go through the caller's slow path.

Assert that the access is within the two pages that we are
prepared to protect, per TranslationBlock.  Allow access
prior to pc_first, so long as it is within the first page.

Signed-off-by: Richard Henderson 
---
  accel/tcg/translator.c | 189 ++---
  1 file changed, 101 insertions(+), 88 deletions(-)




  uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
  {
-uint64_t ret, plug;
-void *p = translator_access(env, db, pc, sizeof(ret));
+uint64_t raw, tgt;
  
-if (p) {

-plugin_insn_append(pc, p, sizeof(ret));
-return ldq_p(p);
+if (translator_ld(env, db, &raw, pc, sizeof(raw))) {
+tgt = tswap64(raw);
+} else {
+tgt = cpu_ldl_code(env, pc);


cpu_ldq_code() ?


+raw = tswap64(tgt);
  }
-ret = cpu_ldq_code(env, pc);
-plug = tswap64(ret);
-plugin_insn_append(pc, &plug, sizeof(ret));
-return ret;
+plugin_insn_append(pc, &raw, sizeof(raw));
+return tgt;
  }






Re: [PATCH v2 32/33] target/s390x: Use translator_lduw in get_next_pc

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  target/s390x/tcg/translate.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 





Re: [PATCH v2 15/33] plugins: Merge alloc_tcg_plugin_context into plugin_gen_tb_start

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

We don't need to allocate plugin context at startup,
we can wait until we actually use it.

Signed-off-by: Richard Henderson 
---
  accel/tcg/plugin-gen.c | 36 
  tcg/tcg.c  | 11 ---
  2 files changed, 20 insertions(+), 27 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 02/57] target/arm: Split out gengvec64.c

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 03:03, Richard Henderson wrote:

Split some routines out of translate-a64.c and translate-sve.c
that are used by both.

Signed-off-by: Richard Henderson 
---
  target/arm/tcg/translate-a64.h |   4 +
  target/arm/tcg/gengvec64.c | 190 +
  target/arm/tcg/translate-a64.c |  26 -
  target/arm/tcg/translate-sve.c | 145 +
  target/arm/tcg/meson.build |   1 +
  5 files changed, 197 insertions(+), 169 deletions(-)
  create mode 100644 target/arm/tcg/gengvec64.c


Reviewed using git-diff --color-moved=dimmed-zebra

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 01/57] target/arm: Split out gengvec.c

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 03:03, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  target/arm/tcg/translate.h |5 +
  target/arm/tcg/gengvec.c   | 1612 
  target/arm/tcg/translate.c | 1588 ---
  target/arm/tcg/meson.build |1 +
  4 files changed, 1618 insertions(+), 1588 deletions(-)
  create mode 100644 target/arm/tcg/gengvec.c


Reviewed using git-diff --color-moved=dimmed-zebra

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 22:23, Richard Henderson wrote:

We already build the linux-user binaries with
ubuntu-22.04-s390x-all-linux, so there's no need to do it again.

Signed-off-by: Richard Henderson 
---
  .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH V1 04/26] migration: delete unused parameter mis

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Signed-off-by: Steve Sistare 

Reviewed-by: Fabiano Rosas 



Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server

2024-05-06 Thread Mattias Nissler
On Mon, May 6, 2024 at 4:44 PM Peter Xu  wrote:

> On Thu, Mar 28, 2024 at 08:53:36AM +0100, Mattias Nissler wrote:
> > Stefan, to the best of my knowledge this is fully reviewed and ready
> > to go in - can you kindly pick it up or advise in case there's
> > something I missed? Thanks!
>
> Fails cross-compile on mipsel:
>
> https://gitlab.com/peterx/qemu/-/jobs/6787790601


Ah, bummer, thanks for reporting. 4GB of bounce buffer should be plenty, so
switching to 32 bit atomics seems a good idea at first glance. I'll take a
closer look tomorrow and send a respin with a fix.


Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server

2024-05-06 Thread Mattias Nissler
On Mon, May 6, 2024 at 5:01 PM Stefan Hajnoczi  wrote:

> On Thu, 28 Mar 2024 at 03:54, Mattias Nissler 
> wrote:
> >
> > Stefan, to the best of my knowledge this is fully reviewed and ready
> > to go in - can you kindly pick it up or advise in case there's
> > something I missed? Thanks!
>
> This code is outside the areas that I maintain. I think it would make
> sense for Jag to merge it and send a pull request as vfio-user
> maintainer.


OK, thanks for following up, I'll check with Jag.


[PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system

2024-05-06 Thread Richard Henderson
We already build the linux-user binaries with
ubuntu-22.04-s390x-all-linux, so there's no need to do it again.

Signed-off-by: Richard Henderson 
---
 .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 3a2b1e1d24..c3f16d77bb 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -21,7 +21,7 @@ ubuntu-22.04-s390x-all-linux:
  - make --output-sync check-tcg
  - make --output-sync -j`nproc` check
 
-ubuntu-22.04-s390x-all:
+ubuntu-22.04-s390x-all-system:
  extends: .custom_runner_template
  needs: []
  stage: build
@@ -35,7 +35,7 @@ ubuntu-22.04-s390x-all:
  script:
  - mkdir build
  - cd build
- - ../configure --disable-libssh
+ - ../configure --disable-libssh --disable-user
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc`
  - make --output-sync -j`nproc` check
-- 
2.34.1




[PATCH] gitlab: Drop --static from s390x linux-user build

2024-05-06 Thread Richard Henderson
The host does not have the correct libraries installed for static pie,
which causes host/guest address space interference for some tests.
There's no real gain from linking statically, so drop it.

Signed-off-by: Richard Henderson 
---
Per my suggestion in

https://lore.kernel.org/qemu-devel/50c27a9f-fd75-4f8e-9a2d-488d8df4f...@linaro.org


r~
---
 .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 105981879f..3a2b1e1d24 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -2,7 +2,7 @@
 # setup by the scripts/ci/setup/build-environment.yml task
 # "Install basic packages to build QEMU on Ubuntu 22.04"
 
-ubuntu-22.04-s390x-all-linux-static:
+ubuntu-22.04-s390x-all-linux:
  extends: .custom_runner_template
  needs: []
  stage: build
@@ -15,7 +15,7 @@ ubuntu-22.04-s390x-all-linux-static:
  script:
  - mkdir build
  - cd build
- - ../configure --enable-debug --static --disable-system
+ - ../configure --enable-debug-tcg --disable-system --disable-tools 
--disable-docs
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc`
  - make --output-sync check-tcg
-- 
2.34.1




Re: qemu-img cache modes with Linux cgroup v1

2024-05-06 Thread Alex Kalenyuk
Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed
O_DIRECT is now supported on tmpfs (as opposed to our CI which runs Centos
9 Stream).
`qemu-img convert -t none -O raw tests/images/cirros-qcow2.img
/tmp/cirros.raw`
where /tmp is indeed a tmpfs.

I might be missing something so feel free to call that out

On Tue, Aug 1, 2023 at 6:38 PM Stefan Hajnoczi  wrote:

> Hi Daniel,
> I agree with your points.
>
> Stefan
>


Re: [PULL 00/12] qemu-sparc queue 20240506

2024-05-06 Thread Richard Henderson

On 5/6/24 04:44, Mark Cave-Ayland wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/mcayland/qemu.git  tags/qemu-sparc-20240506

for you to fetch changes up to d6f898cf85c92389182d22f0bcc3a11d7194fc94:

   target/sparc: Split out do_ms16b (2024-05-05 21:02:48 +0100)


qemu-sparc queue
- Default to modern virtio with iommu_platform enabled for sun4u
- Fixes for various VIS instructions from Richard
- CPU name updates from Thomas


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 0/7] QAPI patches patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 04:02, Markus Armbruster wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://repo.or.cz/qemu/armbru.git  tags/pull-qapi-2024-05-06

for you to fetch changes up to 285a8f209af7b4992aa91e8bea03a303fb6406ab:

   qapi: Simplify QAPISchemaVariants @tag_member (2024-05-06 12:38:27 +0200)


QAPI patches patches for 2024-05-06


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 00/28] Accelerator patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 05:37, Philippe Mathieu-Daudé wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/philmd/qemu.git  tags/accel-next-20240506

for you to fetch changes up to c984d1d8916df8abac71325a5a135cd851b2106a:

   MAINTAINERS: Update my email address (2024-05-06 14:33:49 +0200)


Accelerator patches

- Extract page-protection definitions to page-protection.h
- Rework in accel/tcg in preparation of extracting TCG fields from CPUState
- More uses of get_task_state() in user emulation
- Xen refactors in preparation for adding multiple map caches (Juergen & Edgar)
- MAINTAINERS updates (Aleksandar and Bin)


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




[PATCH] target/sh4: Update DisasContextBase.insn_start

2024-05-06 Thread Richard Henderson
Match the extra inserts of INDEX_op_insn_start, fixing
the db->num_insns != 1 assert in translator_loop.

Fixes: dcd092a0636 ("accel/tcg: Improve can_do_io management")
Signed-off-by: Richard Henderson 
---
 target/sh4/translate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index e599ab9d1a..b3282f3ac7 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -2189,6 +2189,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State 
*env)
  */
 for (i = 1; i < max_insns; ++i) {
 tcg_gen_insn_start(pc + i * 2, ctx->envflags);
+ctx->base.insn_start = tcg_last_op();
 }
 }
 #endif
-- 
2.34.1




[PATCH 2/2] aio: warn about iohandler_ctx special casing

2024-05-06 Thread Stefan Hajnoczi
The main loop has two AioContexts: qemu_aio_context and iohandler_ctx.
The main loop runs them both, but nested aio_poll() calls on
qemu_aio_context exclude iohandler_ctx.

Which one should qemu_get_current_aio_context() return when called from
the main loop? Document that it's always qemu_aio_context.

This has subtle effects on functions that use
qemu_get_current_aio_context(). For example, aio_co_reschedule_self()
does not work when moving from iohandler_ctx to qemu_aio_context because
qemu_get_current_aio_context() does not differentiate these two
AioContexts.

Document this in order to reduce the chance of future bugs.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/aio.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/block/aio.h b/include/block/aio.h
index 8378553eb9..4ee81936ed 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -629,6 +629,9 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co);
  *
  * Move the currently running coroutine to new_ctx. If the coroutine is already
  * running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
  */
 void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
 
@@ -661,6 +664,9 @@ void aio_co_enter(AioContext *ctx, Coroutine *co);
  * If called from an IOThread this will be the IOThread's AioContext.  If
  * called from the main thread or with the "big QEMU lock" taken it
  * will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx and the
+ * return value is the main loop AioContext instead.
  */
 AioContext *qemu_get_current_aio_context(void);
 
-- 
2.45.0




[PATCH 1/2] Revert "monitor: use aio_co_reschedule_self()"

2024-05-06 Thread Stefan Hajnoczi
Commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()") was a code
cleanup that uses aio_co_reschedule_self() instead of open coding
coroutine rescheduling.

Bug RHEL-34618 was reported and Kevin Wolf  identified
the root cause. I missed that aio_co_reschedule_self() ->
qemu_get_current_aio_context() only knows about
qemu_aio_context/IOThread AioContexts and not about iohandler_ctx. It
does not function correctly when going back from the iohandler_ctx to
qemu_aio_context.

Go back to open coding the AioContext transitions to avoid this bug.

This reverts commit 1f25c172f83704e350c0829438d832384084a74d.

Buglink: https://issues.redhat.com/browse/RHEL-34618
Signed-off-by: Stefan Hajnoczi 
---
 qapi/qmp-dispatch.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index f3488afeef..176b549473 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -212,7 +212,8 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList 
*cmds, QObject *requ
  * executing the command handler so that it can make progress if it
  * involves an AIO_WAIT_WHILE().
  */
-aio_co_reschedule_self(qemu_get_aio_context());
+aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
+qemu_coroutine_yield();
 }
 
 monitor_set_cur(qemu_coroutine_self(), cur_mon);
@@ -226,7 +227,9 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList 
*cmds, QObject *requ
  * Move back to iohandler_ctx so that nested event loops for
  * qemu_aio_context don't start new monitor commands.
  */
-aio_co_reschedule_self(iohandler_get_aio_context());
+aio_co_schedule(iohandler_get_aio_context(),
+qemu_coroutine_self());
+qemu_coroutine_yield();
 }
 } else {
/*
-- 
2.45.0




[PATCH 0/2] Revert "monitor: use aio_co_reschedule_self()"

2024-05-06 Thread Stefan Hajnoczi
This series fixes RHEL-34618 "qemu crash on Assertion `luringcb->co->ctx ==
s->aio_context' failed when do block_resize on hotplug disk with aio=io_uring":
https://issues.redhat.com/browse/RHEL-34618

Kevin identified commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()")
as the root cause. There is a subtlety regarding how
qemu_get_current_aio_context() returns qemu_aio_context even though we may be
running in iohandler_ctx.

Revert commit 1f25c172f837, it was just intended as a code cleanup.

Stefan Hajnoczi (2):
  Revert "monitor: use aio_co_reschedule_self()"
  aio: warn about iohandler_ctx special casing

 include/block/aio.h | 6 ++
 qapi/qmp-dispatch.c | 7 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

-- 
2.45.0




Re: Intention to work on GSoC project

2024-05-06 Thread Sahil
Hi,

It's been a while since I last gave an update. Sorry about that. I am ready
to get my hands dirty and start with the implementation.

I have gone through the source of linux's drivers/virtio/virtio_ring.c [1], and
QEMU's hw/virtio/virtio.c [2] and hw/virtio/vhost-shadow-virtqueue.c [3].

Before actually starting I would like to make sure I am on the right track. In
vhost-shadow-virtqueue.c, there's a function "vhost_svq_add" which in turn
calls "vhost_svq_add_split".

Shall I start by implementing a mechanism to check if the feature bit
"VIRTIO_F_RING_PACKED" is set (using "virtio_vdev_has_feature")? And
if it's supported, "vhost_svq_add" should call "vhost_svq_add_packed".
Following this, I can then start implementing "vhost_svq_add_packed"
and progress from there.
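
A rough sketch of that dispatch inside vhost_svq_add() is below; the argument
list is only assumed to mirror the existing split path, and
vhost_svq_add_packed() is the new function that would still have to be
written:

    /* Sketch only: feature-bit based dispatch in vhost_svq_add(). */
    bool ok;
    unsigned qemu_head;

    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_F_RING_PACKED)) {
        ok = vhost_svq_add_packed(svq, out_sg, out_num, in_sg, in_num,
                                  &qemu_head);
    } else {
        ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num,
                                 &qemu_head);
    }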

What are your thoughts on this?

Thanks,
Sahil

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/virtio/virtio.c
[2] https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/virtio.c
[3] 
https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/vhost-shadow-virtqueue.c





RE: [PATCH 2/4] target/hexagon: idef-parser remove undefined functions

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 2/4] target/hexagon: idef-parser remove undefined
> functions
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/parser-helpers.h | 13 -
>  1 file changed, 13 deletions(-)

Reviewed-by: Taylor Simpson 




RE: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list
> 
> gen_inst_init_args() is called for instructions using a predicate as an rvalue.
> Upon first call, the list of arguments which might need initialization init_list
> is freed to indicate that they have been processed. For instructions without an
> rvalue predicate, gen_inst_init_args() isn't called and init_list will never be
> freed.
> 
> Free init_list from free_instruction() if it hasn't already been freed.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/parser-helpers.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/target/hexagon/idef-parser/parser-helpers.c
> b/target/hexagon/idef-parser/parser-helpers.c
> index 95f2b43076..bae01c2bb8 100644
> --- a/target/hexagon/idef-parser/parser-helpers.c
> +++ b/target/hexagon/idef-parser/parser-helpers.c
> @@ -2121,6 +2121,13 @@ void free_instruction(Context *c)
>  g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE);
>  }
>  g_array_free(c->inst.strings, TRUE);
> +/*
> + * Free list of arguments that might need initialization, if they haven't
> + * already been free'd.
> + */
> +if (c->inst.init_list) {
> +g_array_free(c->inst.init_list, TRUE);
> +}
>  /* Free INAME token value */
>  g_string_free(c->inst.name, TRUE);
>  /* Free variables and registers */

Why not do this in gen_inst_init_args just before this?
   /* Free argument init list once we have initialized everything */
g_array_free(c->inst.init_list, TRUE);
c->inst.init_list = NULL;


Taylor






RE: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init
> 
> Only predicate instruction arguments need to be initialized by idef-parser.
> This commit removes registers from the init_list and simplifies
> gen_inst_init_args() slightly.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/idef-parser.y|  2 --
>  target/hexagon/idef-parser/parser-helpers.c | 20 +---
>  2 files changed, 9 insertions(+), 13 deletions(-)

> diff --git a/target/hexagon/idef-parser/parser-helpers.c
> b/target/hexagon/idef-parser/parser-helpers.c
> index bae01c2bb8..33e8f82007 100644
> --- a/target/hexagon/idef-parser/parser-helpers.c
> +++ b/target/hexagon/idef-parser/parser-helpers.c
> @@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname)
> 
>  void gen_inst_init_args(Context *c, YYLTYPE *locp)  {
> +/* If init_list is NULL arguments have already been initialized */
>  if (!c->inst.init_list) {
>  return;
>  }
> 
>  for (unsigned i = 0; i < c->inst.init_list->len; i++) {
>  HexValue *val = &g_array_index(c->inst.init_list, HexValue, i);
> -if (val->type == REGISTER_ARG) {
> -/* Nothing to do here */
> -} else if (val->type == PREDICATE) {
> -char suffix = val->is_dotnew ? 'N' : 'V';
> -EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
> -  val->pred.id, suffix);
> -} else {
> -yyassert(c, locp, false, "Invalid arg type!");
> -}
> +yyassert(c, locp, val->type == PREDICATE,
> + "Only predicates need to be initialized!");
> +char suffix = val->is_dotnew ? 'N' : 'V';

Declarations should be at the beginning of the function per QEMU coding
standards.

> +EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,

Since you know this is a predicate, the bit_width will always be 32.  You
can hard-code that instead of using %u.

> +  val->pred.id, suffix);
>  }
> 
>  /* Free argument init list once we have initialized everything */

Taylor





RE: [PATCH 1/4] target/hexagon: idef-parser remove unused defines

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 1/4] target/hexagon: idef-parser remove unused defines
> 
> Before switching to GArray/g_string_printf we used fixed size arrays for
> output buffers and instructions arguments among other things.
> 
> Macros defining the sizes of these buffers were left behind, remove them.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/idef-parser.h | 10 --
>  1 file changed, 10 deletions(-)

Reviewed-by: Taylor Simpson 





[PATCH 2/4] target/hexagon: idef-parser remove undefined functions

2024-05-06 Thread Anton Johansson via
Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/parser-helpers.h | 13 -
 1 file changed, 13 deletions(-)

diff --git a/target/hexagon/idef-parser/parser-helpers.h 
b/target/hexagon/idef-parser/parser-helpers.h
index 7c58087169..2087d534a9 100644
--- a/target/hexagon/idef-parser/parser-helpers.h
+++ b/target/hexagon/idef-parser/parser-helpers.h
@@ -143,8 +143,6 @@ void commit(Context *c);
 
 #define OUT(c, locp, ...) FOR_EACH((c), (locp), OUT_IMPL, __VA_ARGS__)
 
-const char *cmp_swap(Context *c, YYLTYPE *locp, const char *type);
-
 /**
  * Temporary values creation
  */
@@ -236,8 +234,6 @@ HexValue gen_extract_op(Context *c,
 HexValue *index,
 HexExtract *extract);
 
-HexValue gen_read_reg(Context *c, YYLTYPE *locp, HexValue *reg);
-
 void gen_write_reg(Context *c, YYLTYPE *locp, HexValue *reg, HexValue *value);
 
 void gen_assign(Context *c,
@@ -274,13 +270,6 @@ HexValue gen_ctpop_op(Context *c, YYLTYPE *locp, HexValue 
*src);
 
 HexValue gen_rotl(Context *c, YYLTYPE *locp, HexValue *src, HexValue *n);
 
-HexValue gen_deinterleave(Context *c, YYLTYPE *locp, HexValue *mixed);
-
-HexValue gen_interleave(Context *c,
-YYLTYPE *locp,
-HexValue *odd,
-HexValue *even);
-
 HexValue gen_carry_from_add(Context *c,
 YYLTYPE *locp,
 HexValue *op1,
@@ -349,8 +338,6 @@ HexValue gen_rvalue_ternary(Context *c, YYLTYPE *locp, 
HexValue *cond,
 
 const char *cond_to_str(TCGCond cond);
 
-void emit_header(Context *c);
-
 void emit_arg(Context *c, YYLTYPE *locp, HexValue *arg);
 
 void emit_footer(Context *c);
-- 
2.44.0




[PATCH 4/4] target/hexagon: idef-parser simplify predicate init

2024-05-06 Thread Anton Johansson via
Only predicate instruction arguments need to be initialized by
idef-parser. This commit removes registers from the init_list and
simplifies gen_inst_init_args() slightly.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/idef-parser.y|  2 --
 target/hexagon/idef-parser/parser-helpers.c | 20 +---
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/target/hexagon/idef-parser/idef-parser.y 
b/target/hexagon/idef-parser/idef-parser.y
index cd2612eb8c..9ffb9f9699 100644
--- a/target/hexagon/idef-parser/idef-parser.y
+++ b/target/hexagon/idef-parser/idef-parser.y
@@ -233,8 +233,6 @@ code : '{' statements '}'
 argument_decl : REG
 {
 emit_arg(c, &@1, &$1);
-/* Enqueue register into initialization list */
-g_array_append_val(c->inst.init_list, $1);
 }
   | PRED
 {
diff --git a/target/hexagon/idef-parser/parser-helpers.c 
b/target/hexagon/idef-parser/parser-helpers.c
index bae01c2bb8..33e8f82007 100644
--- a/target/hexagon/idef-parser/parser-helpers.c
+++ b/target/hexagon/idef-parser/parser-helpers.c
@@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname)
 
 
 /*
- * Initialize declared but uninitialized registers, but only for
- * non-conditional instructions
+ * Initialize declared but uninitialized instruction arguments. Only needed for
+ * predicate arguments, initialization of registers is handled by the Hexagon
+ * frontend.
  */
 void gen_inst_init_args(Context *c, YYLTYPE *locp)
 {
+/* If init_list is NULL arguments have already been initialized */
 if (!c->inst.init_list) {
 return;
 }
 
 for (unsigned i = 0; i < c->inst.init_list->len; i++) {
HexValue *val = &g_array_index(c->inst.init_list, HexValue, i);
-if (val->type == REGISTER_ARG) {
-/* Nothing to do here */
-} else if (val->type == PREDICATE) {
-char suffix = val->is_dotnew ? 'N' : 'V';
-EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
-  val->pred.id, suffix);
-} else {
-yyassert(c, locp, false, "Invalid arg type!");
-}
+yyassert(c, locp, val->type == PREDICATE,
+ "Only predicates need to be initialized!");
+char suffix = val->is_dotnew ? 'N' : 'V';
+EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
+  val->pred.id, suffix);
 }
 
 /* Free argument init list once we have initialized everything */
-- 
2.44.0




[PATCH 3/4] target/hexagon: idef-parser fix leak of init_list

2024-05-06 Thread Anton Johansson via
gen_inst_init_args() is called for instructions using a predicate as an
rvalue. Upon first call, the list of arguments which might need
initialization init_list is freed to indicate that they have been
processed. For instructions without an rvalue predicate,
gen_inst_init_args() isn't called and init_list will never be freed.

Free init_list from free_instruction() if it hasn't already been freed.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/parser-helpers.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/target/hexagon/idef-parser/parser-helpers.c 
b/target/hexagon/idef-parser/parser-helpers.c
index 95f2b43076..bae01c2bb8 100644
--- a/target/hexagon/idef-parser/parser-helpers.c
+++ b/target/hexagon/idef-parser/parser-helpers.c
@@ -2121,6 +2121,13 @@ void free_instruction(Context *c)
 g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE);
 }
 g_array_free(c->inst.strings, TRUE);
+/*
+ * Free list of arguments that might need initialization, if they haven't
+ * already been free'd.
+ */
+if (c->inst.init_list) {
+g_array_free(c->inst.init_list, TRUE);
+}
 /* Free INAME token value */
 g_string_free(c->inst.name, TRUE);
 /* Free variables and registers */
-- 
2.44.0




[PATCH 0/4] target/hexagon: Minor idef-parser cleanup

2024-05-06 Thread Anton Johansson via
Was running idef-parser with valgrind and noticed we were leaking the
init_list GArray, which is used to hold instruction arguments that may
need initialization.  This patchset fixes the leak, removes unused
macros and undefined functions, and simplifies gen_inst_init_args() to
only handle predicate values.

Anton Johansson (4):
  target/hexagon: idef-parser remove unused defines
  target/hexagon: idef-parser remove undefined functions
  target/hexagon: idef-parser fix leak of init_list
  target/hexagon: idef-parser simplify predicate init

 target/hexagon/idef-parser/idef-parser.h| 10 
 target/hexagon/idef-parser/idef-parser.y|  2 --
 target/hexagon/idef-parser/parser-helpers.c | 27 -
 target/hexagon/idef-parser/parser-helpers.h | 13 --
 4 files changed, 16 insertions(+), 36 deletions(-)

-- 
2.44.0




[PATCH 1/4] target/hexagon: idef-parser remove unused defines

2024-05-06 Thread Anton Johansson via
Before switching to GArray/g_string_printf we used fixed size arrays for
output buffers and instructions arguments among other things.

Macros defining the sizes of these buffers were left behind, remove
them.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/idef-parser.h | 10 --
 1 file changed, 10 deletions(-)

diff --git a/target/hexagon/idef-parser/idef-parser.h 
b/target/hexagon/idef-parser/idef-parser.h
index 3faa1deecd..8594cbe3a2 100644
--- a/target/hexagon/idef-parser/idef-parser.h
+++ b/target/hexagon/idef-parser/idef-parser.h
@@ -23,16 +23,6 @@
 #include 
 #include 
 
-#define TCGV_NAME_SIZE 7
-#define MAX_WRITTEN_REGS 32
-#define OFFSET_STR_LEN 32
-#define ALLOC_LIST_LEN 32
-#define ALLOC_NAME_SIZE 32
-#define INIT_LIST_LEN 32
-#define OUT_BUF_LEN (1024 * 1024)
-#define SIGNATURE_BUF_LEN (128 * 1024)
-#define HEADER_BUF_LEN (128 * 1024)
-
 /* Variadic macros to wrap the buffer printing functions */
 #define EMIT(c, ...)                                                       \
     do {                                                                   \
-- 
2.44.0




Re: qemu-img cache modes with Linux cgroup v1

2024-05-06 Thread Stefan Hajnoczi
On Mon, May 06, 2024 at 08:10:25PM +0300, Alex Kalenyuk wrote:
> Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed
> O_DIRECT is now supported on tmpfs (as opposed to our CI which runs Centos
> 9 Stream).
> `qemu-img convert -t none -O raw tests/images/cirros-qcow2.img
> /tmp/cirros.raw`
> where /tmp is indeed a tmpfs.
> 
> I might be missing something so feel free to call that out

Yes, it was added by:

commit e88e0d366f9cfbb810b0c8509dc5d130d5a53e02
Author: Hugh Dickins 
Date:   Thu Aug 10 23:27:07 2023 -0700

tmpfs: trivial support for direct IO

It's fairly new but great to have.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v2 5/5] monitor: use aio_co_reschedule_self()

2024-05-06 Thread Stefan Hajnoczi
On Fri, May 03, 2024 at 07:33:17PM +0200, Kevin Wolf wrote:
> Am 06.02.2024 um 20:06 hat Stefan Hajnoczi geschrieben:
> > The aio_co_reschedule_self() API is designed to avoid the race
> > condition between scheduling the coroutine in another AioContext and
> > yielding.
> > 
> > The QMP dispatch code uses the open-coded version that appears
> > susceptible to the race condition at first glance:
> > 
> >   aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
> >   qemu_coroutine_yield();
> > 
> > The code is actually safe because the iohandler and qemu_aio_context
> > AioContext run under the Big QEMU Lock. Nevertheless, set a good example
> > and use aio_co_reschedule_self() so it's obvious that there is no race.
> > 
> > Suggested-by: Hanna Reitz 
> > Reviewed-by: Manos Pitsidianakis 
> > Reviewed-by: Hanna Czenczek 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> >  qapi/qmp-dispatch.c | 7 ++-
> >  1 file changed, 2 insertions(+), 5 deletions(-)
> > 
> > diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
> > index 176b549473..f3488afeef 100644
> > --- a/qapi/qmp-dispatch.c
> > +++ b/qapi/qmp-dispatch.c
> > @@ -212,8 +212,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const 
> > QmpCommandList *cmds, QObject *requ
> >   * executing the command handler so that it can make progress 
> > if it
> >   * involves an AIO_WAIT_WHILE().
> >   */
> > -aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
> > -qemu_coroutine_yield();
> > +aio_co_reschedule_self(qemu_get_aio_context());
> 
> Turns out that this one actually causes a regression. [1] This code is
> run in iohandler_ctx, aio_co_reschedule_self() looks at the new context
> and compares it with qemu_get_current_aio_context() - and because both
> are qemu_aio_context, it decides that it has nothing to do. So the
> command handler coroutine actually still runs in iohandler_ctx now,
> which is not what we want.
> 
> We could just revert this patch because it was only meant as a cleanup
> without a semantic difference.
> 
> Or aio_co_reschedule_self() could look at qemu_coroutine_self()->ctx
> instead of using qemu_get_current_aio_context(). That would be a little
> more indirect, though, and I'm not sure if co->ctx is always up to date.
> 
> Any opinions on what is the best way to fix this?

If the commit is reverted then similar bugs may be introduced again in
the future. The qemu_get_current_aio_context() API is unaware of
iohandler_ctx and this can lead to unexpected results.
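
For readers following along, the no-op is visible in the rough shape of
aio_co_reschedule_self() itself (simplified from util/async.c, quoted from
memory):

    void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
    {
        AioContext *old_ctx = qemu_get_current_aio_context();

        /* When running in iohandler_ctx this still reports qemu_aio_context,
         * so old_ctx == new_ctx and nothing is rescheduled. */
        if (old_ctx != new_ctx) {
            aio_co_schedule(new_ctx, qemu_coroutine_self());
            qemu_coroutine_yield();
        }
    }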

I will send patches to revert the commit and add doc comments explaining
iohandler_ctx's special behavior. This will reduce, but not eliminate,
the risk of future bugs.

Modifying aio_co_reschedule_self() might be a better long-term fix, but
I'm afraid it will create more bugs because it will expose the subtle
distinction between the current coroutine AioContext and non-coroutine
AioContext in new places. I think the root cause is that iohandler_ctx
isn't a full-fledged AioContext with its own event loop. iohandler_ctx
is a special superset of qemu_aio_context that the main loop monitors.
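
To make the failure mode concrete, here is a rough sketch of the comparison
Kevin describes above (not the literal util/async.c code, just the shape of
it):

/* Sketch only - illustrates why the call can degenerate into a no-op. */
void aio_co_reschedule_self(AioContext *new_ctx)
{
    /*
     * When the caller runs in iohandler_ctx, qemu_get_current_aio_context()
     * still reports qemu_aio_context, so this test matches and the coroutine
     * is never moved - which is the regression tracked in [1].
     */
    if (qemu_get_current_aio_context() == new_ctx) {
        return;
    }
    aio_co_schedule(new_ctx, qemu_coroutine_self());
    qemu_coroutine_yield();
}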

Stefan

> 
> Kevin
> 
> [1] https://issues.redhat.com/browse/RHEL-34618
> 




Re: [PATCH 4/4] tests/qtest: Check STM32L4x5 clock connections

2024-05-06 Thread Inès Varhol



- On 6 May 24, at 6:16, Thomas Huth th...@redhat.com wrote:

> On 05/05/2024 16.05, Inès Varhol wrote:
>> For USART, GPIO and SYSCFG devices, check that clock frequency before
>> and after enabling the peripheral clock in RCC is correct.
>> 
>> Signed-off-by: Inès Varhol 
>> ---
>> Hello,
>> 
>> Should these tests be regrouped in stm32l4x5_rcc-test.c ?
> 
>  Hi,
> 
> sounds mostly like a matter of taste at a first glance. Or what would be the
> benefit of putting everything into the *rcc-test.c file? Could you maybe
> consolidate the get_clock_freq_hz() function that way? (maybe that
> get_clock_freq_hz() function could also be consolidated as a inline function
> in a shared header instead?)
> 
>  Thomas

Hello,

I was indeed looking to consolidate the functions get_clock_freq_hz() and
check_clock() from *usart-test.c (along with the definitions for RCC
registers).
Thank you for your suggestion, I'll use a header file.
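
For illustration, a rough sketch of what such a shared header could look
like; the header name, the QMP 'qom-get' round trip and the
'qtest-clock-period' property name are my assumptions here, not the existing
test code:

/* Hypothetical tests/qtest/stm32l4x5.h - helpers shared by the
 * usart/gpio/syscfg/rcc qtests (sketch only). */
#ifndef TEST_STM32L4X5_H
#define TEST_STM32L4X5_H

#include "libqtest.h"
#include "qapi/qmp/qdict.h"

static inline uint64_t get_clock_period(QTestState *qts, const char *path)
{
    QDict *r;
    uint64_t period;

    r = qtest_qmp(qts, "{ 'execute': 'qom-get', 'arguments':"
                       " { 'path': %s, 'property': 'qtest-clock-period' } }",
                  path);
    period = qdict_get_int(r, "return");
    qobject_unref(r);
    return period;
}

#endif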

Inès Varhol




Re: [PATCH RFC 00/26] Multifd  device state transfer support with VFIO consumer

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 06:26:46PM +0200, Maciej S. Szmigiero wrote:
> On 29.04.2024 17:09, Peter Xu wrote:
> > On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote:
> > > On 24.04.2024 00:35, Peter Xu wrote:
> > > > On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote:
> > > > > On 24.04.2024 00:20, Peter Xu wrote:
> > > > > > On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > On 19.04.2024 17:31, Peter Xu wrote:
> > > > > > > > On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé 
> > > > > > > > wrote:
> > > > > > > > > On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote:
> > > > > > > > > > On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. 
> > > > > > > > > > Szmigiero wrote:
> > > > > > > > > > > I think one of the reasons for these results is that 
> > > > > > > > > > > mixed (RAM + device
> > > > > > > > > > > state) multifd channels participate in the RAM sync 
> > > > > > > > > > > process
> > > > > > > > > > > (MULTIFD_FLAG_SYNC) whereas device state dedicated 
> > > > > > > > > > > channels don't.
> > > > > > > > > > 
> > > > > > > > > > Firstly, I'm wondering whether we can have better names for 
> > > > > > > > > > these new
> > > > > > > > > > hooks.  Currently (only comment on the async* stuff):
> > > > > > > > > > 
> > > > > > > > > >   - complete_precopy_async
> > > > > > > > > >   - complete_precopy
> > > > > > > > > >   - complete_precopy_async_wait
> > > > > > > > > > 
> > > > > > > > > > But perhaps better:
> > > > > > > > > > 
> > > > > > > > > >   - complete_precopy_begin
> > > > > > > > > >   - complete_precopy
> > > > > > > > > >   - complete_precopy_end
> > > > > > > > > > 
> > > > > > > > > > ?
> > > > > > > > > > 
> > > > > > > > > > As I don't see why the device must do something with async 
> > > > > > > > > > in such hook.
> > > > > > > > > > To me it's more like you're splitting one process into 
> > > > > > > > > > multiple, then
> > > > > > > > > > begin/end sounds more generic.
> > > > > > > > > > 
> > > > > > > > > > Then, if with that in mind, IIUC we can already split 
> > > > > > > > > > ram_save_complete()
> > > > > > > > > > into >1 phases too. For example, I would be curious whether 
> > > > > > > > > > the performance
> > > > > > > > > > will go back to normal if we offloading 
> > > > > > > > > > multifd_send_sync_main() into the
> > > > > > > > > > complete_precopy_end(), because we really only need one 
> > > > > > > > > > shot of that, and I
> > > > > > > > > > am quite surprised it already greatly affects VFIO dumping 
> > > > > > > > > > its own things.
> > > > > > > > > > 
> > > > > > > > > > I would even ask one step further as what Dan was asking: 
> > > > > > > > > > have you thought
> > > > > > > > > > about dumping VFIO states via multifd even during 
> > > > > > > > > > iterations?  Would that
> > > > > > > > > > help even more than this series (which IIUC only helps 
> > > > > > > > > > during the blackout
> > > > > > > > > > phase)?
> > > > > > > > > 
> > > > > > > > > To dump during RAM iteration, the VFIO device will need to 
> > > > > > > > > have
> > > > > > > > > dirty tracking and iterate on its state, because the guest 
> > > > > > > > > CPUs
> > > > > > > > > will still be running potentially changing VFIO state. That 
> > > > > > > > > seems
> > > > > > > > > impractical in the general case.
> > > > > > > > 
> > > > > > > > We already do such iterations in vfio_save_iterate()?
> > > > > > > > 
> > > > > > > > My understanding is the recent VFIO work is based on the fact 
> > > > > > > > that the VFIO
> > > > > > > > device can track device state changes more or less (besides 
> > > > > > > > being able to
> > > > > > > > save/load full states).  E.g. I still remember in our QE tests 
> > > > > > > > some old
> > > > > > > > devices report much more dirty pages than expected during the 
> > > > > > > > iterations
> > > > > > > > when we were looking into such issue that a huge amount of 
> > > > > > > > dirty pages
> > > > > > > > reported.  But newer models seem to have fixed that and report 
> > > > > > > > much less.
> > > > > > > > 
> > > > > > > > That issue was about GPU not NICs, though, and IIUC a major 
> > > > > > > > portion of such
> > > > > > > > tracking used to be for GPU vRAMs.  So maybe I was mixing up 
> > > > > > > > these, and
> > > > > > > > maybe they work differently.
> > > > > > > 
> > > > > > > The device which this series was developed against (Mellanox 
> > > > > > > ConnectX-7)
> > > > > > > is already transferring its live state before the VM gets stopped 
> > > > > > > (via
> > > > > > > save_live_iterate SaveVMHandler).
> > > > > > > 
> > > > > > > It's just that in addition to the live state it has more than 400 
> > > > > > > MiB
> > > > > > > of state that cannot be transferred while the VM is still running.
> > > > > > > And that fact hurts a lot with respect to the migration downtime.
> > > > > > > 
> > 

Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation

2024-05-06 Thread Mickaël Salaün
On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote:
> On Fri, May 03, 2024, Mickaël Salaün wrote:
> > Add an interface for user space to be notified about guests' Heki policy
> > and related violations.
> > 
> > Extend the KVM_ENABLE_CAP IOCTL with KVM_CAP_HEKI_CONFIGURE and
> > KVM_CAP_HEKI_DENIAL. Each one takes a bitmask as first argument that can
> > contains KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4. The
> > returned value is the bitmask of known Heki exit reasons, for now:
> > KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4.
> > 
> > If KVM_CAP_HEKI_CONFIGURE is set, a VM exit will be triggered for each
> > KVM_HC_LOCK_CR_UPDATE hypercalls according to the requested control
> > register. This enables to enlighten the VMM with the guest
> > auto-restrictions.
> > 
> > If KVM_CAP_HEKI_DENIAL is set, a VM exit will be triggered for each
> > pinned CR violation. This enables the VMM to react to a policy
> > violation.
> > 
> > Cc: Borislav Petkov 
> > Cc: Dave Hansen 
> > Cc: H. Peter Anvin 
> > Cc: Ingo Molnar 
> > Cc: Kees Cook 
> > Cc: Madhavan T. Venkataraman 
> > Cc: Paolo Bonzini 
> > Cc: Sean Christopherson 
> > Cc: Thomas Gleixner 
> > Cc: Vitaly Kuznetsov 
> > Cc: Wanpeng Li 
> > Signed-off-by: Mickaël Salaün 
> > Link: https://lore.kernel.org/r/20240503131910.307630-4-...@digikod.net
> > ---
> > 
> > Changes since v1:
> > * New patch. Making user space aware of Heki properties was requested by
> >   Sean Christopherson.
> 
> No, I suggested having userspace _control_ the pinning[*], not merely be 
> notified
> of pinning.
> 
>  : IMO, manipulation of protections, both for memory (this patch) and CPU 
> state
>  : (control registers in the next patch) should come from userspace.  I have 
> no
>  : objection to KVM providing plumbing if necessary, but I think userspace 
> needs to
>  : to have full control over the actual state.
>  : 
>  : One of the things that caused Intel's control register pinning series to 
> stall
>  : out was how to handle edge cases like kexec() and reboot.  Deferring to 
> userspace
>  : means the kernel doesn't need to define policy, e.g. when to unprotect 
> memory,
>  : and avoids questions like "should userspace be able to overwrite pinned 
> control
>  : registers".
>  : 
>  : And like the confidential VM use case, keeping userspace in the loop is a 
> big
>  : beneifit, e.g. the guest can't circumvent protections by coercing 
> userspace into
>  : writing to protected memory.
> 
> I stand by that suggestion, because I don't see a sane way to handle things 
> like
> kexec() and reboot without having a _much_ more sophisticated policy than 
> would
> ever be acceptable in KVM.
> 
> I think that can be done without KVM having any awareness of CR pinning 
> whatsoever.
> E.g. userspace just needs to ability to intercept CR writes and inject #GPs.  
> Off
> the cuff, I suspect the uAPI could look very similar to MSR filtering.  E.g. 
> I bet
> userspace could enforce MSR pinning without any new KVM uAPI at all.
> 
> [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com

OK, I had concerns about the control not coming directly from the guest,
especially in the case of pKVM and confidential computing, but I get your
point.  It should indeed be quite similar to the MSR filtering on the
userspace side, except that we need another interface for the guest to
request such change (i.e. self-protection).

Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but
forward the request to userspace with a VM exit instead?  That would
also enable userspace to get the request and directly configure the CR
pinning with the same VM exit.
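
Purely for illustration, the userspace side of that flow might end up
looking something like the sketch below; the exit reason and field names are
hypothetical (nothing like this exists in KVM today), it only shows where the
VMM would hook in:

/* Hypothetical VMM code - KVM_EXIT_HEKI_CR_UPDATE and the struct fields are
 * invented for illustration of the proposed flow, not existing uAPI. */
switch (run->exit_reason) {
case KVM_EXIT_HEKI_CR_UPDATE:
    /* Guest issued KVM_HC_LOCK_CR_UPDATE: record the requested pinning. */
    vmm_pin_cr(vcpu, run->heki.cr, run->heki.pinned_bits);
    /* From here on, intercept writes to that CR and inject #GP on
     * violations, similar in spirit to MSR filtering. */
    break;
default:
    /* ... existing exit handling ... */
    break;
}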



Re: [PULL 00/46] Mostly build system and other cleanups patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 00:50, Paolo Bonzini wrote:

The following changes since commit 4977ce198d2390bff8c71ad5cb1a5f6aa24b56fb:

   Merge tag 'pull-tcg-20240501' of https://gitlab.com/rth7680/qemu into 
staging (2024-05-01 15:15:33 -0700)

are available in the Git repository at:

   https://gitlab.com/bonzini/qemu.git  tags/for-upstream

for you to fetch changes up to deb686ef0e609ceaec0daa5dc88eb5b3dd9701b0:

   qga/commands-posix: fix typo in qmp_guest_set_user_password (2024-05-03 
19:36:51 +0200)


* target/i386: Introduce SapphireRapids-v3 to add missing features
* switch boards to "default y"
* allow building emulators without any board
* configs: list "implied" device groups in the default configs
* remove unnecessary declarations from typedefs.h
* target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 00/15] Hexagon: simplify gen for packets w/o read-after-write

2024-05-06 Thread Richard Henderson

On 5/5/24 19:42, Brian Cain wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' of https://gitlab.com/rth7680/qemu into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/quic/qemu  tags/pull-hex-20240505

for you to fetch changes up to a4696661491cac8c1c08e7d482d751f808ce3143:

   Hexagon (target/hexagon) Remove hex_common.read_attribs_file (2024-05-05 
16:22:07 -0700)


Short-circuit for packets w/o read-after-write
Cleanup unused code in gen_*.py scripts


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PATCH v2 17/25] target/i386: move C0-FF opcodes to new decoder (except for x87)

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

The shift instructions are rewritten instead of reusing code from the old
decoder.  Rotates use CC_OP_ADCOX more extensively and generally rely
more on the optimizer, so that the code generators are shared between
the immediate-count and variable-count cases.

In particular, this makes gen_RCL and gen_RCR pretty efficient for the
count == 1 case, which becomes (apart from a few extra movs) something like:

   (compute_cc_all if needed)
   // save old value for OF calculation
   mov cc_src2, T0
   // the bulk of RCL is just this!
   deposit T0, cc_src, T0, 1, TARGET_LONG_BITS - 1
   // compute carry
   shr cc_dst, cc_src2, length - 1
   and cc_dst, cc_dst, 1
   // compute overflow
   xor cc_src2, cc_src2, T0
   extract cc_src2, cc_src2, length - 1, 1

32-bit MUL and IMUL are also slightly more efficient on 64-bit hosts.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/decode-new.h |1 +
  target/i386/tcg/translate.c  |   23 +-
  target/i386/tcg/decode-new.c.inc |  142 +
  target/i386/tcg/emit.c.inc   | 1014 +-
  4 files changed, 1169 insertions(+), 11 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 15/25] target/i386: move 60-BF opcodes to new decoder

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Compared to the old decoder, the main differences in translation
are for the little-used ARPL instruction.  IMUL is adjusted a bit
to share more code to produce flags, but is otherwise very similar.

Signed-off-by: Paolo Bonzini 


Reviewed-by: Richard Henderson 


+static void gen_POPA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+   gen_popa(s);
+}

...

+static void gen_PUSHA(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+   gen_pusha(s);
+}


3-space indent?


r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Richard Henderson

On 5/6/24 09:31, Paolo Bonzini wrote:

The comment deals with the former, the removal with the latter.

The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*,
so in principle you may expect that you need to set CC_OP_DYNAMIC
explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and
CX == 0 paths join. But this is not necessary, because there is
nothing after that instruction - the translation block ends.

So I guess the comment could instead be placed at the end of the function?

 /*
  * Only one iteration is done at a time, so the translation
  * block has ended unconditionally at this point and there
  * is no control flow junction - no need to set CC_OP_DYNAMIC.
  */

What do you think?


Just before gen_jmp_rel_csize?  Yes, that seems good.


r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Paolo Bonzini
On Mon, May 6, 2024 at 6:08 PM Richard Henderson
 wrote:
> > -gen_update_cc_op(s);
> >   l2 = gen_jz_ecx_string(s);
> > +/*
> > + * Only one iteration is done at a time, so there is
> > + * no control flow junction here and cc_op is never dynamic.
> > + */
> >   fn(s, ot);
> >   gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
> > -gen_update_cc_op(s);
> >   gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);
> >   if (s->repz_opt) {
> >   gen_op_jz_ecx(s, l2);
>
> Ok, but only because gen_jcc1 does the gen_update_cc_op.  The comment is 
> neither correct
> nor necessary.

Yeah, it's true that gen_jcc1 does the update. On the other hand,
there are two different kinds of cc_op updates:

1) at branches, if you know that only one of the sides might write
cc_op - so you ensure it's up-to-date before the branch - and set
CC_OP_DYNAMIC at the junction. Same if you have movcond instead of a
branch.

2) at end of translation block, to spill the value lazily (because in
the middle of the TB we are able to restore it from insn data). With
these patches there is never a need to do this, because gen_jmp_rel()
and gen_jmp_rel_csize() take care of it.

The comment deals with the former, the removal with the latter.

The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*,
so in principle you may expect that you need to set CC_OP_DYNAMIC
explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and
CX == 0 paths join. But this is not necessary, because there is
nothing after that instruction - the translation block ends.

So I guess the comment could instead be placed at the end of the function?

/*
 * Only one iteration is done at a time, so the translation
 * block has ended unconditionally at this point and there
 * is no control flow junction - no need to set CC_OP_DYNAMIC.
 */

What do you think?

Paolo




Re: [PATCH RFC 00/26] Multifd  device state transfer support with VFIO consumer

2024-05-06 Thread Maciej S. Szmigiero

On 29.04.2024 17:09, Peter Xu wrote:

On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote:

On 24.04.2024 00:35, Peter Xu wrote:

On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote:

On 24.04.2024 00:20, Peter Xu wrote:

On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote:

On 19.04.2024 17:31, Peter Xu wrote:

On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé wrote:

On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote:

On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. Szmigiero wrote:

I think one of the reasons for these results is that mixed (RAM + device
state) multifd channels participate in the RAM sync process
(MULTIFD_FLAG_SYNC) whereas device state dedicated channels don't.


Firstly, I'm wondering whether we can have better names for these new
hooks.  Currently (only comment on the async* stuff):

  - complete_precopy_async
  - complete_precopy
  - complete_precopy_async_wait

But perhaps better:

  - complete_precopy_begin
  - complete_precopy
  - complete_precopy_end

?

As I don't see why the device must do something with async in such hook.
To me it's more like you're splitting one process into multiple, then
begin/end sounds more generic.

Then, if with that in mind, IIUC we can already split ram_save_complete()
into >1 phases too. For example, I would be curious whether the performance
will go back to normal if we offloading multifd_send_sync_main() into the
complete_precopy_end(), because we really only need one shot of that, and I
am quite surprised it already greatly affects VFIO dumping its own things.

I would even ask one step further as what Dan was asking: have you thought
about dumping VFIO states via multifd even during iterations?  Would that
help even more than this series (which IIUC only helps during the blackout
phase)?


To dump during RAM iteration, the VFIO device will need to have
dirty tracking and iterate on its state, because the guest CPUs
will still be running potentially changing VFIO state. That seems
impractical in the general case.


We already do such iterations in vfio_save_iterate()?

My understanding is the recent VFIO work is based on the fact that the VFIO
device can track device state changes more or less (besides being able to
save/load full states).  E.g. I still remember in our QE tests some old
devices report much more dirty pages than expected during the iterations
when we were looking into such issue that a huge amount of dirty pages
reported.  But newer models seem to have fixed that and report much less.

That issue was about GPU not NICs, though, and IIUC a major portion of such
tracking used to be for GPU vRAMs.  So maybe I was mixing up these, and
maybe they work differently.


The device which this series was developed against (Mellanox ConnectX-7)
is already transferring its live state before the VM gets stopped (via
save_live_iterate SaveVMHandler).

It's just that in addition to the live state it has more than 400 MiB
of state that cannot be transferred while the VM is still running.
And that fact hurts a lot with respect to the migration downtime.

AFAIK it's a very similar story for (some) GPUs.


So during iteration phase VFIO cannot yet leverage the multifd channels
when with this series, am I right?


That's right.


Is it possible to extend that use case too?


I guess so, but since this phase (iteration while the VM is still
running) doesn't impact downtime it is much less critical.


But it affects the bandwidth, e.g. even with multifd enabled, the device
iteration data will still bottleneck at ~15Gbps on a common system setup
in the best case, even if the hosts are 100Gbps direct-connected.  Would that
be a concern in the future too, or it's known problem and it won't be fixed
anyway?


I think any improvements to the migration performance are good, even if
they don't impact downtime.

It's just that this patch set focuses on the downtime phase as the more
critical thing.

After this gets improved there's no reason why not to look at improving
performance of the VM live phase too if it brings sensible improvements.


I remember Avihai used to have plan to look into similar issues, I hope
this is exactly what he is looking for.  Otherwise changing migration
protocol from time to time is cumbersome; we always need to provide a flag
to make sure old systems migrates in the old ways, new systems run the new
ways, and for such a relatively major change I'd want to double check on
how far away we can support offload VFIO iterations data to multifd.


The device state transfer is indicated by a new flag in the multifd
header (MULTIFD_FLAG_DEVICE_STATE).

If we are to use multifd channels for VM live phase transfers these
could simply re-use the same flag type.


Right, and that's also my major purpose of such request to consider both
issues.

If supporting iterators can be easy on top of this, I am thinking whether
we should do this in one shot.  

Re: [PATCH RFC 23/26] migration/multifd: Device state transfer support - send side

2024-05-06 Thread Maciej S. Szmigiero

On 29.04.2024 22:04, Peter Xu wrote:

On Tue, Apr 16, 2024 at 04:43:02PM +0200, Maciej S. Szmigiero wrote:

+bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+{
+g_autoptr(GMutexLocker) locker = NULL;
+
+/*
+ * Device state submissions for shared channels can come
+ * from multiple threads and conflict with page submissions
+ * with respect to multifd_send_state access.
+ */
+if (!multifd_send_state->device_state_dedicated_channels) {
+locker = g_mutex_locker_new(&multifd_send_state->queue_job_mutex);


Haven't read the rest, but suggest to stick with QemuMutex for the whole
patchset, as that's what we use in the rest of the migration code, along with
QEMU_LOCK_GUARD().



Ack, from a quick scan of QEMU thread sync primitives it seems that
QemuMutex with QemuLockable and QemuCond should fulfill the
requirements to replace GMutex, GMutexLocker and GCond.
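
As a small sketch of that conversion (structure assumed; only the guard
usage matters here):

/* QemuMutex + QEMU_LOCK_GUARD() in place of GMutex/GMutexLocker (sketch). */
#include "qemu/thread.h"
#include "qemu/lockable.h"

static QemuMutex queue_job_mutex;   /* qemu_mutex_init() at setup time */

static void queue_device_state_job(void)
{
    /* Dropped automatically at scope exit, like g_autoptr(GMutexLocker). */
    QEMU_LOCK_GUARD(&queue_job_mutex);

    /* ... touch multifd_send_state under the lock ... */
}

The conditional-locking path in the patch would either take the mutex
unconditionally or fall back to explicit qemu_mutex_lock()/qemu_mutex_unlock().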

Thanks,
Maciej




Re: [PATCH v2 11/25] target/i386: reintroduce debugging mechanism

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c  | 27 +++
  target/i386/tcg/decode-new.c.inc |  3 +++
  2 files changed, 30 insertions(+)


Acked-by: Richard Henderson 

r~



Re: [PATCH v2 10/25] target/i386: cleanup *gen_eob*

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Create a new wrapper for syscall/sysret, and do not go through multiple
layers of wrappers.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 25 -
  1 file changed, 12 insertions(+), 13 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 09/25] target/i386: clarify the "reg" argument of functions returning CCPrepare

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 15 ---
  1 file changed, 8 insertions(+), 7 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 08/25] target/i386: do not use s->T0 and s->T1 as scratch registers for CCPrepare

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Instead of using s->T0 or s->T1, create a scratch register
when computing the C, NC, L or LE conditions.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 17 ++---
  1 file changed, 10 insertions(+), 7 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 07/25] target/i386: extend cc_* when using them to compute flags

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Instead of using s->tmp0 or s->tmp4 as the result, just extend the cc_*
registers in place.  It is harmless and, if multiple setcc instructions
are used, the optimizer will be able to remove the redundant ones.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 44 +++--
  1 file changed, 18 insertions(+), 26 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 06/25] target/i386: pull cc_op update to callers of gen_jmp_rel{,_csize}

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

gen_update_cc_op must be called before control flow splits.  Doing it
in gen_jmp_rel{,_csize} may hide bugs, instead assert that cc_op is
clean---even if that means a few more calls to gen_update_cc_op().

With this new invariant, setting cc_op to CC_OP_DYNAMIC is unnecessary
since the caller should have done it.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 8 +---
  1 file changed, 5 insertions(+), 3 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

gen_update_cc_op must be called before control flow splits.  Do it
where the jump on ECX!=0 is translated.

On the other hand, remove the call before gen_jcc1, which takes care of
it already, and explain why REPZ/REPNZ need not use CC_OP_DYNAMIC---the
translation block ends before any control-flow-dependent cc_op could
be observed.

Signed-off-by: Paolo Bonzini 
---
  target/i386/tcg/translate.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 3f1d2858fc9..6b766f5dd3f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1242,11 +1242,15 @@ static inline void gen_jcc1(DisasContext *s, int b, 
TCGLabel *l1)
  }
  
  /* XXX: does not work with gdbstub "ice" single step - not a

-   serious problem */
+   serious problem.  The caller can jump to the returned label
+   to stop the REP but, if the flags have changed, it has to call
+   gen_update_cc_op before doing so.  */
  static TCGLabel *gen_jz_ecx_string(DisasContext *s)
  {
  TCGLabel *l1 = gen_new_label();
  TCGLabel *l2 = gen_new_label();
+
+gen_update_cc_op(s);
  gen_op_jnz_ecx(s, l1);
  gen_set_label(l2);
  gen_jmp_rel_csize(s, 0, 1);
@@ -1342,7 +1346,6 @@ static void gen_repz(DisasContext *s, MemOp ot,
   void (*fn)(DisasContext *s, MemOp ot))
  {
  TCGLabel *l2;
-gen_update_cc_op(s);
  l2 = gen_jz_ecx_string(s);
  fn(s, ot);
  gen_op_add_reg_im(s, s->aflag, R_ECX, -1);


Ok.



@@ -1364,11 +1367,13 @@ static void gen_repz2(DisasContext *s, MemOp ot, int nz,
void (*fn)(DisasContext *s, MemOp ot))
  {
  TCGLabel *l2;
-gen_update_cc_op(s);
  l2 = gen_jz_ecx_string(s);
+/*
+ * Only one iteration is done at a time, so there is
+ * no control flow junction here and cc_op is never dynamic.
+ */
  fn(s, ot);
  gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
-gen_update_cc_op(s);
  gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);
  if (s->repz_opt) {
  gen_op_jz_ecx(s, l2);


Ok, but only because gen_jcc1 does the gen_update_cc_op.  The comment is neither correct 
nor necessary.


The reason to write cc_op before branches instead of junctions is to avoid having *two* 
writes of cc_op on either side of the branch.



r~



Re: [PATCH 0/3] Make it possible to compile the x86 binaries without FDC

2024-05-06 Thread Paolo Bonzini
On Thu, Apr 25, 2024 at 8:43 PM Thomas Huth  wrote:
> OTOH, it seems
> to work fine, and the FDC is only disabled when it is not available
> in the binary, so I hope this patch is fine, too.

We do the same for parallel, so I think it should be fine---definitely
for -nodefaults, and I'd say in general too.  The CMOS byte already
has a way to communicate no-floppy (0, see cmos_get_fd_drive_type).
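
For context, the encoding involved is just one CMOS byte (standard PC
layout; the helper name is the one quoted above, the rest is my paraphrase):

/* CMOS register 0x10: floppy drive types, one nibble per drive.
 * 0 in a nibble means "no drive", so 0x00 advertises no floppies at all,
 * which is what firmware sees when the FDC is absent. */
cmos_value = (drive_a_type << 4) | drive_b_type;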

Paolo




Re: [PATCH v2 04/25] target/i386: cc_op is not dynamic in gen_jcc1

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Resetting cc_op to CC_OP_DYNAMIC should be done at control flow junctions,
which is not the case here.  This translation block is ending and the
only effect of calling set_cc_op() would be a discard of s->cc_srcT.
This discard is useless (it's a temporary, not a global) and in fact
prevents gen_prepare_cc from returning s->cc_srcT.

Signed-off-by: Paolo Bonzini 
---
  target/i386/tcg/translate.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Richard Henderson 


r~



Re: [PATCH v3 1/5] accel/tcg: Move SoftMMU specific units to softmmu_specific_ss[]

2024-05-06 Thread Philippe Mathieu-Daudé

On 3/5/24 14:34, Philippe Mathieu-Daudé wrote:

Currently these files are only used in system emulation,
but could eventually be used by user emulation. Use the
"softmmu_specific_ss" to express they are related to
SoftMMU.

Signed-off-by: Philippe Mathieu-Daudé 
---
  accel/tcg/meson.build | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index aef80de967..84826f043a 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -16,12 +16,13 @@ tcg_specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_false: 
files('user-exec-stub.
  if get_option('plugins')
tcg_specific_ss.add(files('plugin-gen.c'))
  endif
-specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_specific_ss)
  
-specific_ss.add(when: ['CONFIG_SYSTEM_ONLY', 'CONFIG_TCG'], if_true: files(

+softmmu_specific_ss = ss.source_set()
+softmmu_specific_ss.add(files(
'cputlb.c',
'watchpoint.c',
  ))
+tcg_specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: softmmu_specific_ss)


Should be .add_all() here.


  system_ss.add(when: ['CONFIG_TCG'], if_true: files(
'icount-common.c',
@@ -34,3 +35,5 @@ tcg_module_ss.add(when: ['CONFIG_SYSTEM_ONLY', 'CONFIG_TCG'], 
if_true: files(
'tcg-accel-ops-icount.c',
'tcg-accel-ops-rr.c',
  ))
+
+specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_specific_ss)





Re: More doc updates needed for new migrate argument @channels

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 06:58:00AM +0200, Markus Armbruster wrote:
> Peter Xu  writes:
> 
> [...]
> 
> > I also copied qemu-devel starting from now.
> 
> My bad, I messed that up!

Het, do you want to send a patch?

Thanks,

-- 
Peter Xu




Re: [PULL 00/12] qemu-sparc queue 20240506

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 16:50, Michael Tokarev wrote:

06.05.2024 14:44, Mark Cave-Ayland wrote:


Mark Cave-Ayland (1):
   hw/sparc64: set iommu_platform=on for virtio devices attached 
to the sun4u machine


Richard Henderson (7):
   linux-user/sparc: Add more hwcap bits for sparc64
   target/sparc: Fix FEXPAND
   target/sparc: Fix FMUL8x16
   target/sparc: Fix FMUL8x16A{U,L}
   target/sparc: Fix FMULD8*X16
   target/sparc: Fix FPMERGE
   target/sparc: Split out do_ms16b


Should these "Fix" changes go to stable?


If the backport is clean. Otherwise, they fix a 17-year-old bug, so they
could just go in the next release IMHO.


Thomas Huth (4):
   target/sparc/cpu: Rename the CPU models with a "+" in their names
   target/sparc/cpu: Avoid spaces by default in the CPU names
   docs/system/target-sparc: Improve the Sparc documentation
   docs/about: Deprecate the old "UltraSparc" CPU names that 
contain a "+"

Thanks,

/mjt





[PATCH 5/5] hw/mips/boston: Implement multi core support

2024-05-06 Thread Jiaxun Yang
Implement multiple physical core support by passing the topology
to the CPS subsystem and generating a cpu-map fdt node to describe
the new topology.

Signed-off-by: Jiaxun Yang 
---
 hw/mips/boston.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/hw/mips/boston.c b/hw/mips/boston.c
index 1b44fb354c..4ed7d366fe 100644
--- a/hw/mips/boston.c
+++ b/hw/mips/boston.c
@@ -542,7 +542,10 @@ static const void *create_fdt(BostonState *s,
 qemu_fdt_setprop_cell(fdt, "/cpus", "#size-cells", 0x0);
 qemu_fdt_setprop_cell(fdt, "/cpus", "#address-cells", 0x1);
 
+qemu_fdt_add_subnode(fdt, "/cpus/cpu-map");
 for (cpu = 0; cpu < ms->smp.cpus; cpu++) {
+char *map_path;
+
 name = g_strdup_printf("/cpus/cpu@%d", cpu);
 qemu_fdt_add_subnode(fdt, name);
 qemu_fdt_setprop_string(fdt, name, "compatible", "img,mips");
@@ -550,6 +553,27 @@ static const void *create_fdt(BostonState *s,
 qemu_fdt_setprop_cell(fdt, name, "reg", cpu);
 qemu_fdt_setprop_string(fdt, name, "device_type", "cpu");
 qemu_fdt_setprop_cells(fdt, name, "clocks", clk_ph, 
FDT_BOSTON_CLK_CPU);
+qemu_fdt_setprop_cell(fdt, name, "phandle", 
qemu_fdt_alloc_phandle(fdt));
+
+if (ms->smp.threads > 1) {
+map_path = g_strdup_printf(
+"/cpus/cpu-map/socket%d/cluster%d/core%d/thread%d",
+cpu / (ms->smp.clusters * ms->smp.cores * ms->smp.threads),
+(cpu / (ms->smp.cores * ms->smp.threads)) % ms->smp.clusters,
+(cpu / ms->smp.threads) % ms->smp.cores,
+cpu % ms->smp.threads);
+} else {
+map_path = g_strdup_printf(
+"/cpus/cpu-map/socket%d/cluster%d/core%d",
+cpu / (ms->smp.clusters * ms->smp.cores),
+(cpu / ms->smp.cores) % ms->smp.clusters,
+cpu % ms->smp.cores);
+}
+
+qemu_fdt_add_path(fdt, map_path);
+qemu_fdt_setprop_phandle(fdt, map_path, "cpu", name);
+
+g_free(map_path);
 g_free(name);
 }
 
@@ -591,6 +615,15 @@ static const void *create_fdt(BostonState *s,
 g_free(name);
 g_free(gic_name);
 
+/* CM node */
+name = g_strdup_printf("/soc/cm@%" HWADDR_PRIx, memmap[BOSTON_CM].base);
+qemu_fdt_add_subnode(fdt, name);
+qemu_fdt_setprop_string(fdt, name, "compatible", "mti,mips-cm");
+qemu_fdt_setprop_cells(fdt, name, "reg", memmap[BOSTON_CM].base,
+memmap[BOSTON_CM].size);
+g_free(name);
+
+
 /* CDMM node */
 name = g_strdup_printf("/soc/cdmm@%" HWADDR_PRIx, 
memmap[BOSTON_CDMM].base);
 qemu_fdt_add_subnode(fdt, name);
@@ -703,7 +736,9 @@ static void boston_mach_init(MachineState *machine)
 object_initialize_child(OBJECT(machine), "cps", &s->cps, TYPE_MIPS_CPS);
 object_property_set_str(OBJECT(&s->cps), "cpu-type", machine->cpu_type,
 &error_fatal);
-object_property_set_uint(OBJECT(&s->cps), "num-vp", machine->smp.cpus,
+object_property_set_uint(OBJECT(&s->cps), "num-pcore", machine->smp.cores,
+&error_fatal);
+object_property_set_uint(OBJECT(&s->cps), "num-vp", machine->smp.threads,
 &error_fatal);
 qdev_connect_clock_in(DEVICE(&s->cps), "clk-in",
   qdev_get_clock_out(dev, "cpu-refclk"));

-- 
2.34.1
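
For reference, a worked example of the cpu-map paths the first hunk above
generates (my own arithmetic, using the cover letter's -smp 4,cores=2,threads=2
with the default single socket and cluster):

    cpu 0 -> /cpus/cpu-map/socket0/cluster0/core0/thread0
    cpu 1 -> /cpus/cpu-map/socket0/cluster0/core0/thread1
    cpu 2 -> /cpus/cpu-map/socket0/cluster0/core1/thread0
    cpu 3 -> /cpus/cpu-map/socket0/cluster0/core1/thread1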




[PATCH 2/5] hw/misc/mips_cmgcr: Implement multicore functions

2024-05-06 Thread Jiaxun Yang
Implement the following functions to allow software to probe and
control VPs on secondary cores:

- Reading out pcore count and coherence state
- Two scratch GCRs for firmware
- Semaphore GCR for register locking
- Redirect block to other cores

Signed-off-by: Jiaxun Yang 
---
 hw/misc/mips_cmgcr.c | 168 +++
 include/hw/misc/mips_cmgcr.h |  87 +++---
 2 files changed, 215 insertions(+), 40 deletions(-)

diff --git a/hw/misc/mips_cmgcr.c b/hw/misc/mips_cmgcr.c
index 2703040f45..8c2d184f2c 100644
--- a/hw/misc/mips_cmgcr.c
+++ b/hw/misc/mips_cmgcr.c
@@ -73,14 +73,19 @@ static inline void update_gic_base(MIPSGCRState *gcr, 
uint64_t val)
 static uint64_t gcr_read(void *opaque, hwaddr addr, unsigned size)
 {
 MIPSGCRState *gcr = (MIPSGCRState *) opaque;
-MIPSGCRVPState *current_vps = &gcr->vps[current_cpu->cpu_index];
-MIPSGCRVPState *other_vps = &gcr->vps[current_vps->other];
+int redirect_corenum = mips_gcr_get_redirect_corenum(gcr);
+int redirect_vpid = mips_gcr_get_redirect_vpid(gcr);
+int current_corenum = mips_gcr_get_current_corenum(gcr);
+int current_vpid = mips_gcr_get_current_vpid(gcr);
+MIPSGCRPCoreState *current_pcore = &gcr->pcs[current_corenum];
+MIPSGCRVPState *current_vps = &current_pcore->vps[current_vpid];
+MIPSGCRPCoreState *other_pcore = &gcr->pcs[redirect_corenum];
+MIPSGCRVPState *other_vps = &other_pcore->vps[redirect_vpid];
 
 switch (addr) {
 /* Global Control Block Register */
 case GCR_CONFIG_OFS:
-/* Set PCORES to 0 */
-return 0;
+return gcr->num_pcores - 1;
 case GCR_BASE_OFS:
 return gcr->gcr_base;
 case GCR_REV_OFS:
@@ -96,7 +101,19 @@ static uint64_t gcr_read(void *opaque, hwaddr addr, 
unsigned size)
 case GCR_L2_CONFIG_OFS:
 /* L2 BYPASS */
 return GCR_L2_CONFIG_BYPASS_MSK;
+case GCR_SYS_CONFIG2_OFS:
+return gcr->num_vps << GCR_SYS_CONFIG2_MAXVP_SHF;
+case GCR_SCRATCH0_OFS:
+return gcr->scratch[0];
+case GCR_SCRATCH1_OFS:
+return gcr->scratch[1];
+case GCR_SEM_OFS:
+return gcr->sem;
 /* Core-Local and Core-Other Control Blocks */
+case MIPS_CLCB_OFS + GCR_CL_COH_EN_OFS:
+return current_pcore->coh_en;
+case MIPS_COCB_OFS + GCR_CL_COH_EN_OFS:
+return other_pcore->coh_en;
 case MIPS_CLCB_OFS + GCR_CL_CONFIG_OFS:
 case MIPS_COCB_OFS + GCR_CL_CONFIG_OFS:
 /* Set PVP to # of VPs - 1 */
@@ -105,10 +122,18 @@ static uint64_t gcr_read(void *opaque, hwaddr addr, 
unsigned size)
 return current_vps->reset_base;
 case MIPS_COCB_OFS + GCR_CL_RESETBASE_OFS:
 return other_vps->reset_base;
-case MIPS_CLCB_OFS + GCR_CL_OTHER_OFS:
-return current_vps->other;
-case MIPS_COCB_OFS + GCR_CL_OTHER_OFS:
-return other_vps->other;
+case MIPS_CLCB_OFS + GCR_CL_REDIRECT_OFS:
+return current_vps->redirect;
+case MIPS_COCB_OFS + GCR_CL_REDIRECT_OFS:
+return other_vps->redirect;
+case MIPS_CLCB_OFS + GCR_CL_ID_OFS:
+return current_corenum;
+case MIPS_COCB_OFS + GCR_CL_ID_OFS:
+return redirect_corenum;
+case MIPS_CLCB_OFS + GCR_CL_SCRATCH_OFS:
+return current_vps->scratch;
+case MIPS_COCB_OFS + GCR_CL_SCRATCH_OFS:
+return other_vps->scratch;
 default:
 qemu_log_mask(LOG_UNIMP, "Read %d bytes at GCR offset 0x%" HWADDR_PRIx
   "\n", size, addr);
@@ -123,12 +148,36 @@ static inline target_ulong 
get_exception_base(MIPSGCRVPState *vps)
 return (int32_t)(vps->reset_base & GCR_CL_RESET_BASE_RESETBASE_MSK);
 }
 
+static inline void set_redirect(MIPSGCRState *gcr,
+MIPSGCRVPState *vps, target_ulong data)
+{
+int new_vpid = data & GCR_CL_REDIRECT_VP_MSK;
+int new_coreid = (data & GCR_CL_REDIRECT_CORE_MSK) >> 
GCR_CL_REDIRECT_CORE_SHF;
+
+if (new_vpid >= gcr->num_vps) {
+return;
+}
+
+if (new_coreid >= gcr->num_pcores) {
+return;
+}
+
+vps->redirect = data & (GCR_CL_REDIRECT_VP_MSK | GCR_CL_REDIRECT_CORE_MSK);
+}
+
 /* Write GCR registers */
 static void gcr_write(void *opaque, hwaddr addr, uint64_t data, unsigned size)
 {
-MIPSGCRState *gcr = (MIPSGCRState *)opaque;
-MIPSGCRVPState *current_vps = &gcr->vps[current_cpu->cpu_index];
-MIPSGCRVPState *other_vps = &gcr->vps[current_vps->other];
+MIPSGCRState *gcr = (MIPSGCRState *) opaque;
+int redirect_corenum = mips_gcr_get_redirect_corenum(gcr);
+int redirect_vpid = mips_gcr_get_redirect_vpid(gcr);
+int redirect_vpnum = mips_gcr_get_redirect_vpnum(gcr);
+int current_corenum = mips_gcr_get_current_corenum(gcr);
+int current_vpid = mips_gcr_get_current_vpid(gcr);
+MIPSGCRPCoreState *current_pcore = &gcr->pcs[current_corenum];
+MIPSGCRVPState *current_vps = &current_pcore->vps[current_vpid];
+MIPSGCRPCoreState *other_pcore = &gcr->pcs[redirect_corenum];
+MIPSGCRVPState *other_vps = 

[PATCH 1/5] target/mips: Make globalnumber a CPU property

2024-05-06 Thread Jiaxun Yang
GlobalNumber marks topology information of a CPU instance.

Make it a CPU property to allow CPS to override topology information.

Signed-off-by: Jiaxun Yang 
---
 target/mips/cpu.c| 16 +++-
 target/mips/cpu.h| 10 +-
 target/mips/sysemu/machine.c |  5 ++---
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index bbe01d07dd..762000d09b 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -296,7 +296,6 @@ static void mips_cpu_reset_hold(Object *obj, ResetType type)
 env->CP0_Random = env->tlb->nb_tlb - 1;
 env->tlb->tlb_in_use = env->tlb->nb_tlb;
 env->CP0_Wired = 0;
-env->CP0_GlobalNumber = (cs->cpu_index & 0xFF) << CP0GN_VPId;
 env->CP0_EBase = KSEG0_BASE | (cs->cpu_index & 0x3FF);
 if (env->CP0_Config3 & (1 << CP0C3_CMGCR)) {
 env->CP0_CMGCRBase = 0x1fbf8000 >> 4;
@@ -484,6 +483,12 @@ static void mips_cpu_realizefn(DeviceState *dev, Error 
**errp)
 
 env->exception_base = (int32_t)0xBFC00000;
 
+#if !defined(CONFIG_USER_ONLY)
+if (env->CP0_GlobalNumber == -1) {
+env->CP0_GlobalNumber = (cs->cpu_index & 0xFF) << CP0GN_VPId;
+}
+#endif
+
 #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
 mmu_init(env, env->cpu_model);
 #endif
@@ -563,6 +568,13 @@ static const TCGCPUOps mips_tcg_ops = {
 };
 #endif /* CONFIG_TCG */
 
+static Property mips_cpu_properties[] = {
+#if !defined(CONFIG_USER_ONLY)
+DEFINE_PROP_INT32("globalnumber", MIPSCPU, env.CP0_GlobalNumber, -1),
+#endif
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void mips_cpu_class_init(ObjectClass *c, void *data)
 {
 MIPSCPUClass *mcc = MIPS_CPU_CLASS(c);
@@ -592,6 +604,8 @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
 #ifdef CONFIG_TCG
 cc->tcg_ops = &mips_tcg_ops;
 #endif /* CONFIG_TCG */
+
+device_class_set_props(dc, mips_cpu_properties);
 }
 
 static const TypeInfo mips_cpu_type_info = {
diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index 3e906a175a..7499608678 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -612,8 +612,13 @@ typedef struct CPUArchState {
 # define CP0EnLo_RI 31
 # define CP0EnLo_XI 30
 #endif
-int32_t CP0_GlobalNumber;
+/* CP0_GlobalNumber is preserved across CPU reset. */
 #define CP0GN_VPId 0
+#define CP0GN_VPId_MASK (0xFFUL << CP0GN_VPId)
+#define CP0GN_CoreNum 8
+#define CP0GN_CoreNum_MASK (0xFUL << CP0GN_CoreNum)
+#define CP0GN_ClusterNum 16
+#define CP0GN_ClusterNum_MASK (0xFUL << CP0GN_ClusterNum)
 /*
  * CP0 Register 4
  */
@@ -1175,6 +1180,9 @@ typedef struct CPUArchState {
 struct {} end_reset_fields;
 
 /* Fields from here on are preserved across CPU reset. */
+#if !defined(CONFIG_USER_ONLY)
+int32_t CP0_GlobalNumber;
+#endif
 CPUMIPSMVPContext *mvp;
 #if !defined(CONFIG_USER_ONLY)
 CPUMIPSTLBContext *tlb;
diff --git a/target/mips/sysemu/machine.c b/target/mips/sysemu/machine.c
index 213fd637fc..235d640862 100644
--- a/target/mips/sysemu/machine.c
+++ b/target/mips/sysemu/machine.c
@@ -218,8 +218,8 @@ static const VMStateDescription vmstate_tlb = {
 
 const VMStateDescription vmstate_mips_cpu = {
 .name = "cpu",
-.version_id = 21,
-.minimum_version_id = 21,
+.version_id = 22,
+.minimum_version_id = 22,
 .post_load = cpu_post_load,
 .fields = (const VMStateField[]) {
 /* Active TC */
@@ -257,7 +257,6 @@ const VMStateDescription vmstate_mips_cpu = {
 VMSTATE_INT32(env.CP0_VPEOpt, MIPSCPU),
 VMSTATE_UINT64(env.CP0_EntryLo0, MIPSCPU),
 VMSTATE_UINT64(env.CP0_EntryLo1, MIPSCPU),
-VMSTATE_INT32(env.CP0_GlobalNumber, MIPSCPU),
 VMSTATE_UINTTL(env.CP0_Context, MIPSCPU),
 VMSTATE_INT32(env.CP0_MemoryMapID, MIPSCPU),
 VMSTATE_INT32(env.CP0_PageMask, MIPSCPU),

-- 
2.34.1




[PATCH 0/5] hw/mips: Proper multi core support

2024-05-06 Thread Jiaxun Yang
Hi all,

This series implements proper multi-core support for the MIPS
CPS system.

Previously, all CPUs were modelled as SMT threads of a single core.
Now the topology supplied via the QEMU -smp arguments is respected.

To test:
Build a recent kernel with 64r6el_defconfig (tested on 6.6 and
next-20240506).

Then run:
```
qemu-system-mips64el -M boston -cpu I6500 -kernel ~/linux-next/vmlinux -smp 
4,cores=2,threads=2 -append "console=ttyS0,115200" -nographic
```
In dmesg of guest kernel:
```
[0.00] VP topology {2,2} total 4
...
[0.085190] smp: Bringing up secondary CPUs ...
[0.090219] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes.
[0.095461] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 
bytes
[0.096658] CPU1 revision is: 0001b000 (MIPS I6500)
[0.096718] FPU revision is: 20f30300
[0.124711] Synchronize counters for CPU 1: done.
[0.940979] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes.
[0.941041] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 
bytes
[0.941256] CPU2 revision is: 0001b000 (MIPS I6500)
[0.941289] FPU revision is: 20f30300
[0.965322] Synchronize counters for CPU 2: done.
[1.260937] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes.
[1.261001] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 
bytes
[1.261172] CPU3 revision is: 0001b000 (MIPS I6500)
[1.261209] FPU revision is: 20f30300
[1.285390] Synchronize counters for CPU 3: done.
[1.346985] smp: Brought up 1 node, 4 CPUs
```

Please review.

Thanks

To: qemu-devel@nongnu.org
Cc: Philippe Mathieu-Daudé 

Signed-off-by: Jiaxun Yang 
---
Jiaxun Yang (5):
  target/mips: Make globalnumber a CPU property
  hw/msic/mips_cmgcr: Implement multicore functions
  hw/msic/mips_cpc: Implement multi core support
  hw/mips/cps: Implement multi core support
  hw/mips/boston: Implement multi core support

 hw/mips/boston.c |  37 +-
 hw/mips/cps.c|  66 ++---
 hw/misc/mips_cmgcr.c | 168 +++
 hw/misc/mips_cpc.c   |  97 ++---
 include/hw/mips/cps.h|   1 +
 include/hw/misc/mips_cmgcr.h |  87 +++---
 include/hw/misc/mips_cpc.h   |  15 +++-
 target/mips/cpu.c|  16 -
 target/mips/cpu.h|  10 ++-
 target/mips/sysemu/machine.c |   5 +-
 10 files changed, 403 insertions(+), 99 deletions(-)
---
base-commit: 248f6f62df073a3b4158fd0093863ab885feabb5
change-id: 20240506-mips-smp-9af9e71ad8c2

Best regards,
-- 
Jiaxun Yang 




[PATCH 4/5] hw/mips/cps: Implement multi core support

2024-05-06 Thread Jiaxun Yang
Implement multiple physical core support by creating
CPU devices according to the new topology and passing
pcore/vp information to CPC and CMGCR sub-devices.

Signed-off-by: Jiaxun Yang 
---
 hw/mips/cps.c | 66 +++
 include/hw/mips/cps.h |  1 +
 2 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/hw/mips/cps.c b/hw/mips/cps.c
index 07b73b0a1f..6cf02379a9 100644
--- a/hw/mips/cps.c
+++ b/hw/mips/cps.c
@@ -73,31 +73,43 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-for (int i = 0; i < s->num_vp; i++) {
-MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type));
-CPUMIPSState *env = &cpu->env;
+object_initialize_child(OBJECT(dev), "gcr", &s->gcr, TYPE_MIPS_GCR);
 
-/* All VPs are halted on reset. Leave powering up to CPC. */
-object_property_set_bool(OBJECT(cpu), "start-powered-off", true,
- &error_abort);
+for (int corenum = 0; corenum < s->num_pcore; corenum++) {
+for (int vpid = 0; vpid < s->num_vp; vpid++) {
+int vpnum = corenum * s->num_vp + vpid;
+int32_t globalnumber = (corenum << CP0GN_CoreNum) | vpid;
 
-/* All cores use the same clock tree */
-qdev_connect_clock_in(DEVICE(cpu), "clk-in", s->clock);
+MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type));
+CPUMIPSState *env = &cpu->env;
 
-if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) {
-return;
-}
+/* All VPs are halted on reset. Leave powering up to CPC. */
+object_property_set_bool(OBJECT(cpu), "start-powered-off", true,
+&error_abort);
+
+object_property_set_int(OBJECT(cpu), "globalnumber", globalnumber,
+&error_abort);
+
+/* All cores use the same clock tree */
+qdev_connect_clock_in(DEVICE(cpu), "clk-in", s->clock);
+
+if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) {
+return;
+}
+
+g_assert(vpnum == CPU(cpu)->cpu_index);
 
-/* Init internal devices */
-cpu_mips_irq_init_cpu(cpu);
-cpu_mips_clock_init(cpu);
+/* Init internal devices */
+cpu_mips_irq_init_cpu(cpu);
+cpu_mips_clock_init(cpu);
 
-if (cpu_mips_itu_supported(env)) {
-itu_present = true;
-/* Attach ITC Tag to the VP */
-env->itc_tag = mips_itu_get_tag_region(&s->itu);
+if (cpu_mips_itu_supported(env)) {
+itu_present = true;
+/* Attach ITC Tag to the VP */
+env->itc_tag = mips_itu_get_tag_region(&s->itu);
+}
+qemu_register_reset(main_cpu_reset, cpu);
 }
-qemu_register_reset(main_cpu_reset, cpu);
 }
 
 /* Inter-Thread Communication Unit */
@@ -119,8 +131,12 @@ static void mips_cps_realize(DeviceState *dev, Error 
**errp)
 object_initialize_child(OBJECT(dev), "cpc", >cpc, TYPE_MIPS_CPC);
 object_property_set_uint(OBJECT(>cpc), "num-vp", s->num_vp,
 _abort);
+object_property_set_uint(OBJECT(>cpc), "num-pcore", s->num_pcore,
+_abort);
 object_property_set_int(OBJECT(>cpc), "vp-start-running", 1,
 _abort);
+object_property_set_link(OBJECT(>cpc), "gcr", OBJECT(>gcr),
+_abort);
 if (!sysbus_realize(SYS_BUS_DEVICE(>cpc), errp)) {
 return;
 }
@@ -130,7 +146,7 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
 
 /* Global Interrupt Controller */
 object_initialize_child(OBJECT(dev), "gic", >gic, TYPE_MIPS_GIC);
-object_property_set_uint(OBJECT(>gic), "num-vp", s->num_vp,
+object_property_set_uint(OBJECT(>gic), "num-vp", s->num_vp * 
s->num_pcore,
 _abort);
 object_property_set_uint(OBJECT(>gic), "num-irq", 128,
 _abort);
@@ -141,16 +157,13 @@ static void mips_cps_realize(DeviceState *dev, Error 
**errp)
 memory_region_add_subregion(&s->container, 0,
 sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->gic), 
0));
 
-/* Global Configuration Registers */
-gcr_base = MIPS_CPU(first_cpu)->env.CP0_CMGCRBase << 4;
-
-object_initialize_child(OBJECT(dev), "gcr", >gcr, TYPE_MIPS_GCR);
+gcr_base = GCR_BASE_ADDR;
+object_property_set_uint(OBJECT(>gcr), "num-pcores", s->num_pcore,
+_abort);
 object_property_set_uint(OBJECT(>gcr), "num-vp", s->num_vp,
 _abort);
 object_property_set_int(OBJECT(>gcr), "gcr-rev", 0x800,
 _abort);
-object_property_set_int(OBJECT(>gcr), "gcr-base", gcr_base,
-_abort);
 object_property_set_link(OBJECT(>gcr), "gic", OBJECT(>gic.mr),
  

[PATCH 3/5] hw/misc/mips_cpc: Implement multi core support

2024-05-06 Thread Jiaxun Yang
Implement multiple physical core support for the MIPS CPC
controller, including some R/O configuration registers
and VP bring-up support on multiple cores.

Signed-off-by: Jiaxun Yang 
---
 hw/misc/mips_cpc.c | 97 ++
 include/hw/misc/mips_cpc.h | 15 ++-
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/hw/misc/mips_cpc.c b/hw/misc/mips_cpc.c
index 1e8fd2e699..f6a2f29088 100644
--- a/hw/misc/mips_cpc.c
+++ b/hw/misc/mips_cpc.c
@@ -25,9 +25,25 @@
 #include "hw/sysbus.h"
 #include "migration/vmstate.h"
 
+#include "hw/misc/mips_cmgcr.h"
 #include "hw/misc/mips_cpc.h"
 #include "hw/qdev-properties.h"
 
+static inline int cpc_vpnum_to_corenum(MIPSCPCState *cpc, int vpnum)
+{
+return vpnum / cpc->num_vp;
+}
+
+static inline int cpc_vpnum_to_vpid(MIPSCPCState *cpc, int vpnum)
+{
+return vpnum % cpc->num_vp;
+}
+
+static inline MIPSCPCPCoreState *cpc_vpnum_to_pcs(MIPSCPCState *cpc, int vpnum)
+{
+return &cpc->pcs[cpc_vpnum_to_corenum(cpc, vpnum)];
+}
+
 static inline uint64_t cpc_vp_run_mask(MIPSCPCState *cpc)
 {
 return (1ULL << cpc->num_vp) - 1;
@@ -39,36 +55,41 @@ static void mips_cpu_reset_async_work(CPUState *cs, 
run_on_cpu_data data)
 
 cpu_reset(cs);
 cs->halted = 0;
-cpc->vp_running |= 1ULL << cs->cpu_index;
+cpc_vpnum_to_pcs(cpc, cs->cpu_index)->vp_running |=
+1 << cpc_vpnum_to_vpid(cpc, cs->cpu_index);
 }
 
-static void cpc_run_vp(MIPSCPCState *cpc, uint64_t vp_run)
+static void cpc_run_vp(MIPSCPCState *cpc, int pcore, uint64_t vp_run)
 {
-CPUState *cs = first_cpu;
+MIPSCPCPCoreState *pcs = &cpc->pcs[pcore];
 
-CPU_FOREACH(cs) {
-uint64_t i = 1ULL << cs->cpu_index;
-if (i & vp_run & ~cpc->vp_running) {
+for (int vpid = 0; vpid < cpc->num_vp; vpid++) {
+if ((1 << vpid) & vp_run & ~pcs->vp_running) {
+int vpnum = pcore * cpc->num_vp + vpid;
 /*
  * To avoid racing with a CPU we are just kicking off.
  * We do the final bit of preparation for the work in
  * the target CPUs context.
  */
-async_safe_run_on_cpu(cs, mips_cpu_reset_async_work,
-  RUN_ON_CPU_HOST_PTR(cpc));
+async_safe_run_on_cpu(qemu_get_cpu(vpnum),
+mips_cpu_reset_async_work,
+RUN_ON_CPU_HOST_PTR(cpc));
+pcs->vp_running |= 1 << vpid;
 }
 }
 }
 
-static void cpc_stop_vp(MIPSCPCState *cpc, uint64_t vp_stop)
+static void cpc_stop_vp(MIPSCPCState *cpc, int pcore, uint64_t vp_stop)
 {
-CPUState *cs = first_cpu;
+MIPSCPCPCoreState *pcs = &cpc->pcs[pcore];
+
+for (int vpid = 0; vpid < cpc->num_vp; vpid++) {
+if ((1 << vpid) & vp_stop & pcs->vp_running) {
+int vpnum = pcore * cpc->num_vp + vpid;
+CPUState *cs = qemu_get_cpu(vpnum);
 
-CPU_FOREACH(cs) {
-uint64_t i = 1ULL << cs->cpu_index;
-if (i & vp_stop & cpc->vp_running) {
 cpu_interrupt(cs, CPU_INTERRUPT_HALT);
-cpc->vp_running &= ~i;
+pcs->vp_running &= ~(1 << vpid);
 }
 }
 }
@@ -77,15 +98,20 @@ static void cpc_write(void *opaque, hwaddr offset, uint64_t 
data,
   unsigned size)
 {
 MIPSCPCState *s = opaque;
+int current_corenum = cpc_vpnum_to_corenum(s, current_cpu->cpu_index);
 
 switch (offset) {
 case CPC_CL_BASE_OFS + CPC_VP_RUN_OFS:
+cpc_run_vp(s, current_corenum, data);
+break;
 case CPC_CO_BASE_OFS + CPC_VP_RUN_OFS:
-cpc_run_vp(s, data & cpc_vp_run_mask(s));
+cpc_run_vp(s, mips_gcr_get_redirect_corenum(s->gcr), data);
 break;
 case CPC_CL_BASE_OFS + CPC_VP_STOP_OFS:
+cpc_stop_vp(s, current_corenum, data);
+break;
 case CPC_CO_BASE_OFS + CPC_VP_STOP_OFS:
-cpc_stop_vp(s, data & cpc_vp_run_mask(s));
+cpc_stop_vp(s, mips_gcr_get_redirect_corenum(s->gcr), data);
 break;
 default:
 qemu_log_mask(LOG_UNIMP,
@@ -101,9 +127,13 @@ static uint64_t cpc_read(void *opaque, hwaddr offset, 
unsigned size)
 MIPSCPCState *s = opaque;
 
 switch (offset) {
+case CPC_CL_BASE_OFS + CPC_CL_STAT_CONF_OFS:
+case CPC_CO_BASE_OFS + CPC_CL_STAT_CONF_OFS:
+return CPC_CL_STAT_CONF_SEQ_STATE_U6 << CPC_CL_STAT_CONF_SEQ_STATE_SHF;
 case CPC_CL_BASE_OFS + CPC_VP_RUNNING_OFS:
+return cpc_vpnum_to_pcs(s, current_cpu->cpu_index)->vp_running;
 case CPC_CO_BASE_OFS + CPC_VP_RUNNING_OFS:
-return s->vp_running;
+return s->pcs[mips_gcr_get_redirect_corenum(s->gcr)].vp_running;
 default:
 qemu_log_mask(LOG_UNIMP,
   "%s: Bad offset 0x%x\n",  __func__, (int)offset);
@@ -137,9 +167,11 @@ static void mips_cpc_realize(DeviceState *dev, Error 
**errp)
 if (s->vp_start_running > cpc_vp_run_mask(s)) {
 error_setg(errp,
 

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 12:08:43PM +0200, Jinpu Wang wrote:
> Hi Peter, hi Daniel,

Hi, Jinpu,

Thanks for sharing this test results.  Sounds like a great news.

What's your plan next?  Would it then be worthwhile / possible to move QEMU
in that direction?  Would that greatly simplify the rdma code, as Dan
mentioned?

Thanks,

> 
> On Fri, May 3, 2024 at 4:33 PM Peter Xu  wrote:
> >
> > On Fri, May 03, 2024 at 08:40:03AM +0200, Jinpu Wang wrote:
> > > I had a brief check in the rsocket changelog, there seems some
> > > improvement over time,
> > >  might be worth revisiting this. due to socket abstraction, we can't
> > > use some feature like
> > >  ODP, it won't be a small and easy task.
> >
> > It'll be good to know whether Dan's suggestion would work first, without
> > rewritting everything yet so far.  Not sure whether some perf test could
> > help with the rsocket APIs even without QEMU's involvements (or looking for
> > test data supporting / invalidate such conversions).
> >
> I did a quick test with iperf on 100 G environment and 40 G
> environment, in summary rsocket works pretty well.
> 
> iperf tests between 2 hosts with 40 G (IB),
> first  a few test with different num. of threads on top of ipoib
> interface, later with preload rsocket on top of same ipoib interface.
> 
> jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145
> 
> Client connecting to 10.43.3.145, TCP port 52000
> TCP window size:  165 KByte (default)
> 
> [  3] local 10.43.3.146 port 55602 connected with 10.43.3.145 port 52000
> [ ID] Interval   Transfer Bandwidth
> [  3] 0.-10.0001 sec  2.85 GBytes  2.44 Gbits/sec
> jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 2
> 
> Client connecting to 10.43.3.145, TCP port 52000
> TCP window size:  165 KByte (default)
> 
> [  4] local 10.43.3.146 port 39640 connected with 10.43.3.145 port 52000
> [  3] local 10.43.3.146 port 39626 connected with 10.43.3.145 port 52000
> [ ID] Interval   Transfer Bandwidth
> [  3] 0.-10.0012 sec  2.85 GBytes  2.45 Gbits/sec
> [  4] 0.-10.0026 sec  2.86 GBytes  2.45 Gbits/sec
> [SUM] 0.-10.0026 sec  5.71 GBytes  4.90 Gbits/sec
> [ CT] final connect times (min/avg/max/stdev) =
> 0.281/0.300/0.318/0.318 ms (tot/err) = 2/0
> jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 4
> 
> Client connecting to 10.43.3.145, TCP port 52000
> TCP window size:  165 KByte (default)
> 
> [  4] local 10.43.3.146 port 46956 connected with 10.43.3.145 port 52000
> [  6] local 10.43.3.146 port 46978 connected with 10.43.3.145 port 52000
> [  3] local 10.43.3.146 port 46944 connected with 10.43.3.145 port 52000
> [  5] local 10.43.3.146 port 46962 connected with 10.43.3.145 port 52000
> [ ID] Interval   Transfer Bandwidth
> [  3] 0.0000-10.0017 sec  2.85 GBytes  2.45 Gbits/sec
> [  4] 0.0000-10.0015 sec  2.85 GBytes  2.45 Gbits/sec
> [  5] 0.0000-10.0026 sec  2.85 GBytes  2.45 Gbits/sec
> [  6] 0.0000-10.0005 sec  2.85 GBytes  2.45 Gbits/sec
> [SUM] 0.0000-10.0005 sec  11.4 GBytes  9.80 Gbits/sec
> [ CT] final connect times (min/avg/max/stdev) =
> 0.274/0.312/0.360/0.212 ms (tot/err) = 4/0
> jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 8
> 
> Client connecting to 10.43.3.145, TCP port 52000
> TCP window size:  165 KByte (default)
> 
> [  7] local 10.43.3.146 port 35062 connected with 10.43.3.145 port 52000
> [  6] local 10.43.3.146 port 35058 connected with 10.43.3.145 port 52000
> [  8] local 10.43.3.146 port 35066 connected with 10.43.3.145 port 52000
> [  9] local 10.43.3.146 port 35074 connected with 10.43.3.145 port 52000
> [  3] local 10.43.3.146 port 35038 connected with 10.43.3.145 port 52000
> [ 12] local 10.43.3.146 port 35088 connected with 10.43.3.145 port 52000
> [  5] local 10.43.3.146 port 35048 connected with 10.43.3.145 port 52000
> [  4] local 10.43.3.146 port 35050 connected with 10.43.3.145 port 52000
> [ ID] Interval   Transfer Bandwidth
> [  4] 0.0000-10.0005 sec  2.85 GBytes  2.44 Gbits/sec
> [  8] 0.0000-10.0011 sec  2.85 GBytes  2.45 Gbits/sec
> [  5] 0.0000-10.0000 sec  2.85 GBytes  2.45 Gbits/sec
> [ 12] 0.0000-10.0021 sec  2.85 GBytes  2.44 Gbits/sec
> [  3] 0.0000-10.0003 sec  2.85 GBytes  2.44 Gbits/sec
> [  7] 0.0000-10.0065 sec  2.50 GBytes  2.14 Gbits/sec
> [  9] 0.0000-10.0077 sec  2.52 GBytes  2.16 Gbits/sec
> [  6] 0.0000-10.0003 sec  2.85 GBytes  2.44 Gbits/sec
> [SUM] 0.0000-10.0003 sec  22.1 GBytes  19.0 Gbits/sec
> [ CT] final connect times (min/avg/max/stdev) =
> 0.096/0.226/0.339/0.109 ms 

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 02:06:28AM +, Gonglei (Arei) wrote:
> Hi, Peter

Hey, Lei,

Happy to see you around again after years.

> RDMA features high bandwidth, low latency (in a non-blocking lossless
> network), and direct remote memory access that bypasses the CPU (as you
> know, CPU resources are expensive for cloud vendors, which is one of the
> reasons why we introduced offload cards), which TCP does not have.

It's another cost to use offload cards, vs. preparing more CPU resources?

> In some scenarios where fast live migration is needed (extremely short
> interruption and migration duration), RDMA is very useful. To this
> end, we have also developed RDMA support for multifd.

Will any of you upstream that work?  I'm curious how intrusive it would be
when adding it to multifd; if it can keep only 5 exported functions like
what rdma.h does right now, it'll be pretty nice.  We also want to make sure
it works with arbitrarily sized loads and buffers, e.g. vfio is considering
adding IO loads to multifd channels too.

One thing to note is that the question here is not only about a pure
performance comparison between rdma and nics.  It's about helping us make a
decision on whether to drop rdma; iow, even if rdma performs well, the
community still has the right to drop it if nobody can actively work on and
maintain it.  It's just that if nics can perform as well, that's more of a
reason to drop it, unless companies can help to provide good support and
work together.

Thanks,

-- 
Peter Xu




Re: [PATCH 2/3] vfio/migration: Emit VFIO device migration state change QAPI event

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 11:38:00AM -0300, Fabiano Rosas wrote:
> Markus Armbruster  writes:
> 
> > Peter, Fabiano, I'd like to hear your opinion on the issue discussed
> > below.
> >
> > Avihai Horon  writes:
> >
> >> On 02/05/2024 13:22, Joao Martins wrote:
> >>> External email: Use caution opening links or attachments
> >>>
> >>>
> >>> On 01/05/2024 13:28, Avihai Horon wrote:
>  On 01/05/2024 14:50, Joao Martins wrote:
> > External email: Use caution opening links or attachments
> >
> >
> > On 30/04/2024 06:16, Avihai Horon wrote:
> >> Emit VFIO device migration state change QAPI event when a VFIO device
> >> changes its migration state. This can be used by management 
> >> applications
> >> to get updates on the current state of the VFIO device for their own
> >> purposes.
> >>
> >> A new per VFIO device capability, "migration-events", is added so 
> >> events
> >> can be enabled only for the required devices. It is disabled by 
> >> default.
> >>
> >> Signed-off-by: Avihai Horon
> >> ---
> >>include/hw/vfio/vfio-common.h |  1 +
> >>hw/vfio/migration.c   | 44 
> >> +++
> >>hw/vfio/pci.c |  2 ++
> >>3 files changed, 47 insertions(+)
> >>
> >> diff --git a/include/hw/vfio/vfio-common.h 
> >> b/include/hw/vfio/vfio-common.h
> >> index b9da6c08ef..3ec5f2425e 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -115,6 +115,7 @@ typedef struct VFIODevice {
> >>bool no_mmap;
> >>bool ram_block_discard_allowed;
> >>OnOffAuto enable_migration;
> >> +bool migration_events;
> >>VFIODeviceOps *ops;
> >>unsigned int num_irqs;
> >>unsigned int num_regions;
> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> >> index 06ae40969b..6bbccf6545 100644
> >> --- a/hw/vfio/migration.c
> >> +++ b/hw/vfio/migration.c
> >> @@ -24,6 +24,7 @@
> >>#include "migration/register.h"
> >>#include "migration/blocker.h"
> >>#include "qapi/error.h"
> >> +#include "qapi/qapi-events-vfio.h"
> >>#include "exec/ramlist.h"
> >>#include "exec/ram_addr.h"
> >>#include "pci.h"
> >> @@ -80,6 +81,46 @@ static const char *mig_state_to_str(enum
> >> vfio_device_mig_state state)
> >>}
> >>}
> >>
> >> +static VFIODeviceMigState
> >> +mig_state_to_qapi_state(enum vfio_device_mig_state state)
> >> +{
> >> +switch (state) {
> >> +case VFIO_DEVICE_STATE_STOP:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_STOP;
> >> +case VFIO_DEVICE_STATE_RUNNING:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_RUNNING;
> >> +case VFIO_DEVICE_STATE_STOP_COPY:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_STOP_COPY;
> >> +case VFIO_DEVICE_STATE_RESUMING:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_RESUMING;
> >> +case VFIO_DEVICE_STATE_RUNNING_P2P:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_RUNNING_P2P;
> >> +case VFIO_DEVICE_STATE_PRE_COPY:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_PRE_COPY;
> >> +case VFIO_DEVICE_STATE_PRE_COPY_P2P:
> >> +return QAPI_VFIO_DEVICE_MIG_STATE_PRE_COPY_P2P;
> >> +default:
> >> +g_assert_not_reached();
> >> +}
> >> +}
> >> +
> >> +static void vfio_migration_send_state_change_event(VFIODevice 
> >> *vbasedev)
> >> +{
> >> +VFIOMigration *migration = vbasedev->migration;
> >> +const char *id;
> >> +Object *obj;
> >> +
> >> +if (!vbasedev->migration_events) {
> >> +return;
> >> +}
> >> +
> > Shouldn't this leap frog migrate_events() capability instead of 
> > introducing its
> > vfio equivalent i.e.
> >
> >   if (!migrate_events()) {
> >   return;
> >   }
> >
> > ?
> 
>  I used a per VFIO device cap so the events can be fine-tuned for each
>  device (maybe one device should send events while the other should not).
>  This gives the most flexibility and I don't think it complicates the
>  configuration (one downside, though, is that it can't be enabled/disabled
>  dynamically during runtime).
> 
> >>> Right.
> >>>
>  I don't think events add much overhead, so if you prefer a global cap, I 
>  can
>  change it.
>  However, I'm not sure overloading the existing migrate_events() is valid?
> 
> >>> migration 'events' capability just means we will have some migration
> >>> events emitted via the QAPI monitor for: 1) general global status and
> >>> 2) for each migration pass (both with different event names).
> >>
> >> Yes, it's already overloaded.
> >>
> >> In migration QAPI it 

[PATCH 5/6] vhost, vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits

2024-05-06 Thread Jonah Palmer via
Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost
devices.

The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these
devices ensures that the backend is capable of offering and providing
support for this feature, and that it can be disabled if the backend
does not support it.

Tested-by: Lei Yang 
Acked-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 hw/block/vhost-user-blk.c| 1 +
 hw/net/vhost_net.c   | 2 ++
 hw/scsi/vhost-scsi.c | 1 +
 hw/scsi/vhost-user-scsi.c| 1 +
 hw/virtio/vhost-user-fs.c| 1 +
 hw/virtio/vhost-user-vsock.c | 1 +
 net/vhost-vdpa.c | 1 +
 7 files changed, 8 insertions(+)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 9e6bbc6950..1dd0a8ef63 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -51,6 +51,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index fd1a93701a..eb0b1c06e5 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_HASH_REPORT,
 VHOST_INVALID_FEATURE_BIT
 };
@@ -76,6 +77,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_RSS,
 VIRTIO_NET_F_HASH_REPORT,
 VIRTIO_NET_F_GUEST_USO4,
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index ae26bc19a4..40e7630191 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index a63b1f4948..1d59951ab7 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -36,6 +36,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index cca2cd41be..9243dbb128 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -33,6 +33,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 
 VHOST_INVALID_FEATURE_BIT
 };
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
index 9431b9792c..cc7e4e47b4 100644
--- a/hw/virtio/vhost-user-vsock.c
+++ b/hw/virtio/vhost-user-vsock.c
@@ -21,6 +21,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_INDIRECT_DESC,
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_F_NOTIFY_ON_EMPTY,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 85e73dd6a7..ed3185acfa 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
 VIRTIO_F_VERSION_1,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_CSUM,
 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
 VIRTIO_NET_F_CTRL_MAC_ADDR,
-- 
2.39.3




[PATCH 0/6] virtio,vhost: Add VIRTIO_F_IN_ORDER support

2024-05-06 Thread Jonah Palmer
The goal of these patches is to add support to a variety of virtio and
vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature
indicates that all buffers are used by the device in the same order in
which they were made available by the driver.

These patches attempt to implement a generalized, non-device-specific
solution to support this feature.

The core feature behind this solution is a buffer mechanism in the form
of a VirtQueue's used_elems VirtQueueElement array. This allows devices
that always use buffers in-order by default to have minimal overhead.
Devices that may not always use buffers in-order will likely experience
a performance hit; how large that hit is will depend on how frequently
elements are completed out-of-order.

A VirtQueue whose device uses this feature will use its used_elems
VirtQueueElement array to hold used VirtQueueElements. The index at which
a used element is placed in used_elems is the same index on the
used/descriptor ring that would satisfy the in-order requirement. In
other words, used elements are placed in their in-order locations on
used_elems and are only written to the used/descriptor ring once the
preceding elements in used_elems have been used, so the expected order
can continue.

To differentiate between a "used" and "unused" element on the used_elems
array (a "used" element being an element that has returned from
processing and an "unused" element being an element that has not yet
been processed), we added a boolean 'filled' member to the
VirtQueueElement struct. This flag is set to true when the element comes
back from processing (virtqueue_ordered_fill) and then set back to false
once it's been written to the used/descriptor ring
(virtqueue_ordered_flush).
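
To make the flow above concrete, here is a minimal standalone sketch of the
buffering idea (illustration only: the toy_fill()/toy_flush() names, the
fixed ring size and the printf output are invented for this sketch, and the
real patches additionally track ndescs per element and handle the split and
packed ring layouts):

#include <stdbool.h>
#include <stdio.h>

#define RING_NUM 8

struct toy_elem {
    unsigned int id;     /* descriptor id of the buffer */
    unsigned int len;    /* bytes written by the device */
    bool filled;         /* completed but not yet flushed */
};

static struct toy_elem used_elems[RING_NUM]; /* in-order slots */
static unsigned int used_idx;                /* next slot to flush */

/* Device completes the element that belongs at in-order slot 'slot'. */
static void toy_fill(unsigned int slot, unsigned int id, unsigned int len)
{
    used_elems[slot].id = id;
    used_elems[slot].len = len;
    used_elems[slot].filled = true;
}

/* Flush every consecutive filled slot starting at used_idx. */
static void toy_flush(void)
{
    while (used_elems[used_idx].filled) {
        printf("expose to driver: slot %u id %u len %u\n",
               used_idx, used_elems[used_idx].id, used_elems[used_idx].len);
        used_elems[used_idx].filled = false;
        used_idx = (used_idx + 1) % RING_NUM;
    }
}

int main(void)
{
    /* Buffers were made available in order 0, 1, 2; the device finishes
     * slot 2 first, so nothing can be exposed yet. */
    toy_fill(2, 42, 512);
    toy_flush();            /* slot 0 not filled: no output */

    toy_fill(0, 40, 128);
    toy_fill(1, 41, 256);
    toy_flush();            /* slots 0, 1 and 2 now flush in order */
    return 0;
}

Out-of-order completions simply wait in used_elems until the earlier slots
are filled, which is the behaviour the patches below implement on top of the
real used/descriptor rings.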

---
v1: Move series from RFC to PATCH for submission.

Jonah Palmer (6):
  virtio: Add bool to VirtQueueElement
  virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support
  virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support
  virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support
  vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
  virtio: Add VIRTIO_F_IN_ORDER property definition

 hw/block/vhost-user-blk.c|   1 +
 hw/net/vhost_net.c   |   2 +
 hw/scsi/vhost-scsi.c |   1 +
 hw/scsi/vhost-user-scsi.c|   1 +
 hw/virtio/vhost-user-fs.c|   1 +
 hw/virtio/vhost-user-vsock.c |   1 +
 hw/virtio/virtio.c   | 118 ++-
 include/hw/virtio/virtio.h   |   5 +-
 net/vhost-vdpa.c |   1 +
 9 files changed, 127 insertions(+), 4 deletions(-)

-- 
2.39.3




[PATCH 3/6] virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support

2024-05-06 Thread Jonah Palmer
Add VIRTIO_F_IN_ORDER feature support for virtqueue_fill operations.

The goal of the virtqueue_fill operation, when the VIRTIO_F_IN_ORDER
feature has been negotiated, is to search the VirtQueue's used_elems
array for the now-used element, set its length, and mark the element as
filled.

Marking the element as filled indicates that it is ready to be flushed,
so long as the element is in-order.

Tested-by: Lei Yang 
Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index e6eb1bb453..064046b5e2 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -873,6 +873,28 @@ static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem,
 vq->used_elems[idx].ndescs = elem->ndescs;
 }
 
+static void virtqueue_ordered_fill(VirtQueue *vq, const VirtQueueElement *elem,
+   unsigned int len)
+{
+unsigned int i = vq->used_idx;
+
+/* Search for element in vq->used_elems */
+while (i != vq->last_avail_idx) {
+/* Found element, set length and mark as filled */
+if (vq->used_elems[i].index == elem->index) {
+vq->used_elems[i].len = len;
+vq->used_elems[i].filled = true;
+break;
+}
+
+i += vq->used_elems[i].ndescs;
+
+if (i >= vq->vring.num) {
+i -= vq->vring.num;
+}
+}
+}
+
 static void virtqueue_packed_fill_desc(VirtQueue *vq,
const VirtQueueElement *elem,
unsigned int idx,
@@ -923,7 +945,9 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
 return;
 }
 
-if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+virtqueue_ordered_fill(vq, elem, len);
+} else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
 virtqueue_packed_fill(vq, elem, len, idx);
 } else {
 virtqueue_split_fill(vq, elem, len, idx);
-- 
2.39.3




[PATCH 1/6] virtio: Add bool to VirtQueueElement

2024-05-06 Thread Jonah Palmer
Add the boolean 'filled' member to the VirtQueueElement structure. This
boolean signifies whether the element has come back from processing and
is still waiting to be written to the used / descriptor ring. It is used
to support the VIRTIO_F_IN_ORDER feature.

Tested-by: Lei Yang 
Signed-off-by: Jonah Palmer 
---
 include/hw/virtio/virtio.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 7d5ffdc145..9ed9c3763c 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -69,6 +69,7 @@ typedef struct VirtQueueElement
 unsigned int ndescs;
 unsigned int out_num;
 unsigned int in_num;
+bool filled;
 hwaddr *in_addr;
 hwaddr *out_addr;
 struct iovec *in_sg;
-- 
2.39.3




[PATCH 6/6] virtio: Add VIRTIO_F_IN_ORDER property definition

2024-05-06 Thread Jonah Palmer
Extend the virtio device property definitions to include the
VIRTIO_F_IN_ORDER feature.

The default state of this feature is disabled, allowing it to be
explicitly enabled where it's supported.

Tested-by: Lei Yang 
Acked-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 include/hw/virtio/virtio.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 9ed9c3763c..30c23400e3 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -370,7 +370,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
 DEFINE_PROP_BIT64("packed", _state, _field, \
   VIRTIO_F_RING_PACKED, false), \
 DEFINE_PROP_BIT64("queue_reset", _state, _field, \
-  VIRTIO_F_RING_RESET, true)
+  VIRTIO_F_RING_RESET, true), \
+DEFINE_PROP_BIT64("in_order", _state, _field, \
+  VIRTIO_F_IN_ORDER, false)
 
 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
 bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n);
-- 
2.39.3




[PATCH 2/6] virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support

2024-05-06 Thread Jonah Palmer
Add VIRTIO_F_IN_ORDER feature support in virtqueue_split_pop and
virtqueue_packed_pop.

VirtQueueElements popped from the available/descriptor ring are added to
the VirtQueue's used_elems array in-order and in the same fashion as
they would be added to the used and descriptor rings, respectively.

This will allow us to keep track of the current order, what elements
have been written, as well as an element's essential data after being
processed.

Tested-by: Lei Yang 
Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 893a072c9d..e6eb1bb453 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1506,7 +1506,7 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num)
 
 static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 {
-unsigned int i, head, max;
+unsigned int i, j, head, max;
 VRingMemoryRegionCaches *caches;
 MemoryRegionCache indirect_desc_cache;
 MemoryRegionCache *desc_cache;
@@ -1539,6 +1539,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 goto done;
 }
 
+j = vq->last_avail_idx;
+
if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
 goto done;
 }
@@ -1630,6 +1632,12 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 elem->in_sg[i] = iov[out_num + i];
 }
 
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+vq->used_elems[j].index = elem->index;
+vq->used_elems[j].len = elem->len;
+vq->used_elems[j].ndescs = elem->ndescs;
+}
+
 vq->inuse++;
 
 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
@@ -1758,6 +1766,13 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
 
 elem->index = id;
elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries;
+
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+vq->used_elems[vq->last_avail_idx].index = elem->index;
+vq->used_elems[vq->last_avail_idx].len = elem->len;
+vq->used_elems[vq->last_avail_idx].ndescs = elem->ndescs;
+}
+
 vq->last_avail_idx += elem->ndescs;
 vq->inuse += elem->ndescs;
 
-- 
2.39.3




[PATCH 4/6] virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support

2024-05-06 Thread Jonah Palmer
Add VIRTIO_F_IN_ORDER feature support for virtqueue_flush operations.

The goal of the virtqueue_flush operation when the VIRTIO_F_IN_ORDER
feature has been negotiated is to write elements to the used/descriptor
ring in-order and then update used_idx.

The function iterates through the VirtQueueElement used_elems array
in-order starting at vq->used_idx. If the element is valid (filled), the
element is written to the used/descriptor ring. This process continues
until we find an invalid (not filled) element.

If any elements were written, the used_idx is updated.

Tested-by: Lei Yang 
Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 75 +-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 064046b5e2..0efed2c88e 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1006,6 +1006,77 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
 }
 }
 
+static void virtqueue_ordered_flush(VirtQueue *vq)
+{
+unsigned int i = vq->used_idx;
+unsigned int ndescs = 0;
+uint16_t old = vq->used_idx;
+bool packed;
+VRingUsedElem uelem;
+
+packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED);
+
+if (packed) {
+if (unlikely(!vq->vring.desc)) {
+return;
+}
+} else if (unlikely(!vq->vring.used)) {
+return;
+}
+
+/* First expected in-order element isn't ready, nothing to do */
+if (!vq->used_elems[i].filled) {
+return;
+}
+
+/* Write first expected in-order element to used ring (split VQs) */
+if (!packed) {
+uelem.id = vq->used_elems[i].index;
+uelem.len = vq->used_elems[i].len;
+vring_used_write(vq, &uelem, i);
+}
+
+ndescs += vq->used_elems[i].ndescs;
+i += ndescs;
+if (i >= vq->vring.num) {
+i -= vq->vring.num;
+}
+
+/* Search for more filled elements in-order */
+while (vq->used_elems[i].filled) {
+if (packed) {
+virtqueue_packed_fill_desc(vq, &vq->used_elems[i], ndescs, false);
+} else {
+uelem.id = vq->used_elems[i].index;
+uelem.len = vq->used_elems[i].len;
+vring_used_write(vq, &uelem, i);
+}
+
+vq->used_elems[i].filled = false;
+ndescs += vq->used_elems[i].ndescs;
+i += ndescs;
+if (i >= vq->vring.num) {
+i -= vq->vring.num;
+}
+}
+
+if (packed) {
+virtqueue_packed_fill_desc(vq, &vq->used_elems[vq->used_idx], 0, true);
+vq->used_idx += ndescs;
+if (vq->used_idx >= vq->vring.num) {
+vq->used_idx -= vq->vring.num;
+vq->used_wrap_counter ^= 1;
+vq->signalled_used_valid = false;
+}
+} else {
+vring_used_idx_set(vq, i);
+if (unlikely((int16_t)(i - vq->signalled_used) < (uint16_t)(i - old))) {
+vq->signalled_used_valid = false;
+}
+}
+vq->inuse -= ndescs;
+}
+
 void virtqueue_flush(VirtQueue *vq, unsigned int count)
 {
 if (virtio_device_disabled(vq->vdev)) {
@@ -1013,7 +1084,9 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count)
 return;
 }
 
-if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+virtqueue_ordered_flush(vq);
+} else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
 virtqueue_packed_flush(vq, count);
 } else {
 virtqueue_split_flush(vq, count);
-- 
2.39.3



