Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
On 5/7/24 04:09, Duan, Zhenzhong wrote: -Original Message- From: Cédric Le Goater Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool On 5/6/24 10:33, Zhenzhong Duan wrote: Make VFIOIOMMUClass::attach_device() and its wrapper function vfio_attach_device() return bool. This is to follow the coding standand to return bool if 'Error **' is used to pass error. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan --- include/hw/vfio/vfio-common.h | 4 ++-- include/hw/vfio/vfio-container-base.h | 4 ++-- hw/vfio/ap.c | 6 ++ hw/vfio/ccw.c | 6 ++ hw/vfio/common.c | 4 ++-- hw/vfio/container.c | 14 +++--- hw/vfio/iommufd.c | 11 +-- hw/vfio/pci.c | 8 +++- hw/vfio/platform.c| 7 +++ 9 files changed, 28 insertions(+), 36 deletions(-) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- common.h index b9da6c08ef..a7b6fc8f46 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp); +bool vfio_attach_device(char *name, VFIODevice *vbasedev, +AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); int vfio_kvm_device_add_fd(int fd, Error **errp); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio- container-base.h index 3582d5f97a..c839cfd9cb 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -118,8 +118,8 @@ struct VFIOIOMMUClass { int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); -int (*attach_device)(const char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp); +bool (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, 
Error **errp); void (*detach_device)(VFIODevice *vbasedev); /* migration feature */ int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 7c4caa5938..d50600b702 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -156,7 +156,6 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, static void vfio_ap_realize(DeviceState *dev, Error **errp) { ERRP_GUARD(); -int ret; Error *err = NULL; VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = >vdev; @@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } -ret = vfio_attach_device(vbasedev->name, vbasedev, - _space_memory, errp); -if (ret) { +if (!vfio_attach_device(vbasedev->name, vbasedev, +_space_memory, errp)) { goto error; } diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 90e4a53437..782bd4bed7 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); VFIODevice *vbasedev = >vdev; Error *err = NULL; -int ret; /* Call the class init function for subchannel. 
*/ if (cdc->realize) { @@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) return; } -ret = vfio_attach_device(cdev->mdevid, vbasedev, - _space_memory, errp); -if (ret) { +if (!vfio_attach_device(cdev->mdevid, vbasedev, +_space_memory, errp)) { goto out_attach_dev_err; } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 8f9cbdc026..890d30910e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1492,8 +1492,8 @@ retry: return info; } -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +bool vfio_attach_device(char *name, VFIODevice *vbasedev, +AddressSpace *as, Error **errp) { const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); I think vfio_attach_device() can be cleaned up a little further : ret = ops->attach_device(name, vbasedev, as, errp); if (ret < 0) { return ret; } Not understand this. I have both ops->attach_device() and vfio_attach_device() return
[PATCH v2 0/3] Upgrade ACPI SPCR table to support SPCR table version 4 format
Update the SPCR table to accommodate the SPCR Table version 4 [1]. The SPCR table has been modified to adhere to the version 4 format [2]. Meanwhile, the virt SPCR golden reference files have been updated to accommodate the SPCR Table version 4. This patch series depends on Sunil's patch series [3], where Bios-Table-Test is now supported by both ARM and RISC-V. [1]: https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table [2]: https://github.com/acpica/acpica/pull/931 [3]: https://lore.kernel.org/all/20240315130519.2378765-1-suni...@ventanamicro.com/ Changes in v2: - Utilizes a three-patch approach to modify the ACPI pre-built binary files required by the Bios-Table-Test. - Rebases and incorporates changes to support both ARM and RISC-V ACPI pre-built binary files. Sia Jee Heng (3): qtest: allow SPCR acpi table changes hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4 format tests/qtest/bios-tables-test: Update virt SPCR golden references hw/acpi/aml-build.c | 14 +++--- hw/arm/virt-acpi-build.c | 10 -- hw/riscv/virt-acpi-build.c| 12 +--- include/hw/acpi/acpi-defs.h | 7 +-- include/hw/acpi/aml-build.h | 2 +- tests/data/acpi/virt/aarch64/SPCR | Bin 80 -> 90 bytes tests/data/acpi/virt/riscv64/SPCR | Bin 80 -> 90 bytes 7 files changed, 34 insertions(+), 11 deletions(-) -- 2.34.1
[PATCH v2 3/3] tests/qtest/bios-tables-test: Update virt SPCR golden references
Update the virt SPCR golden reference files to accommodate the SPCR Table version 4 [1], utilizing the iasl binary compiled from the latest ACPICA repository. The SPCR table has been modified to adhere to the version 4 format [2]. [1]: https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table [2]: https://github.com/acpica/acpica/pull/931 Diffs from iasl: /* * Intel ACPI Component Architecture * AML/ASL+ Disassembler version 20240322 (64-bit version) * Copyright (c) 2000 - 2023 Intel Corporation * - * Disassembly of tests/data/acpi/virt/aarch64/SPCR + * Disassembly of /tmp/aml-DVTAN2 * * ACPI Data Table [SPCR] * * Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue (in hex) */ [000h 004h] Signature : "SPCR"[Serial Port Console Redirection Table] -[004h 0004 004h]Table Length : 0050 -[008h 0008 001h]Revision : 02 -[009h 0009 001h]Checksum : B1 +[004h 0004 004h]Table Length : 005A +[008h 0008 001h]Revision : 04 +[009h 0009 001h]Checksum : 1D [00Ah 0010 006h] Oem ID : "BOCHS " [010h 0016 008h]Oem Table ID : "BXPC" [018h 0024 004h]Oem Revision : 0001 [01Ch 0028 004h] Asl Compiler ID : "BXPC" [020h 0032 004h] Asl Compiler Revision : 0001 [024h 0036 001h] Interface Type : 03 [025h 0037 003h]Reserved : 00 [028h 0040 00Ch]Serial Port Register : [Generic Address Structure] [028h 0040 001h]Space ID : 00 [SystemMemory] [029h 0041 001h] Bit Width : 20 [02Ah 0042 001h] Bit Offset : 00 [02Bh 0043 001h]Encoded Access Width : 03 [DWord Access:32] [02Ch 0044 008h] Address : 0900 [035h 0053 001h] PCAT-compatible IRQ : 00 [036h 0054 004h] Interrupt : 0021 [03Ah 0058 001h] Baud Rate : 03 [03Bh 0059 001h] Parity : 00 [03Ch 0060 001h] Stop Bits : 01 [03Dh 0061 001h]Flow Control : 02 [03Eh 0062 001h] Terminal Type : 00 [03Fh 0063 001h]Language : 00 [040h 0064 002h] PCI Device ID : [042h 0066 002h] PCI Vendor ID : [044h 0068 001h] PCI Bus : 00 [045h 0069 001h] PCI Device : 00 [046h 0070 001h]PCI Function : 00 [047h 0071 004h] 
PCI Flags : [04Bh 0075 001h] PCI Segment : 00 [04Ch 0076 004h] Uart Clock Freq : -/ ACPI table terminates in the middle of a data structure! (dump table) -CurrentOffset: 50, TableLength: 50 ***/ \ No newline at end of file +[050h 0080 004h] Precise Baud rate : +[054h 0084 002h] NameSpaceStringLength : 0002 +[056h 0086 002h] NameSpaceStringOffset : 0058 +[058h 0088 002h] NamespaceString : "." + +Raw Table Data: Length 90 (0x5A) + +: 53 50 43 52 5A 00 00 00 04 1D 42 4F 43 48 53 20 // SPCRZ.BOCHS +0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43 // BXPCBXPC +0020: 01 00 00 00 03 00 00 00 00 20 00 03 00 00 00 09 // . .. +0030: 00 00 00 00 08 00 21 00 00 00 03 00 01 02 00 00 // ..!. +0040: FF FF FF FF 00 00 00 00 00 00 00 00 00 00 00 00 // +0050: 00 00 00 00 02 00 58 00 2E 00// ..X... /* * Intel ACPI Component Architecture * AML/ASL+ Disassembler version 20200925 (64-bit version) * Copyright (c) 2000 - 2020 Intel Corporation * - * Disassembly of tests/data/acpi/virt/riscv64/SPCR, Mon May 6 05:34:22 2024 + * Disassembly of /tmp/aml-27E8M2, Mon May 6 05:34:22 2024 * * ACPI Data Table [SPCR] * * Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue */ [000h 4]Signature : "SPCR"[Serial Port Console Redirection table] -[004h 0004 4] Table Length : 0050 -[008h 0008 1] Revision : 02 -[009h 0009 1] Checksum : B9 +[004h 0004 4] Table Length : 005A +[008h 0008 1] Revision : 04 +[009h 0009 1] Checksum : 25 [00Ah 0010 6] Oem ID : "BOCHS " [010h 0016 8] Oem Table ID : "BXPC" [018h 0024 4] Oem Revision : 0001 [01Ch 0028 4] Asl Compiler ID : "BXPC" [020h 0032 4]Asl Compiler Revision : 0001 [024h 0036 1] Interface Type : 00 [025h 0037 3] Reserved : 00 [028h 0040 12] Serial Port Register : [Generic Address Structure] [028h 0040 1]
[PATCH v2 1/3] qtest: allow SPCR acpi table changes
Signed-off-by: Sia Jee Heng --- tests/qtest/bios-tables-test-allowed-diff.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..3f12ca546b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,3 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/virt/riscv64/SPCR", +"tests/data/acpi/virt/aarch64/SPCR", -- 2.34.1
[PATCH v2 2/3] hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4 format
Update the SPCR table to accommodate the SPCR Table version 4 [1]. The SPCR table has been modified to adhere to the version 4 format [2]. [1]: https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table [2]: https://github.com/acpica/acpica/pull/931 Signed-off-by: Sia Jee Heng --- hw/acpi/aml-build.c | 14 +++--- hw/arm/virt-acpi-build.c| 10 -- hw/riscv/virt-acpi-build.c | 12 +--- include/hw/acpi/acpi-defs.h | 7 +-- include/hw/acpi/aml-build.h | 2 +- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 6d4517cfbe..7c43573eef 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -1996,7 +1996,7 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, void build_spcr(GArray *table_data, BIOSLinker *linker, const AcpiSpcrData *f, const uint8_t rev, -const char *oem_id, const char *oem_table_id) +const char *oem_id, const char *oem_table_id, const char *name) { AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id, .oem_table_id = oem_table_id }; @@ -2042,8 +2042,16 @@ void build_spcr(GArray *table_data, BIOSLinker *linker, build_append_int_noprefix(table_data, f->pci_flags, 4); /* PCI Segment */ build_append_int_noprefix(table_data, f->pci_segment, 1); -/* Reserved */ -build_append_int_noprefix(table_data, 0, 4); +/* UartClkFreq */ +build_append_int_noprefix(table_data, f->uart_clk_freq, 4); +/* PreciseBaudrate */ +build_append_int_noprefix(table_data, f->precise_baudrate, 4); +/* NameSpaceStringLength */ +build_append_int_noprefix(table_data, f->namespace_string_length, 2); +/* NameSpaceStringOffset */ +build_append_int_noprefix(table_data, f->namespace_string_offset, 2); +/* NamespaceString[] */ +g_array_append_vals(table_data, name, f->namespace_string_length); acpi_table_end(linker, ); } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 6a1bde61ce..cb345e8659 100644 --- a/hw/arm/virt-acpi-build.c +++ 
b/hw/arm/virt-acpi-build.c @@ -428,11 +428,12 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* * Serial Port Console Redirection Table (SPCR) - * Rev: 1.07 + * Rev: 1.10 */ static void spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { +const char name[] = "."; AcpiSpcrData serial = { .interface_type = 3, /* ARM PL011 UART */ .base_addr.id = AML_AS_SYSTEM_MEMORY, @@ -456,9 +457,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) .pci_function = 0, .pci_flags = 0, .pci_segment = 0, +.uart_clk_freq = 0, +.precise_baudrate = 0, +.namespace_string_length = sizeof(name), +.namespace_string_offset = 88, }; -build_spcr(table_data, linker, , 2, vms->oem_id, vms->oem_table_id); +build_spcr(table_data, linker, , 4, vms->oem_id, vms->oem_table_id, + name); } /* diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c index 0925528160..5fa3942491 100644 --- a/hw/riscv/virt-acpi-build.c +++ b/hw/riscv/virt-acpi-build.c @@ -176,14 +176,15 @@ acpi_dsdt_add_uart(Aml *scope, const MemMapEntry *uart_memmap, /* * Serial Port Console Redirection Table (SPCR) - * Rev: 1.07 + * Rev: 1.10 */ static void spcr_setup(GArray *table_data, BIOSLinker *linker, RISCVVirtState *s) { +const char name[] = "."; AcpiSpcrData serial = { -.interface_type = 0, /* 16550 compatible */ +.interface_type = 0x12, /* 16550 compatible */ .base_addr.id = AML_AS_SYSTEM_MEMORY, .base_addr.width = 32, .base_addr.offset = 0, @@ -205,9 +206,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, RISCVVirtState *s) .pci_function = 0, .pci_flags = 0, .pci_segment = 0, +.uart_clk_freq = 0, +.precise_baudrate = 0, +.namespace_string_length = sizeof(name), +.namespace_string_offset = 88, }; -build_spcr(table_data, linker, , 2, s->oem_id, s->oem_table_id); +build_spcr(table_data, linker, , 4, s->oem_id, s->oem_table_id, + name); } /* RHCT Node[N] starts at offset 56 */ diff --git a/include/hw/acpi/acpi-defs.h 
b/include/hw/acpi/acpi-defs.h index 0e6e82b339..2e6e341998 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -112,7 +112,6 @@ typedef struct AcpiSpcrData { uint8_t flow_control; uint8_t terminal_type; uint8_t language; -uint8_t reserved1; uint16_t pci_device_id;/* Must be 0x if not PCI device */ uint16_t pci_vendor_id;/* Must be 0x if not PCI device */ uint8_t pci_bus; @@ -120,7 +119,11 @@ typedef struct AcpiSpcrData {
Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
Hi Peter, hi Daniel, On Mon, May 6, 2024 at 5:29 PM Peter Xu wrote: > > On Mon, May 06, 2024 at 12:08:43PM +0200, Jinpu Wang wrote: > > Hi Peter, hi Daniel, > > Hi, Jinpu, > > Thanks for sharing this test results. Sounds like a great news. > > What's your plan next? Would it then be worthwhile / possible moving QEMU > into that direction? Would that greatly simplify rdma code as Dan > mentioned? I'm rather not familiar with QEMU migration yet, from the test result, I think it's a possible direction, just we need to at least based on a rather recent release like rdma-core v33 with proper 'fork' support. Maybe Dan or you could give more detail about what you have in mind for using rsocket as a replacement for the future. We will also look into the implementation details in the meantime. Thx! J > > Thanks, > > > > > On Fri, May 3, 2024 at 4:33 PM Peter Xu wrote: > > > > > > On Fri, May 03, 2024 at 08:40:03AM +0200, Jinpu Wang wrote: > > > > I had a brief check in the rsocket changelog, there seems some > > > > improvement over time, > > > > might be worth revisiting this. due to socket abstraction, we can't > > > > use some feature like > > > > ODP, it won't be a small and easy task. > > > > > > It'll be good to know whether Dan's suggestion would work first, without > > > rewritting everything yet so far. Not sure whether some perf test could > > > help with the rsocket APIs even without QEMU's involvements (or looking > > > for > > > test data supporting / invalidate such conversions). > > > > > I did a quick test with iperf on 100 G environment and 40 G > > environment, in summary rsocket works pretty well. > > > > iperf tests between 2 hosts with 40 G (IB), > > first a few test with different num. of threads on top of ipoib > > interface, later with preload rsocket on top of same ipoib interface. 
> > > > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 > > > > Client connecting to 10.43.3.145, TCP port 52000 > > TCP window size: 165 KByte (default) > > > > [ 3] local 10.43.3.146 port 55602 connected with 10.43.3.145 port 52000 > > [ ID] Interval Transfer Bandwidth > > [ 3] 0.-10.0001 sec 2.85 GBytes 2.44 Gbits/sec > > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 2 > > > > Client connecting to 10.43.3.145, TCP port 52000 > > TCP window size: 165 KByte (default) > > > > [ 4] local 10.43.3.146 port 39640 connected with 10.43.3.145 port 52000 > > [ 3] local 10.43.3.146 port 39626 connected with 10.43.3.145 port 52000 > > [ ID] Interval Transfer Bandwidth > > [ 3] 0.-10.0012 sec 2.85 GBytes 2.45 Gbits/sec > > [ 4] 0.-10.0026 sec 2.86 GBytes 2.45 Gbits/sec > > [SUM] 0.-10.0026 sec 5.71 GBytes 4.90 Gbits/sec > > [ CT] final connect times (min/avg/max/stdev) = > > 0.281/0.300/0.318/0.318 ms (tot/err) = 2/0 > > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 4 > > > > Client connecting to 10.43.3.145, TCP port 52000 > > TCP window size: 165 KByte (default) > > > > [ 4] local 10.43.3.146 port 46956 connected with 10.43.3.145 port 52000 > > [ 6] local 10.43.3.146 port 46978 connected with 10.43.3.145 port 52000 > > [ 3] local 10.43.3.146 port 46944 connected with 10.43.3.145 port 52000 > > [ 5] local 10.43.3.146 port 46962 connected with 10.43.3.145 port 52000 > > [ ID] Interval Transfer Bandwidth > > [ 3] 0.-10.0017 sec 2.85 GBytes 2.45 Gbits/sec > > [ 4] 0.-10.0015 sec 2.85 GBytes 2.45 Gbits/sec > > [ 5] 0.-10.0026 sec 2.85 GBytes 2.45 Gbits/sec > > [ 6] 0.-10.0005 sec 2.85 GBytes 2.45 Gbits/sec > > [SUM] 0.-10.0005 sec 11.4 GBytes 9.80 Gbits/sec > > [ CT] final connect times (min/avg/max/stdev) = > > 0.274/0.312/0.360/0.212 ms (tot/err) = 4/0 > > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 8 > > > > Client connecting to 10.43.3.145, TCP port 52000 > > TCP window size: 165 KByte (default) > > > > [ 7] local 10.43.3.146 port 
35062 connected with 10.43.3.145 port 52000 > > [ 6] local 10.43.3.146 port 35058 connected with 10.43.3.145 port 52000 > > [ 8] local 10.43.3.146 port 35066 connected with 10.43.3.145 port 52000 > > [ 9] local 10.43.3.146 port 35074 connected with 10.43.3.145 port 52000 > > [ 3] local 10.43.3.146 port 35038 connected with 10.43.3.145 port 52000 > > [ 12] local 10.43.3.146 port 35088 connected with 10.43.3.145 port 52000 > > [ 5] local 10.43.3.146 port 35048 connected with 10.43.3.145 port 52000 > > [ 4] local 10.43.3.146 port 35050 connected with
Re: [PATCH 1/2] ppc/pnv: Begin a more complete ADU LPC model for POWER9/10
On Fri May 3, 2024 at 3:44 PM AEST, Cédric Le Goater wrote: > On 5/3/24 06:51, Nicholas Piggin wrote: > > On Thu May 2, 2024 at 6:47 PM AEST, Cédric Le Goater wrote: > >> On 5/1/24 14:39, Nicholas Piggin wrote: > >>> On Wed Apr 17, 2024 at 9:25 PM AEST, Cédric Le Goater wrote: > Hello Nick, > > On 4/17/24 13:02, Nicholas Piggin wrote: > > This implements a framework for an ADU unit model. > > > > The ADU unit actually implements XSCOM, which is the bridge between MMIO > > and PIB. However it also includes control and status registers and other > > functions that are exposed as PIB (xscom) registers. > > > > To keep things simple, pnv_xscom.c remains the XSCOM bridge > > implementation, and pnv_adu.c implements the ADU registers and other > > functions. > > > > So far, just the ADU no-op registers in the pnv_xscom.c default handler > > are moved over to the adu model. > > > > Signed-off-by: Nicholas Piggin > > --- > > include/hw/ppc/pnv_adu.h | 34 > > include/hw/ppc/pnv_chip.h | 3 + > > include/hw/ppc/pnv_xscom.h | 6 ++ > > hw/ppc/pnv.c | 16 ++ > > hw/ppc/pnv_adu.c | 111 > > + > > hw/ppc/pnv_xscom.c | 9 --- > > hw/ppc/meson.build | 1 + > > hw/ppc/trace-events| 4 ++ > > 8 files changed, 175 insertions(+), 9 deletions(-) > > create mode 100644 include/hw/ppc/pnv_adu.h > > create mode 100644 hw/ppc/pnv_adu.c > > > > diff --git a/include/hw/ppc/pnv_adu.h b/include/hw/ppc/pnv_adu.h > > new file mode 100644 > > index 00..9dc91857a9 > > --- /dev/null > > +++ b/include/hw/ppc/pnv_adu.h > > @@ -0,0 +1,34 @@ > > +/* > > + * QEMU PowerPC PowerNV Emulation of some ADU behaviour > > + * > > + * Copyright (c) 2024, IBM Corporation. > > + * > > + * SPDX-License-Identifier: LGPL-2.1-or-later > > > Did you mean GPL-2.0-or-later ? > >>> > >>> Hey Cedric, > >>> > >>> Thanks for reviewing, I've been away so sorry for the late reply. > >>> > >>> It just came from one of the headers I copied which was LGPL. 
But > >>> there's really nothing much in it and could find a GPL header to > >>> copy. Is GPL-2.0-or-later preferred? > >> > >> I would since all pnv models are GPL. > > > > Some of pnv is actually LGPL. > > I was grepping for 'LGPL' and not 'Lesser' ... Indeed you are right. > Most files miss an SPDX-License-Identifier tag also. > > > That's okay I'll change to GPL. > > LGPL is more relaxed if the code needs to be used in libraries, but > I am not sure it applies to the PNV models. What would you prefer ? GPL seems to be more common and I don't see a need for LGPL here, so maybe GPL? We could probably switch all LGPL pnv over to GPL if we wanted to. I think LGPL permits such relicensing. Will leave this discussion for another time though. Thanks, Nick
Re: [PATCH] spapr: Migrate ail-mode-3 spapr cap
On 5/6/24 17:26, Nicholas Piggin wrote: This cap did not add the migration code when it was introduced. This results in migration failure when changing the default using the command line. Cc: qemu-sta...@nongnu.org Fixes: ccc5a4c5e10 ("spapr: Add SPAPR_CAP_AIL_MODE_3 for AIL mode 3 support for H_SET_MODE hcall") Signed-off-by: Nicholas Piggin Reviewed-by: Harsh Prateek Bora --- include/hw/ppc/spapr.h | 1 + hw/ppc/spapr.c | 1 + hw/ppc/spapr_caps.c| 1 + 3 files changed, 3 insertions(+) diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 4aaf23d28f..f6de3e9972 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -1004,6 +1004,7 @@ extern const VMStateDescription vmstate_spapr_cap_large_decr; extern const VMStateDescription vmstate_spapr_cap_ccf_assist; extern const VMStateDescription vmstate_spapr_cap_fwnmi; extern const VMStateDescription vmstate_spapr_cap_rpt_invalidate; +extern const VMStateDescription vmstate_spapr_cap_ail_mode_3; extern const VMStateDescription vmstate_spapr_wdt; static inline uint8_t spapr_get_cap(SpaprMachineState *spapr, int cap) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d2d1e310a3..065f58ec93 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2169,6 +2169,7 @@ static const VMStateDescription vmstate_spapr = { _spapr_cap_fwnmi, _spapr_fwnmi, _spapr_cap_rpt_invalidate, +_spapr_cap_ail_mode_3, _spapr_cap_nested_papr, NULL } diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 0a15415a1d..2f74923560 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -974,6 +974,7 @@ SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER); SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST); SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI); SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE); +SPAPR_CAP_MIG_STATE(ail_mode_3, SPAPR_CAP_AIL_MODE_3); void spapr_caps_init(SpaprMachineState *spapr) {
[PATCH v2 1/6] hw/loongarch: Refine acpi srat table for numa memory
One LoongArch virt machine platform, there is limitation for memory map information. The minimum memory size is 256M and minimum memory size for numa node0 is 256M also. With qemu numa qtest, it is possible that memory size of numa node0 is 128M. Limitations for minimum memory size for both total memory and numa node0 is removed for acpi srat table creation. Signed-off-by: Bibo Mao --- hw/loongarch/acpi-build.c | 58 +++ 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index e5ab1080af..d0247d93ee 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -165,8 +165,9 @@ static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { int i, arch_id, node_id; -uint64_t mem_len, mem_base; -int nb_numa_nodes = machine->numa_state->num_nodes; +hwaddr len, base, gap; +NodeInfo *numa_info; +int nodes, nb_numa_nodes = machine->numa_state->num_nodes; LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); MachineClass *mc = MACHINE_GET_CLASS(lams); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); @@ -195,35 +196,44 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ } -/* Node0 */ -build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, - 0, MEM_AFFINITY_ENABLED); -mem_base = VIRT_HIGHMEM_BASE; -if (!nb_numa_nodes) { -mem_len = machine->ram_size - VIRT_LOWMEM_SIZE; -} else { -mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE; +base = VIRT_LOWMEM_BASE; +gap = VIRT_LOWMEM_SIZE; +numa_info = machine->numa_state->nodes; +nodes = nb_numa_nodes; +if (!nodes) { +nodes = 1; } -if (mem_len) -build_srat_memory(table_data, mem_base, mem_len, 0, MEM_AFFINITY_ENABLED); - -/* Node1 - Nodemax */ -if (nb_numa_nodes) { -mem_base += mem_len; -for (i = 1; i < nb_numa_nodes; ++i) { -if (machine->numa_state->nodes[i].node_mem > 0) { 
-build_srat_memory(table_data, mem_base, - machine->numa_state->nodes[i].node_mem, i, - MEM_AFFINITY_ENABLED); -mem_base += machine->numa_state->nodes[i].node_mem; -} + +for (i = 0; i < nodes; i++) { +if (nb_numa_nodes) { +len = numa_info[i].node_mem; +} else { +len = machine->ram_size; +} + +/* + * memory for the node splited into two part + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ +if (len >= gap) { +build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); +len -= gap; +base = VIRT_HIGHMEM_BASE; +gap = machine->ram_size - VIRT_LOWMEM_SIZE; +} + +if (len) { +build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); +base += len; +gap -= len; } } if (machine->device_memory) { build_srat_memory(table_data, machine->device_memory->base, memory_region_size(>device_memory->mr), - nb_numa_nodes - 1, + nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } -- 2.39.3
[PATCH v2 6/6] tests/qtest: Add numa test for loongarch system
Add numa test case for loongarch system, it passes when run with the command "make check-qtest", after the following patch is applied. https://lore.kernel.org/all/20240319022606.2994565-1-maob...@loongson.cn/ Signed-off-by: Bibo Mao --- tests/qtest/meson.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index 6f2f594ace..a72436df65 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -256,6 +256,8 @@ qtests_s390x = \ qtests_riscv32 = \ (config_all_devices.has_key('CONFIG_SIFIVE_E_AON') ? ['sifive-e-aon-watchdog-test'] : []) +qtests_loongarch64 = ['numa-test'] + qos_test_ss = ss.source_set() qos_test_ss.add( 'ac97-test.c', -- 2.39.3
[PATCH v2 4/6] hw/loongarch: Refine system dram memory region
For system dram memory region, it is not necessary to use numa node information. There is only low memory region and high memory region. Remove numa node information for ddr memory region here, it can reduce memory region number about LoongArch virt machine. Signed-off-by: Bibo Mao --- hw/loongarch/virt.c | 55 ++--- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index aca8290795..5abfe0a9d7 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -974,18 +974,13 @@ static void loongarch_init(MachineState *machine) { LoongArchCPU *lacpu; const char *cpu_model = machine->cpu_type; -ram_addr_t offset = 0; -ram_addr_t ram_size = machine->ram_size; -uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); -int nb_numa_nodes = machine->numa_state->num_nodes; -NodeInfo *numa_info = machine->numa_state->nodes; int i; +hwaddr base, size, ram_size = machine->ram_size; const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; -char *ramName = NULL; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); @@ -1020,41 +1015,27 @@ static void loongarch_init(MachineState *machine) fdt_add_memory_nodes(machine); fw_cfg_add_memory(machine); -/* Node0 memory */ -memory_region_init_alias(>lowmem, NULL, "loongarch.node0.lowram", - machine->ram, offset, VIRT_LOWMEM_SIZE); -memory_region_add_subregion(address_space_mem, phyAddr, >lowmem); - -offset += VIRT_LOWMEM_SIZE; -if (nb_numa_nodes > 0) { -assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE); -highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE; -} else { -highram_size = ram_size - VIRT_LOWMEM_SIZE; +size = ram_size; +base = VIRT_LOWMEM_BASE; +if (size > VIRT_LOWMEM_SIZE) { +size = VIRT_LOWMEM_SIZE; } -phyAddr = VIRT_HIGHMEM_BASE; -memory_region_init_alias(>highmem, NULL, "loongarch.node0.highram", - machine->ram, offset, 
highram_size); -memory_region_add_subregion(address_space_mem, phyAddr, >highmem); - -/* Node1 - Nodemax memory */ -offset += highram_size; -phyAddr += highram_size; - -for (i = 1; i < nb_numa_nodes; i++) { -MemoryRegion *nodemem = g_new(MemoryRegion, 1); -ramName = g_strdup_printf("loongarch.node%d.ram", i); -memory_region_init_alias(nodemem, NULL, ramName, machine->ram, - offset, numa_info[i].node_mem); -memory_region_add_subregion(address_space_mem, phyAddr, nodemem); -offset += numa_info[i].node_mem; -phyAddr += numa_info[i].node_mem; + +memory_region_init_alias(>lowmem, NULL, "loongarch.lowram", + machine->ram, base, size); +memory_region_add_subregion(address_space_mem, base, >lowmem); +base += size; +if (ram_size - size) { +base = VIRT_HIGHMEM_BASE; +memory_region_init_alias(>highmem, NULL, "loongarch.highram", +machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size); +memory_region_add_subregion(address_space_mem, base, >highmem); +base += ram_size - size; } /* initialize device memory address space */ if (machine->ram_size < machine->maxram_size) { ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; -hwaddr device_mem_base; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -1068,9 +1049,7 @@ static void loongarch_init(MachineState *machine) "%d bytes", TARGET_PAGE_SIZE); exit(EXIT_FAILURE); } -/* device memory base is the top of high memory address. */ -device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB); -machine_memory_devices_init(machine, device_mem_base, device_mem_size); +machine_memory_devices_init(machine, base, device_mem_size); } /* load the BIOS image. */ -- 2.39.3
[PATCH v2 3/6] hw/loongarch: Refine fwcfg memory map
Memory map table for fwcfg is used for UEFI BIOS, UEFI BIOS uses the first entry from fwcfg memory map as the first memory HOB, the second memory HOB will be used if the first memory HOB is used up. Memory map table for fwcfg does not care about numa node, however in generic the first memory HOB is part of numa node0, so that runtime memory of UEFI which is allocated from the first memory HOB is located at numa node0. Signed-off-by: Bibo Mao --- hw/loongarch/virt.c | 60 ++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index db76bc94f8..aca8290795 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -914,6 +914,62 @@ static const MemoryRegionOps loongarch_qemu_ops = { }, }; +static void fw_cfg_add_memory(MachineState *ms) +{ +hwaddr base, size, ram_size, gap; +int nb_numa_nodes, nodes; +NodeInfo *numa_info; + +ram_size = ms->ram_size; +base = VIRT_LOWMEM_BASE; +gap = VIRT_LOWMEM_SIZE; +nodes = nb_numa_nodes = ms->numa_state->num_nodes; +numa_info = ms->numa_state->nodes; +if (!nodes) { +nodes = 1; +} + +/* add fw_cfg memory map of node0 */ +if (nb_numa_nodes) { +size = numa_info[0].node_mem; +} else { +size = ram_size; +} + +if (size >= gap) { +memmap_add_entry(base, gap, 1); +size -= gap; +base = VIRT_HIGHMEM_BASE; +gap = ram_size - VIRT_LOWMEM_SIZE; +} + +if (size) { +memmap_add_entry(base, size, 1); +base += size; +} + +if (nodes < 2) { +return; +} + +/* add fw_cfg memory map of other nodes */ +size = ram_size - numa_info[0].node_mem; +gap = VIRT_LOWMEM_BASE + VIRT_LOWMEM_SIZE; +if (base < gap && (base + size) > gap) { +/* + * memory map for the maining nodes splited into two part + * lowram: [base, +(gap - base)) + * highram: [VIRT_HIGHMEM_BASE, +(size - (gap - base))) + */ +memmap_add_entry(base, gap - base, 1); +size -= gap - base; +base = VIRT_HIGHMEM_BASE; +} + + if (size) +memmap_add_entry(base, size, 1); +} + static void loongarch_init(MachineState *machine) { LoongArchCPU *lacpu; @@ 
-962,9 +1018,9 @@ static void loongarch_init(MachineState *machine) fdt_add_cpu_nodes(lams); fdt_add_memory_nodes(machine); +fw_cfg_add_memory(machine); /* Node0 memory */ -memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); memory_region_init_alias(>lowmem, NULL, "loongarch.node0.lowram", machine->ram, offset, VIRT_LOWMEM_SIZE); memory_region_add_subregion(address_space_mem, phyAddr, >lowmem); @@ -977,7 +1033,6 @@ static void loongarch_init(MachineState *machine) highram_size = ram_size - VIRT_LOWMEM_SIZE; } phyAddr = VIRT_HIGHMEM_BASE; -memmap_add_entry(phyAddr, highram_size, 1); memory_region_init_alias(>highmem, NULL, "loongarch.node0.highram", machine->ram, offset, highram_size); memory_region_add_subregion(address_space_mem, phyAddr, >highmem); @@ -992,7 +1047,6 @@ static void loongarch_init(MachineState *machine) memory_region_init_alias(nodemem, NULL, ramName, machine->ram, offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); -memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); offset += numa_info[i].node_mem; phyAddr += numa_info[i].node_mem; } -- 2.39.3
[PATCH v2 5/6] hw/loongarch: Remove minimum and default memory size
Some qtest test cases such as numa use the default memory size of the generic machine class, which is 128M by default. Here the generic default memory size is used, and the minimum memory size check, which was originally 1G, is also removed. Signed-off-by: Bibo Mao --- hw/loongarch/virt.c | 5 - 1 file changed, 5 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 5abfe0a9d7..36477172ea 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -986,10 +986,6 @@ static void loongarch_init(MachineState *machine) cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); } -if (ram_size < 1 * GiB) { -error_report("ram_size must be greater than 1G."); -exit(1); -} create_fdt(lams); /* Create IOCSR space */ @@ -1284,7 +1280,6 @@ static void loongarch_class_init(ObjectClass *oc, void *data) mc->desc = "Loongson-3A5000 LS7A1000 machine"; mc->init = loongarch_init; -mc->default_ram_size = 1 * GiB; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; mc->max_cpus = LOONGARCH_MAX_CPUS; -- 2.39.3
[PATCH v2 0/6] hw/loongarch: Refine numa memory map
On the LoongArch virt machine platform, there is a limitation for memory map information. The minimum memory size is 256M and the minimum memory size for numa node0 is 256M also. With qemu numa qtest, it is possible that the memory size of numa node0 is 128M. Limitations for minimum memory size for both total memory and numa node0 are removed here, including the acpi srat table, fadt memory map table and fw_cfg memory map table. Also remove per-numa-node memory regions; there are only the low memory region and the high memory region. --- v1 ... v2: 1. Refresh the patch based on the latest qemu version, solve the conflict issue. 2. Add numa test case for loongarch system. --- Bibo Mao (6): hw/loongarch: Refine acpi srat table for numa memory hw/loongarch: Refine fadt memory table for numa memory hw/loongarch: Refine fwcfg memory map hw/loongarch: Refine system dram memory region hw/loongarch: Remove minimum and default memory size tests/qtest: Add numa test for loongarch system hw/loongarch/acpi-build.c | 58 +++-- hw/loongarch/virt.c | 167 +++--- tests/qtest/meson.build | 2 + 3 files changed, 154 insertions(+), 73 deletions(-) base-commit: 248f6f62df073a3b4158fd0093863ab885feabb5 -- 2.39.3
[PATCH v2 2/6] hw/loongarch: Refine fadt memory table for numa memory
One LoongArch virt machine platform, there is limitation for memory map information. The minimum memory size is 256M and minimum memory size for numa node0 is 256M also. With qemu numa qtest, it is possible that memory size of numa node0 is 128M. Limitations for minimum memory size for both total memory and numa node0 is removed for fadt numa memory table creation. Signed-off-by: Bibo Mao --- hw/loongarch/virt.c | 47 ++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index c0999878df..db76bc94f8 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -473,6 +473,48 @@ static void fdt_add_memory_node(MachineState *ms, g_free(nodename); } +static void fdt_add_memory_nodes(MachineState *ms) +{ +hwaddr base, size, ram_size, gap; +int i, nb_numa_nodes, nodes; +NodeInfo *numa_info; + +ram_size = ms->ram_size; +base = VIRT_LOWMEM_BASE; +gap = VIRT_LOWMEM_SIZE; +nodes = nb_numa_nodes = ms->numa_state->num_nodes; +numa_info = ms->numa_state->nodes; +if (!nodes) { +nodes = 1; +} + +for (i = 0; i < nodes; i++) { +if (nb_numa_nodes) { +size = numa_info[i].node_mem; +} else { +size = ram_size; +} + +/* + * memory for the node splited into two part + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ +if (size >= gap) { +fdt_add_memory_node(ms, base, gap, i); +size -= gap; +base = VIRT_HIGHMEM_BASE; +gap = ram_size - VIRT_LOWMEM_SIZE; +} + +if (size) { +fdt_add_memory_node(ms, base, size, i); +base += size; +gap -= size; +} +} +} + static void virt_build_smbios(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); @@ -919,9 +961,10 @@ static void loongarch_init(MachineState *machine) } fdt_add_cpu_nodes(lams); +fdt_add_memory_nodes(machine); + /* Node0 memory */ memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); -fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); memory_region_init_alias(>lowmem, NULL, "loongarch.node0.lowram", machine->ram, offset, 
VIRT_LOWMEM_SIZE); memory_region_add_subregion(address_space_mem, phyAddr, >lowmem); @@ -935,7 +978,6 @@ static void loongarch_init(MachineState *machine) } phyAddr = VIRT_HIGHMEM_BASE; memmap_add_entry(phyAddr, highram_size, 1); -fdt_add_memory_node(machine, phyAddr, highram_size, 0); memory_region_init_alias(>highmem, NULL, "loongarch.node0.highram", machine->ram, offset, highram_size); memory_region_add_subregion(address_space_mem, phyAddr, >highmem); @@ -951,7 +993,6 @@ static void loongarch_init(MachineState *machine) offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); -fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i); offset += numa_info[i].node_mem; phyAddr += numa_info[i].node_mem; } -- 2.39.3
Re: [PATCH v3 1/1] accel/kvm: Fix segmentation fault
on 5/7/2024 10:50 AM, Masato Imai wrote: > When the KVM acceleration parameter is not set, executing calc_dirty_rate > with the -r or -b option results in a segmentation fault due to accessing > a null kvm_state pointer in the kvm_dirty_ring_enabled function. This > commit adds a null check for kvm_status to prevent segmentation faults. > > Signed-off-by: Masato Imai LGTM, Tested-by: Li Zhijian > --- > accel/kvm/kvm-all.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index c0be9f5eed..544293be8a 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id) > > bool kvm_dirty_ring_enabled(void) > { > -return kvm_state->kvm_dirty_ring_size ? true : false; > +return kvm_state && kvm_state->kvm_dirty_ring_size; > } > > static void query_stats_cb(StatsResultList **result, StatsTarget target,
[PATCH v3 1/1] accel/kvm: Fix segmentation fault
When the KVM acceleration parameter is not set, executing calc_dirty_rate with the -r or -b option results in a segmentation fault due to accessing a null kvm_state pointer in the kvm_dirty_ring_enabled function. This commit adds a null check for kvm_state to prevent segmentation faults. Signed-off-by: Masato Imai --- accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index c0be9f5eed..544293be8a 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id) bool kvm_dirty_ring_enabled(void) { -return kvm_state->kvm_dirty_ring_size ? true : false; +return kvm_state && kvm_state->kvm_dirty_ring_size; } static void query_stats_cb(StatsResultList **result, StatsTarget target, -- 2.34.1
[PATCH v3 0/1] accel/kvm: Fix segmentation fault
Changes from v2: - avoid segfault in kvm/accel instead of migration/dirtyrate v2: https://lore.kernel.org/qemu-devel/20240423091306.754432-1-...@sfc.wide.ad.jp Masato Imai (1): accel/kvm: Fix segmentation fault accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- 2.34.1
RE: [PATCH v3 00/19] Add a host IOMMU device abstraction to check with vIOMMU
>-Original Message- >From: Jason Gunthorpe >Subject: Re: [PATCH v3 00/19] Add a host IOMMU device abstraction to >check with vIOMMU > >On Mon, May 06, 2024 at 02:30:47AM +, Duan, Zhenzhong wrote: > >> I'm not clear how useful multiple iommufd instances support are. >> One possible benefit is for security? It may bring a slightly fine-grained >> isolation in kernel. > >No. I don't think there is any usecase, it is only harmful. OK, so we need to limit QEMU to only one iommufd instance. In cdev series, we support mix of legacy and iommufd backend and multiple iommufd backend instances for flexibility. We need to make a choice to have this limitation only for nesting series or globally(including cdev). May I ask what harmfulness we may have? Thanks Zhenzhong
[PATCH] hw/loongarch/virt: Fix memory leak
The char pointer 'ramName' point to a block of memory, but never free it. Use 'g_autofree' to automatically free it. Resolves: Coverity CID 1544773 Fixes: 0cf1478d6 ("hw/loongarch: Add numa support") Signed-off-by: Song Gao --- hw/loongarch/virt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index c0999878df..ea5100be6d 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -887,7 +887,6 @@ static void loongarch_init(MachineState *machine) const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; -char *ramName = NULL; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); @@ -946,7 +945,7 @@ static void loongarch_init(MachineState *machine) for (i = 1; i < nb_numa_nodes; i++) { MemoryRegion *nodemem = g_new(MemoryRegion, 1); -ramName = g_strdup_printf("loongarch.node%d.ram", i); +g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i); memory_region_init_alias(nodemem, NULL, ramName, machine->ram, offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); -- 2.25.1
RE: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
>-Original Message- >From: Cédric Le Goater >Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and >its wrapper return bool > >On 5/6/24 10:33, Zhenzhong Duan wrote: >> Make VFIOIOMMUClass::attach_device() and its wrapper function >> vfio_attach_device() return bool. >> >> This is to follow the coding standand to return bool if 'Error **' >> is used to pass error. >> >> Suggested-by: Cédric Le Goater >> Signed-off-by: Zhenzhong Duan >> --- >> include/hw/vfio/vfio-common.h | 4 ++-- >> include/hw/vfio/vfio-container-base.h | 4 ++-- >> hw/vfio/ap.c | 6 ++ >> hw/vfio/ccw.c | 6 ++ >> hw/vfio/common.c | 4 ++-- >> hw/vfio/container.c | 14 +++--- >> hw/vfio/iommufd.c | 11 +-- >> hw/vfio/pci.c | 8 +++- >> hw/vfio/platform.c| 7 +++ >> 9 files changed, 28 insertions(+), 36 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- >common.h >> index b9da6c08ef..a7b6fc8f46 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region); >> void vfio_region_finalize(VFIORegion *region); >> void vfio_reset_handler(void *opaque); >> struct vfio_device_info *vfio_get_device_info(int fd); >> -int vfio_attach_device(char *name, VFIODevice *vbasedev, >> - AddressSpace *as, Error **errp); >> +bool vfio_attach_device(char *name, VFIODevice *vbasedev, >> +AddressSpace *as, Error **errp); >> void vfio_detach_device(VFIODevice *vbasedev); >> >> int vfio_kvm_device_add_fd(int fd, Error **errp); >> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio- >container-base.h >> index 3582d5f97a..c839cfd9cb 100644 >> --- a/include/hw/vfio/vfio-container-base.h >> +++ b/include/hw/vfio/vfio-container-base.h >> @@ -118,8 +118,8 @@ struct VFIOIOMMUClass { >> int (*dma_unmap)(const VFIOContainerBase *bcontainer, >>hwaddr iova, ram_addr_t size, >>IOMMUTLBEntry *iotlb); >> -int (*attach_device)(const char *name, VFIODevice *vbasedev, >> - AddressSpace 
*as, Error **errp); >> +bool (*attach_device)(const char *name, VFIODevice *vbasedev, >> + AddressSpace *as, Error **errp); >> void (*detach_device)(VFIODevice *vbasedev); >> /* migration feature */ >> int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, >> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c >> index 7c4caa5938..d50600b702 100644 >> --- a/hw/vfio/ap.c >> +++ b/hw/vfio/ap.c >> @@ -156,7 +156,6 @@ static void >vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, >> static void vfio_ap_realize(DeviceState *dev, Error **errp) >> { >> ERRP_GUARD(); >> -int ret; >> Error *err = NULL; >> VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); >> VFIODevice *vbasedev = >vdev; >> @@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error >**errp) >> return; >> } >> >> -ret = vfio_attach_device(vbasedev->name, vbasedev, >> - _space_memory, errp); >> -if (ret) { >> +if (!vfio_attach_device(vbasedev->name, vbasedev, >> +_space_memory, errp)) { >> goto error; >> } >> >> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c >> index 90e4a53437..782bd4bed7 100644 >> --- a/hw/vfio/ccw.c >> +++ b/hw/vfio/ccw.c >> @@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, >Error **errp) >> S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); >> VFIODevice *vbasedev = >vdev; >> Error *err = NULL; >> -int ret; >> >> /* Call the class init function for subchannel. 
*/ >> if (cdc->realize) { >> @@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, >Error **errp) >> return; >> } >> >> -ret = vfio_attach_device(cdev->mdevid, vbasedev, >> - _space_memory, errp); >> -if (ret) { >> +if (!vfio_attach_device(cdev->mdevid, vbasedev, >> +_space_memory, errp)) { >> goto out_attach_dev_err; >> } >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 8f9cbdc026..890d30910e 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -1492,8 +1492,8 @@ retry: >> return info; >> } >> >> -int vfio_attach_device(char *name, VFIODevice *vbasedev, >> - AddressSpace *as, Error **errp) >> +bool vfio_attach_device(char *name, VFIODevice *vbasedev, >> +AddressSpace *as, Error **errp) >> { >> const VFIOIOMMUClass *ops = >> >VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); > > >I think
RE: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
Hello, > -Original Message- > From: Peter Xu [mailto:pet...@redhat.com] > Sent: Monday, May 6, 2024 11:18 PM > To: Gonglei (Arei) > Cc: Daniel P. Berrangé ; Markus Armbruster > ; Michael Galaxy ; Yu Zhang > ; Zhijian Li (Fujitsu) ; Jinpu Wang > ; Elmar Gerdes ; > qemu-devel@nongnu.org; Yuval Shaia ; Kevin Wolf > ; Prasanna Kumar Kalever > ; Cornelia Huck ; > Michael Roth ; Prasanna Kumar Kalever > ; integrat...@gluster.org; Paolo Bonzini > ; qemu-bl...@nongnu.org; de...@lists.libvirt.org; > Hanna Reitz ; Michael S. Tsirkin ; > Thomas Huth ; Eric Blake ; Song > Gao ; Marc-André Lureau > ; Alex Bennée ; > Wainer dos Santos Moschetta ; Beraldo Leal > ; Pannengyuan ; > Xiexiangyou > Subject: Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling > > On Mon, May 06, 2024 at 02:06:28AM +, Gonglei (Arei) wrote: > > Hi, Peter > > Hey, Lei, > > Happy to see you around again after years. > Haha, me too. > > RDMA features high bandwidth, low latency (in non-blocking lossless > > network), and direct remote memory access by bypassing the CPU (As you > > know, CPU resources are expensive for cloud vendors, which is one of > > the reasons why we introduced offload cards.), which TCP does not have. > > It's another cost to use offload cards, v.s. preparing more cpu resources? > Software and hardware offload converged architecture is the way to go for all cloud vendors (Including comprehensive benefits in terms of performance, cost, security, and innovation speed), it's not just a matter of adding the resource of a DPU card. > > In some scenarios where fast live migration is needed (extremely short > > interruption duration and migration duration) is very useful. To this > > end, we have also developed RDMA support for multifd. > > Will any of you upstream that work? I'm curious how intrusive would it be > when adding it to multifd, if it can keep only 5 exported functions like what > rdma.h does right now it'll be pretty nice. 
We also want to make sure it > works > with arbitrary sized loads and buffers, e.g. vfio is considering to add IO > loads to > multifd channels too. > In fact, we sent the patchset to the community in 2021. Pls see: https://lore.kernel.org/all/20210203185906.GT2950@work-vm/T/ > One thing to note that the question here is not about a pure performance > comparison between rdma and nics only. It's about help us make a decision > on whether to drop rdma, iow, even if rdma performs well, the community still > has the right to drop it if nobody can actively work and maintain it. > It's just that if nics can perform as good it's more a reason to drop, unless > companies can help to provide good support and work together. > We are happy to provide the necessary review and maintenance work for RDMA if the community needs it. CC'ing Chuan Zheng. Regards, -Gonglei
Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation
On Mon, May 06, 2024, Mickaël Salaün wrote: > On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote: > > > --- > > > > > > Changes since v1: > > > * New patch. Making user space aware of Heki properties was requested by > > > Sean Christopherson. > > > > No, I suggested having userspace _control_ the pinning[*], not merely be > > notified > > of pinning. > > > > : IMO, manipulation of protections, both for memory (this patch) and CPU > > state > > : (control registers in the next patch) should come from userspace. I > > have no > > : objection to KVM providing plumbing if necessary, but I think userspace > > needs to > > : to have full control over the actual state. > > : > > : One of the things that caused Intel's control register pinning series to > > stall > > : out was how to handle edge cases like kexec() and reboot. Deferring to > > userspace > > : means the kernel doesn't need to define policy, e.g. when to unprotect > > memory, > > : and avoids questions like "should userspace be able to overwrite pinned > > control > > : registers". > > : > > : And like the confidential VM use case, keeping userspace in the loop is > > a big > > : beneifit, e.g. the guest can't circumvent protections by coercing > > userspace into > > : writing to protected memory. > > > > I stand by that suggestion, because I don't see a sane way to handle things > > like > > kexec() and reboot without having a _much_ more sophisticated policy than > > would > > ever be acceptable in KVM. > > > > I think that can be done without KVM having any awareness of CR pinning > > whatsoever. > > E.g. userspace just needs to ability to intercept CR writes and inject > > #GPs. Off > > the cuff, I suspect the uAPI could look very similar to MSR filtering. > > E.g. I bet > > userspace could enforce MSR pinning without any new KVM uAPI at all. 
> > > > [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com > > OK, I had concern about the control not directly coming from the guest, > especially in the case of pKVM and confidential computing, but I get you Hardware-based CoCo is completely out of scope, because KVM has zero visibility into the guest (well, SNP technically allows trapping CR0/CR4, but KVM really shouldn't intercept CR0/CR4 for SNP guests). And more importantly, _KVM_ doesn't define any policies for CoCo VMs. KVM might help enforce policies that are defined by hardware/firmware, but KVM doesn't define any of its own. If pKVM on x86 comes along, then KVM will likely get in the business of defining policy, but until that happens, KVM needs to stay firmly out of the picture. > point. It should indeed be quite similar to the MSR filtering on the > userspace side, except that we need another interface for the guest to > request such change (i.e. self-protection). > > Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but > forward the request to userspace with a VM exit instead? That would > also enable userspace to get the request and directly configure the CR > pinning with the same VM exit. No? Maybe? I strongly suspect that full support will need a richer set of APIs than a single hypercall. E.g. to handle kexec(), suspend+resume, emulated SMM, and so on and so forth. And that's just for CR pinning. And hypercalls are hampered by the fact that VMCALL/VMMCALL don't allow for delegation or restriction, i.e. there's no way for the guest to communicate to the hypervisor that a less privileged component is allowed to perform some action, nor is there a way for the guest to say some chunk of CPL0 code *isn't* allowed to request transition. Delegation and restriction all has to be done out-of-band. It'd probably be more annoying to setup initially, but I think a synthetic device with an MMIO-based interface would be more powerful and flexible in the long run. 
Then userspace can evolve without needing to wait for KVM to catch up. Actually, potential bad/crazy idea. Why does the _host_ need to define policy? Linux already knows what assets it wants to (un)protect and when. What's missing is a way for the guest kernel to effectively deprivilege and re-authenticate itself as needed. We've been tossing around the idea of paired VMs+vCPUs to support VTLs and SEV's VMPLs, what if we usurped/piggybacked those ideas, with a bit of pKVM mixed in? Borrowing VTL terminology, where VTL0 is the least privileged, userspace launches the VM at VTL0. At some point, the guest triggers the deprivileging sequence and userspace creates VTL1. Userpace also provides a way for VTL0 restrict access to its memory, e.g. to effectively make the page tables for the kernel's direct map writable only from VTL1, to make kernel text RO (or XO), etc. And VTL0 could then also completely remove its access to code that changes CR0/CR4. It would obviously require a _lot_ more upfront work, e.g. to isolate the kernel text that modifies CR0/CR4 so that it can be removed
Re: [PULL 3/5] hw/loongarch: Add numa support
在 2024/5/3 下午8:50, Peter Maydell 写道: On Fri, 16 Jun 2023 at 11:03, Song Gao wrote: From: Tianrui Zhao 1. Implement some functions for LoongArch numa support; 2. Implement fdt_add_memory_node() for fdt; 3. build_srat() fills node_id and adds build numa memory. Reviewed-by: Song Gao Signed-off-by: Tianrui Zhao Signed-off-by: Song Gao Message-Id: <20230613122613.2471743-1-zhaotian...@loongson.cn> Hi; Coverity has pointed out a memory leak in this commit (CID 1544773): @@ -799,17 +823,43 @@ static void loongarch_init(MachineState *machine) machine->possible_cpus->cpus[i].cpu = OBJECT(cpu); } fdt_add_cpu_nodes(lams); -/* Add memory region */ -memory_region_init_alias(>lowmem, NULL, "loongarch.lowram", - machine->ram, 0, 256 * MiB); -memory_region_add_subregion(address_space_mem, offset, >lowmem); -offset += 256 * MiB; -memmap_add_entry(0, 256 * MiB, 1); -highram_size = ram_size - 256 * MiB; -memory_region_init_alias(>highmem, NULL, "loongarch.highmem", - machine->ram, offset, highram_size); -memory_region_add_subregion(address_space_mem, 0x9000, >highmem); -memmap_add_entry(0x9000, highram_size, 1); + +/* Node0 memory */ +memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); +fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); +memory_region_init_alias(>lowmem, NULL, "loongarch.node0.lowram", + machine->ram, offset, VIRT_LOWMEM_SIZE); +memory_region_add_subregion(address_space_mem, phyAddr, >lowmem); + +offset += VIRT_LOWMEM_SIZE; +if (nb_numa_nodes > 0) { +assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE); +highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE; +} else { +highram_size = ram_size - VIRT_LOWMEM_SIZE; +} +phyAddr = VIRT_HIGHMEM_BASE; +memmap_add_entry(phyAddr, highram_size, 1); +fdt_add_memory_node(machine, phyAddr, highram_size, 0); +memory_region_init_alias(>highmem, NULL, "loongarch.node0.highram", + machine->ram, offset, highram_size); +memory_region_add_subregion(address_space_mem, phyAddr, >highmem); + +/* Node1 - Nodemax memory 
*/ +offset += highram_size; +phyAddr += highram_size; + +for (i = 1; i < nb_numa_nodes; i++) { +MemoryRegion *nodemem = g_new(MemoryRegion, 1); +ramName = g_strdup_printf("loongarch.node%d.ram", i); +memory_region_init_alias(nodemem, NULL, ramName, machine->ram, + offset, numa_info[i].node_mem); +memory_region_add_subregion(address_space_mem, phyAddr, nodemem); +memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); +fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i); +offset += numa_info[i].node_mem; +phyAddr += numa_info[i].node_mem; In this loop, we allocate memory via g_strdup_printf(), but never free it. The nicest fix for this is to use the g_autofree mechanism so that the memory is automatically freed when execution reaches the end of the block: g_autofree ramName = g_strdup_printf("", ...); Thank you. I will fix it. Thanks Song Gao thanks -- PMM
Re: [PATCH v3 1/5] hw/loongarch: Rename LOONGARCH_MACHINE with VIRT_MACHINE
On 2024/5/6 下午2:09, maobibo wrote: On 2024/5/6 下午12:24, Thomas Huth wrote: On 06/05/2024 05.02, Bibo Mao wrote: On LoongArch system, there is only virt machine type now, name LOONGARCH_MACHINE is confused, rename it with VIRT_MACHINE. Machine name about Other real hw boards can be added in future. Signed-off-by: Bibo Mao --- ... @@ -1245,7 +1244,7 @@ static void loongarch_class_init(ObjectClass *oc, void *data) static const TypeInfo loongarch_machine_types[] = { { - .name = TYPE_LOONGARCH_MACHINE, + .name = TYPE_VIRT_MACHINE, .parent = TYPE_MACHINE, .instance_size = sizeof(LoongArchMachineState), .class_init = loongarch_class_init, diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 4e14bf6060..5ea2f0370d 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -73,8 +73,8 @@ struct LoongArchMachineState { struct loongarch_boot_info bootinfo; }; -#define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") -OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE) +#define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, VIRT_MACHINE) bool loongarch_is_acpi_enabled(LoongArchMachineState *lams); void loongarch_acpi_setup(LoongArchMachineState *lams); #endif Hi, there are currently some efforts going on to create the possibility to link a QEMU binary that contains all targets in one binary. Since we already have a TYPE_VIRT_MACHINE for other targets, I wonder whether it might be better to use LOONGARCH_VIRT_MACHINE than just VIRT_MACHINE here? Philippe, could you comment on this? It is great if there is one QEMU binary which supports different targets. And LOONGARCH_VIRT_MACHINE is ok for me. Hi Thomas, Philippe, Does machine name "virt" need be changed if LOONGARCH_VIRT_MACHINE is used? There will be compatible issues if "virt" machine type is not suggested to use. 
However CPU type "max" is not widely used now, can we get different architectures from CPU type rather than machine type for one QEMU binary which supports different targets? Regards Bibo Mao Regards Bibo Mao Thomas
RE: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
> -Original Message- > From: Jonathan Cameron > Sent: Tuesday, April 30, 2024 10:43 PM > To: Yao, Xingtao/姚 幸涛 > Cc: fan...@samsung.com; qemu-devel@nongnu.org > Subject: Re: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways > > On Wed, 24 Apr 2024 01:36:56 + > "Xingtao Yao (Fujitsu)" wrote: > > > ping. > > > > > -Original Message- > > > From: Yao Xingtao > > > Sent: Sunday, April 7, 2024 11:07 AM > > > To: jonathan.came...@huawei.com; fan...@samsung.com > > > Cc: qemu-devel@nongnu.org; Yao, Xingtao/姚 幸涛 > > > > Subject: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways > > > > > > Since the kernel does not check the interleave capability, a > > > 3-way, 6-way, 12-way or 16-way region can be create normally. > > > > > > Applications can access the memory of 16-way region normally because > > > qemu can convert hpa to dpa correctly for the power of 2 interleave > > > ways, after kernel implementing the check, this kind of region will > > > not be created any more. > > > > > > For non power of 2 interleave ways, applications could not access the > > > memory normally and may occur some unexpected behaviors, such as > > > segmentation fault. > > > > > > So implements this feature is needed. 
> > > > > > Link: > > > > https://lore.kernel.org/linux-cxl/3e84b919-7631-d1db-3e1d-33000f3f3868@fujits > > > u.com/ > > > Signed-off-by: Yao Xingtao > > > --- > > > hw/mem/cxl_type3.c | 18 ++ > > > 1 file changed, 14 insertions(+), 4 deletions(-) > > > > > > diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c > > > index b0a7e9f11b..d6ef784e96 100644 > > > --- a/hw/mem/cxl_type3.c > > > +++ b/hw/mem/cxl_type3.c > > > @@ -805,10 +805,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d, > hwaddr > > > host_addr, uint64_t *dpa) > > > continue; > > > } > > > > > > -*dpa = dpa_base + > > > -((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | > > > - ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & > > > hpa_offset) > > > - >> iw)); > > > +if (iw < 8) { > > > +*dpa = dpa_base + > > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | > > > + ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & > hpa_offset) > > > + >> iw)); > > > +} else { > > > +*dpa = dpa_base + > > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | > > > + MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset) > > > + >> (ig + iw)) / 3) << (ig + 8))); > > > +} > > > > > > return true; > > > } > > > @@ -906,6 +913,9 @@ static void ct3d_reset(DeviceState *dev) > > > uint32_t *write_msk = > ct3d->cxl_cstate.crb.cache_mem_regs_write_mask; > > > > > > cxl_component_register_init_common(reg_state, write_msk, > > > CXL2_TYPE3_DEVICE); > > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, > > > 3_6_12_WAY, 1); > > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, > > > 16_WAY, 1); > > > + > > Why here rather than in hdm_reg_init_common()? > It's constant data and is currently being set to 0 in there. according to the CXL specifications (8.2.4.20.1 CXL HDM Decoder Capability Register (Offset 00h)), this feature is only applicable to cxl.mem, upstream switch port and CXL host bridges shall hardwrite these bits to 0. so I think it would be more appropriate to set these bits here. 
> > > > cxl_device_register_init_t3(ct3d); > > > > > > /* > > > -- > > > 2.37.3 > >
RE: [PATCH V8 6/8] physmem: Add helper function to destroy CPU AddressSpace
> From: Peter Maydell > Sent: Monday, May 6, 2024 10:29 AM > To: Salil Mehta > > On Mon, 6 May 2024 at 10:06, Salil Mehta > wrote: > > > > Hi Peter, > > > > Thanks for the review. > > > > > From: Peter Maydell When do we need to > > > destroy a single address space in this way that means we need to > > > keep a count of how many ASes the CPU currently has? The commit > > > message talks about the case when we unrealize the whole CPU > > > object, but in that situation you can just throw away all the ASes > > > at once (eg by calling some > > > cpu_destroy_address_spaces() function from > cpu_common_unrealizefn()). > > > > > > Yes, maybe, we can destroy all at once from common leg as well. I'd > > prefer this to be done from the arch specific function for ARM to > > maintain the clarity & symmetry of initialization and > > un-initialization legs. For now, all of these address space destruction is > happening in context to the arm_cpu_unrealizefn(). > > > > It’s a kind of trade-off between little more code and clarity but I'm > > open to further suggestions. > > > > > > > > > > Also, if we're leaking stuff here by failing to destroy it, is that > > > a problem for existing CPU types like x86 that we can already hotplug? > > > > No we are not. We are taking care of these in the ARM arch specific > > legs within functions arm_cpu_(un)realizefn(). > > How can you be taking care of *x86* CPU types in the Arm unrealize? Sorry, yes, I missed to reply that clearly. There was indeed a leak with x86 reported by Phillipe/David last year. In fact, Phillipe floated a patch last year for this. 
I thought it was fixed already as part of cpu_common_unrealize() but I just checked and realized that the below proposed change still isn’t part of the mainline https://lore.kernel.org/qemu-devel/20230918160257.30127-9-phi...@linaro.org/ I can definitely add a common CPU AddressSpace destruction leg as part of this patch if in case arch specific CPU unrealize does not clean up its CPU AddressSpace? Thanks Salil. > > thanks > -- PMM
Re: [PATCH V1 06/26] migration: precreate vmstate for exec
Steve Sistare writes: > Provide migration_precreate_save for saving precreate vmstate across exec. > Create a memfd, save its value in the environment, and serialize state > to it. Reverse the process in migration_precreate_load. > > Signed-off-by: Steve Sistare > --- > include/migration/misc.h | 5 ++ > migration/meson.build| 1 + > migration/precreate.c| 139 > +++ > 3 files changed, 145 insertions(+) > create mode 100644 migration/precreate.c > > diff --git a/include/migration/misc.h b/include/migration/misc.h > index c9e200f..cf30351 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -56,6 +56,11 @@ AnnounceParameters *migrate_announce_params(void); > > void dump_vmstate_json_to_file(FILE *out_fp); > > +/* migration/precreate.c */ > +int migration_precreate_save(Error **errp); > +void migration_precreate_unsave(void); > +int migration_precreate_load(Error **errp); > + > /* migration/migration.c */ > void migration_object_init(void); > void migration_shutdown(void); > diff --git a/migration/meson.build b/migration/meson.build > index f76b1ba..50e7cb2 100644 > --- a/migration/meson.build > +++ b/migration/meson.build > @@ -26,6 +26,7 @@ system_ss.add(files( >'ram-compress.c', >'options.c', >'postcopy-ram.c', > + 'precreate.c', >'savevm.c', >'socket.c', >'tls.c', > diff --git a/migration/precreate.c b/migration/precreate.c > new file mode 100644 > index 000..0bf5e1f > --- /dev/null > +++ b/migration/precreate.c > @@ -0,0 +1,139 @@ > +/* > + * Copyright (c) 2022, 2024 Oracle and/or its affiliates. > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. 
> + */ > + > +#include "qemu/osdep.h" > +#include "qemu/cutils.h" > +#include "qemu/memfd.h" > +#include "qapi/error.h" > +#include "io/channel-file.h" > +#include "migration/misc.h" > +#include "migration/qemu-file.h" > +#include "migration/savevm.h" > + > +#define PRECREATE_STATE_NAME "QEMU_PRECREATE_STATE" > + > +static QEMUFile *qemu_file_new_fd_input(int fd, const char *name) > +{ > +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); > +QIOChannel *ioc = QIO_CHANNEL(fioc); > +qio_channel_set_name(ioc, name); > +return qemu_file_new_input(ioc); > +} > + > +static QEMUFile *qemu_file_new_fd_output(int fd, const char *name) > +{ > +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); > +QIOChannel *ioc = QIO_CHANNEL(fioc); > +qio_channel_set_name(ioc, name); > +return qemu_file_new_output(ioc); > +} > + > +static int memfd_create_named(const char *name, Error **errp) > +{ > +int mfd; > +char val[16]; > + > +mfd = memfd_create(name, 0); > +if (mfd < 0) { > +error_setg_errno(errp, errno, "memfd_create failed"); > +return -1; > +} > + > +/* Remember mfd in environment for post-exec load */ > +qemu_clear_cloexec(mfd); > +snprintf(val, sizeof(val), "%d", mfd); > +g_setenv(name, val, 1); > + > +return mfd; > +} > + > +static int memfd_find_named(const char *name, int *mfd_p, Error **errp) > +{ > +const char *val = g_getenv(name); > + > +if (!val) { > +*mfd_p = -1; > +return 0; /* No memfd was created, not an error */ > +} > +g_unsetenv(name); > +if (qemu_strtoi(val, NULL, 10, mfd_p)) { > +error_setg(errp, "Bad %s env value %s", PRECREATE_STATE_NAME, val); > +return -1; > +} > +lseek(*mfd_p, 0, SEEK_SET); > +return 0; > +} > + > +static void memfd_delete_named(const char *name) > +{ > +int mfd; > +const char *val = g_getenv(name); > + > +if (val) { > +g_unsetenv(name); > +if (!qemu_strtoi(val, NULL, 10, )) { > +close(mfd); > +} > +} > +} > + > +static QEMUFile *qemu_file_new_memfd_output(const char *name, Error **errp) > +{ > +int mfd = 
memfd_create_named(name, errp); > + > +if (mfd < 0) { > +return NULL; > +} > + > +return qemu_file_new_fd_output(mfd, name); > +} > + > +static QEMUFile *qemu_file_new_memfd_input(const char *name, Error **errp) > +{ > +int ret, mfd; > + > +ret = memfd_find_named(name, , errp); > +if (ret || mfd < 0) { > +return NULL; > +} > + > +return qemu_file_new_fd_input(mfd, name); > +} > + > +int migration_precreate_save(Error **errp) > +{ > +QEMUFile *f = qemu_file_new_memfd_output(PRECREATE_STATE_NAME, errp); > + > +if (!f) { > +return -1; > +} else if (qemu_savevm_precreate_save(f, errp)) { > +memfd_delete_named(PRECREATE_STATE_NAME); > +return -1; > +} else { > +/* Do not close f, as mfd must remain open. */ > +return 0; > +} > +} > + > +void migration_precreate_unsave(void) > +{ > +memfd_delete_named(PRECREATE_STATE_NAME); > +} > + > +int migration_precreate_load(Error **errp) > +{ > +
Re: [PATCH V1 01/26] oslib: qemu_clear_cloexec
Steve Sistare writes: +cc dgilbert, marcandre > Define qemu_clear_cloexec, analogous to qemu_set_cloexec. > > Signed-off-by: Steve Sistare > Reviewed-by: Dr. David Alan Gilbert > Reviewed-by: Marc-André Lureau A v1 patch with two reviews already, from people from another company and they're not in CC. Looks suspicious. =) Here's a fresh one, hopefully it won't spend another 4 years in the drawer: Reviewed-by: Fabiano Rosas > --- > include/qemu/osdep.h | 9 + > util/oslib-posix.c | 9 + > util/oslib-win32.c | 4 > 3 files changed, 22 insertions(+) > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index c7053cd..b58f312 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -660,6 +660,15 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t > count) > > void qemu_set_cloexec(int fd); > > +/* > + * Clear FD_CLOEXEC for a descriptor. > + * > + * The caller must guarantee that no other fork+exec's occur before the > + * exec that is intended to inherit this descriptor, eg by suspending CPUs > + * and blocking monitor commands. > + */ > +void qemu_clear_cloexec(int fd); > + > /* Return a dynamically allocated directory path that is appropriate for > storing > * local state. 
> * > diff --git a/util/oslib-posix.c b/util/oslib-posix.c > index e764416..614c3e5 100644 > --- a/util/oslib-posix.c > +++ b/util/oslib-posix.c > @@ -272,6 +272,15 @@ int qemu_socketpair(int domain, int type, int protocol, > int sv[2]) > return ret; > } > > +void qemu_clear_cloexec(int fd) > +{ > +int f; > +f = fcntl(fd, F_GETFD); > +assert(f != -1); > +f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC); > +assert(f != -1); > +} > + > char * > qemu_get_local_state_dir(void) > { > diff --git a/util/oslib-win32.c b/util/oslib-win32.c > index b623830..c3e969a 100644 > --- a/util/oslib-win32.c > +++ b/util/oslib-win32.c > @@ -222,6 +222,10 @@ void qemu_set_cloexec(int fd) > { > } > > +void qemu_clear_cloexec(int fd) > +{ > +} > + > int qemu_get_thread_id(void) > { > return GetCurrentThreadId();
Re: [PATCH V1 03/26] migration: SAVEVM_FOREACH
Steve Sistare writes: > Define an abstraction SAVEVM_FOREACH to loop over all savevm state > handlers, and replace QTAILQ_FOREACH. Define variants for ALL so > we can loop over all handlers vs a subset of handlers in a subsequent > patch, but at this time there is no distinction between the two. > No functional change. > > Signed-off-by: Steve Sistare > --- > migration/savevm.c | 55 > +++--- > 1 file changed, 32 insertions(+), 23 deletions(-) > > diff --git a/migration/savevm.c b/migration/savevm.c > index 4509482..6829ba3 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -237,6 +237,15 @@ static SaveState savevm_state = { > .global_section_id = 0, > }; > > +#define SAVEVM_FOREACH(se, entry)\ > +QTAILQ_FOREACH(se, _state.handlers, entry)\ > + > +#define SAVEVM_FOREACH_ALL(se, entry)\ > +QTAILQ_FOREACH(se, _state.handlers, entry) This feels worse than SAVEVM_FOREACH_NOT_PRECREATED. We'll have to keep coming back to the definition to figure out which FOREACH is the real deal. > + > +#define SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) \ > +QTAILQ_FOREACH_SAFE(se, _state.handlers, entry, new_se) > + > static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id); > > static bool should_validate_capability(int capability) > @@ -674,7 +683,7 @@ static uint32_t calculate_new_instance_id(const char > *idstr) > SaveStateEntry *se; > uint32_t instance_id = 0; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH_ALL(se, entry) { In this patch we can't have both instances... > if (strcmp(idstr, se->idstr) == 0 > && instance_id <= se->instance_id) { > instance_id = se->instance_id + 1; > @@ -690,7 +699,7 @@ static int calculate_compat_instance_id(const char *idstr) > SaveStateEntry *se; > int instance_id = 0; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { ...otherwise one of the two changes will go undocumented because the actual reason for it will only be described in the next patch. 
> if (!se->compat) { > continue; > } > @@ -816,7 +825,7 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, > void *opaque) > } > pstrcat(id, sizeof(id), idstr); > > -QTAILQ_FOREACH_SAFE(se, _state.handlers, entry, new_se) { > +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) { > if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { > savevm_state_handler_remove(se); > g_free(se->compat); > @@ -939,7 +948,7 @@ void vmstate_unregister(VMStateIf *obj, const > VMStateDescription *vmsd, > { > SaveStateEntry *se, *new_se; > > -QTAILQ_FOREACH_SAFE(se, _state.handlers, entry, new_se) { > +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) { > if (se->vmsd == vmsd && se->opaque == opaque) { > savevm_state_handler_remove(se); > g_free(se->compat); > @@ -1223,7 +1232,7 @@ bool qemu_savevm_state_blocked(Error **errp) > { > SaveStateEntry *se; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { > if (se->vmsd && se->vmsd->unmigratable) { > error_setg(errp, "State blocked by non-migratable device '%s'", > se->idstr); > @@ -1237,7 +1246,7 @@ void qemu_savevm_non_migratable_list(strList **reasons) > { > SaveStateEntry *se; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { > if (se->vmsd && se->vmsd->unmigratable) { > QAPI_LIST_PREPEND(*reasons, >g_strdup_printf("non-migratable device: %s", > @@ -1276,7 +1285,7 @@ bool qemu_savevm_state_guest_unplug_pending(void) > { > SaveStateEntry *se; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { > if (se->vmsd && se->vmsd->dev_unplug_pending && > se->vmsd->dev_unplug_pending(se->opaque)) { > return true; > @@ -1291,7 +1300,7 @@ int qemu_savevm_state_prepare(Error **errp) > SaveStateEntry *se; > int ret; > > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { > if (!se->ops || !se->ops->save_prepare) { > continue; > } > @@ -1321,7 +1330,7 @@ int qemu_savevm_state_setup(QEMUFile *f, Error **errp) > 
json_writer_start_array(ms->vmdesc, "devices"); > > trace_savevm_state_setup(); > -QTAILQ_FOREACH(se, _state.handlers, entry) { > +SAVEVM_FOREACH(se, entry) { > if (se->vmsd && se->vmsd->early_setup) { > ret = vmstate_save(f, se, ms->vmdesc, errp); > if (ret) { > @@ -1365,7 +1374,7 @@ int qemu_savevm_state_resume_prepare(MigrationState *s) > >
Re: [PATCH v2 04/33] accel/tcg: Reorg translator_ld*
On 25/4/24 01:31, Richard Henderson wrote: Reorg translator_access into translator_ld, with a more memcpy-ish interface. If both pages are in ram, do not go through the caller's slow path. Assert that the access is within the two pages that we are prepared to protect, per TranslationBlock. Allow access prior to pc_first, so long as it is within the first page. Signed-off-by: Richard Henderson --- accel/tcg/translator.c | 189 ++--- 1 file changed, 101 insertions(+), 88 deletions(-) uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc) { -uint64_t ret, plug; -void *p = translator_access(env, db, pc, sizeof(ret)); +uint64_t raw, tgt; -if (p) { -plugin_insn_append(pc, p, sizeof(ret)); -return ldq_p(p); +if (translator_ld(env, db, , pc, sizeof(raw))) { +tgt = tswap64(raw); +} else { +tgt = cpu_ldl_code(env, pc); cpu_ldq_code() ? +raw = tswap64(tgt); } -ret = cpu_ldq_code(env, pc); -plug = tswap64(ret); -plugin_insn_append(pc, , sizeof(ret)); -return ret; +plugin_insn_append(pc, , sizeof(raw)); +return tgt; }
Re: [PATCH v2 32/33] target/s390x: Use translator_lduw in get_next_pc
On 25/4/24 01:31, Richard Henderson wrote: Signed-off-by: Richard Henderson --- target/s390x/tcg/translate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v2 15/33] plugins: Merge alloc_tcg_plugin_context into plugin_gen_tb_start
On 25/4/24 01:31, Richard Henderson wrote: We don't need to allocate plugin context at startup, we can wait until we actually use it. Signed-off-by: Richard Henderson --- accel/tcg/plugin-gen.c | 36 tcg/tcg.c | 11 --- 2 files changed, 20 insertions(+), 27 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 02/57] target/arm: Split out gengvec64.c
On 6/5/24 03:03, Richard Henderson wrote: Split some routines out of translate-a64.c and translate-sve.c that are used by both. Signed-off-by: Richard Henderson --- target/arm/tcg/translate-a64.h | 4 + target/arm/tcg/gengvec64.c | 190 + target/arm/tcg/translate-a64.c | 26 - target/arm/tcg/translate-sve.c | 145 + target/arm/tcg/meson.build | 1 + 5 files changed, 197 insertions(+), 169 deletions(-) create mode 100644 target/arm/tcg/gengvec64.c Reviewed using git-diff --color-moved=dimmed-zebra Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 01/57] target/arm: Split out gengvec.c
On 6/5/24 03:03, Richard Henderson wrote: Signed-off-by: Richard Henderson --- target/arm/tcg/translate.h |5 + target/arm/tcg/gengvec.c | 1612 target/arm/tcg/translate.c | 1588 --- target/arm/tcg/meson.build |1 + 4 files changed, 1618 insertions(+), 1588 deletions(-) create mode 100644 target/arm/tcg/gengvec.c Reviewed using git-diff --color-moved=dimmed-zebra Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system
On 6/5/24 22:23, Richard Henderson wrote: We already build the linux-user binaries with ubuntu-22.04-s390x-all-linux, so there's no need to do it again. Signed-off-by: Richard Henderson --- .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH V1 04/26] migration: delete unused parameter mis
Steve Sistare writes: > Signed-off-by: Steve Sistare Reviewed-by: Fabiano Rosas
Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server
On Mon, May 6, 2024 at 4:44 PM Peter Xu wrote: > On Thu, Mar 28, 2024 at 08:53:36AM +0100, Mattias Nissler wrote: > > Stefan, to the best of my knowledge this is fully reviewed and ready > > to go in - can you kindly pick it up or advise in case there's > > something I missed? Thanks! > > Fails cross-compile on mipsel: > > https://gitlab.com/peterx/qemu/-/jobs/6787790601 Ah, bummer, thanks for reporting. 4GB of bounce buffer should be plenty, so switching to 32 bit atomics seems a good idea at first glance. I'll take a closer look tomorrow and send a respin with a fix.
Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server
On Mon, May 6, 2024 at 5:01 PM Stefan Hajnoczi wrote: > On Thu, 28 Mar 2024 at 03:54, Mattias Nissler > wrote: > > > > Stefan, to the best of my knowledge this is fully reviewed and ready > > to go in - can you kindly pick it up or advise in case there's > > something I missed? Thanks! > > This code is outside the areas that I maintain. I think it would make > sense for Jag to merge it and send a pull request as vfio-user > maintainer. OK, thanks for following up, I'll check with Jag.
[PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system
We already build the linux-user binaries with ubuntu-22.04-s390x-all-linux, so there's no need to do it again. Signed-off-by: Richard Henderson --- .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml index 3a2b1e1d24..c3f16d77bb 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml @@ -21,7 +21,7 @@ ubuntu-22.04-s390x-all-linux: - make --output-sync check-tcg - make --output-sync -j`nproc` check -ubuntu-22.04-s390x-all: +ubuntu-22.04-s390x-all-system: extends: .custom_runner_template needs: [] stage: build @@ -35,7 +35,7 @@ ubuntu-22.04-s390x-all: script: - mkdir build - cd build - - ../configure --disable-libssh + - ../configure --disable-libssh --disable-user || { cat config.log meson-logs/meson-log.txt; exit 1; } - make --output-sync -j`nproc` - make --output-sync -j`nproc` check -- 2.34.1
[PATCH] gitlab: Drop --static from s390x linux-user build
The host does not have the correct libraries installed for static pie, which causes host/guest address space interference for some tests. There's no real gain from linking statically, so drop it. Signed-off-by: Richard Henderson --- Per my suggestion in https://lore.kernel.org/qemu-devel/50c27a9f-fd75-4f8e-9a2d-488d8df4f...@linaro.org r~ --- .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml index 105981879f..3a2b1e1d24 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml @@ -2,7 +2,7 @@ # setup by the scripts/ci/setup/build-environment.yml task # "Install basic packages to build QEMU on Ubuntu 22.04" -ubuntu-22.04-s390x-all-linux-static: +ubuntu-22.04-s390x-all-linux: extends: .custom_runner_template needs: [] stage: build @@ -15,7 +15,7 @@ ubuntu-22.04-s390x-all-linux-static: script: - mkdir build - cd build - - ../configure --enable-debug --static --disable-system + - ../configure --enable-debug-tcg --disable-system --disable-tools --disable-docs || { cat config.log meson-logs/meson-log.txt; exit 1; } - make --output-sync -j`nproc` - make --output-sync check-tcg -- 2.34.1
Re: qemu-img cache modes with Linux cgroup v1
Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed O_DIRECT is now supported on tmpfs (as opposed to our CI which runs CentOS 9 Stream). `qemu-img convert -t none -O raw tests/images/cirros-qcow2.img /tmp/cirros.raw` where /tmp is indeed a tmpfs. I might be missing something so feel free to call that out. On Tue, Aug 1, 2023 at 6:38 PM Stefan Hajnoczi wrote: > Hi Daniel, > I agree with your points. > > Stefan >
Re: [PULL 00/12] qemu-sparc queue 20240506
On 5/6/24 04:44, Mark Cave-Ayland wrote: The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5: Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu into staging (2024-05-04 08:39:46 -0700) are available in the Git repository at: https://github.com/mcayland/qemu.git tags/qemu-sparc-20240506 for you to fetch changes up to d6f898cf85c92389182d22f0bcc3a11d7194fc94: target/sparc: Split out do_ms16b (2024-05-05 21:02:48 +0100) qemu-sparc queue - Default to modern virtio with iommu_platform enabled for sun4u - Fixes for various VIS instructions from Richard - CPU name updates from Thomas Applied, thanks. Please update https://wiki.qemu.org/ChangeLog/9.1 as appropriate. r~
Re: [PULL 0/7] QAPI patches patches for 2024-05-06
On 5/6/24 04:02, Markus Armbruster wrote: The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5: Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu into staging (2024-05-04 08:39:46 -0700) are available in the Git repository at: https://repo.or.cz/qemu/armbru.git tags/pull-qapi-2024-05-06 for you to fetch changes up to 285a8f209af7b4992aa91e8bea03a303fb6406ab: qapi: Simplify QAPISchemaVariants @tag_member (2024-05-06 12:38:27 +0200) QAPI patches patches for 2024-05-06 Applied, thanks. Please update https://wiki.qemu.org/ChangeLog/9.1 as appropriate. r~
Re: [PULL 00/28] Accelerator patches for 2024-05-06
On 5/6/24 05:37, Philippe Mathieu-Daudé wrote: The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5: Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu into staging (2024-05-04 08:39:46 -0700) are available in the Git repository at: https://github.com/philmd/qemu.git tags/accel-next-20240506 for you to fetch changes up to c984d1d8916df8abac71325a5a135cd851b2106a: MAINTAINERS: Update my email address (2024-05-06 14:33:49 +0200) Accelerator patches - Extract page-protection definitions to page-protection.h - Rework in accel/tcg in preparation of extracting TCG fields from CPUState - More uses of get_task_state() in user emulation - Xen refactors in preparation for adding multiple map caches (Juergen & Edgar) - MAINTAINERS updates (Aleksandar and Bin) Applied, thanks. Please update https://wiki.qemu.org/ChangeLog/9.1 as appropriate. r~
[PATCH] target/sh4: Update DisasContextBase.insn_start
Match the extra inserts of INDEX_op_insn_start, fixing the db->num_insns != 1 assert in translator_loop. Fixes: dcd092a0636 ("accel/tcg: Improve can_do_io management") Signed-off-by: Richard Henderson --- target/sh4/translate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/sh4/translate.c b/target/sh4/translate.c index e599ab9d1a..b3282f3ac7 100644 --- a/target/sh4/translate.c +++ b/target/sh4/translate.c @@ -2189,6 +2189,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env) */ for (i = 1; i < max_insns; ++i) { tcg_gen_insn_start(pc + i * 2, ctx->envflags); +ctx->base.insn_start = tcg_last_op(); } } #endif -- 2.34.1
[PATCH 2/2] aio: warn about iohandler_ctx special casing
The main loop has two AioContexts: qemu_aio_context and iohandler_ctx. The main loop runs them both, but nested aio_poll() calls on qemu_aio_context exclude iohandler_ctx. Which one should qemu_get_current_aio_context() return when called from the main loop? Document that it's always qemu_aio_context. This has subtle effects on functions that use qemu_get_current_aio_context(). For example, aio_co_reschedule_self() does not work when moving from iohandler_ctx to qemu_aio_context because qemu_get_current_aio_context() does not differentiate these two AioContexts. Document this in order to reduce the chance of future bugs. Signed-off-by: Stefan Hajnoczi --- include/block/aio.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/block/aio.h b/include/block/aio.h index 8378553eb9..4ee81936ed 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -629,6 +629,9 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co); * * Move the currently running coroutine to new_ctx. If the coroutine is already * running in new_ctx, do nothing. + * + * Note that this function cannot reschedule from iohandler_ctx to + * qemu_aio_context. */ void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); @@ -661,6 +664,9 @@ void aio_co_enter(AioContext *ctx, Coroutine *co); * If called from an IOThread this will be the IOThread's AioContext. If * called from the main thread or with the "big QEMU lock" taken it * will be the main loop AioContext. + * + * Note that the return value is never the main loop's iohandler_ctx and the + * return value is the main loop AioContext instead. */ AioContext *qemu_get_current_aio_context(void); -- 2.45.0
[PATCH 1/2] Revert "monitor: use aio_co_reschedule_self()"
Commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()") was a code cleanup that uses aio_co_reschedule_self() instead of open coding coroutine rescheduling. Bug RHEL-34618 was reported and Kevin Wolf identified the root cause. I missed that aio_co_reschedule_self() -> qemu_get_current_aio_context() only knows about qemu_aio_context/IOThread AioContexts and not about iohandler_ctx. It does not function correctly when going back from the iohandler_ctx to qemu_aio_context. Go back to open coding the AioContext transitions to avoid this bug. This reverts commit 1f25c172f83704e350c0829438d832384084a74d. Buglink: https://issues.redhat.com/browse/RHEL-34618 Signed-off-by: Stefan Hajnoczi --- qapi/qmp-dispatch.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index f3488afeef..176b549473 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -212,7 +212,8 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * executing the command handler so that it can make progress if it * involves an AIO_WAIT_WHILE(). */ -aio_co_reschedule_self(qemu_get_aio_context()); +aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); +qemu_coroutine_yield(); } monitor_set_cur(qemu_coroutine_self(), cur_mon); @@ -226,7 +227,9 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * Move back to iohandler_ctx so that nested event loops for * qemu_aio_context don't start new monitor commands. */ -aio_co_reschedule_self(iohandler_get_aio_context()); +aio_co_schedule(iohandler_get_aio_context(), +qemu_coroutine_self()); +qemu_coroutine_yield(); } } else { /* -- 2.45.0
[PATCH 0/2] Revert "monitor: use aio_co_reschedule_self()"
This series fixes RHEL-34618 "qemu crash on Assertion `luringcb->co->ctx == s->aio_context' failed when do block_resize on hotplug disk with aio=io_uring": https://issues.redhat.com/browse/RHEL-34618 Kevin identified commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()") as the root cause. There is a subtlety regarding how qemu_get_current_aio_context() returns qemu_aio_context even though we may be running in iohandler_ctx. Revert commit 1f25c172f837, it was just intended as a code cleanup. Stefan Hajnoczi (2): Revert "monitor: use aio_co_reschedule_self()" aio: warn about iohandler_ctx special casing include/block/aio.h | 6 ++ qapi/qmp-dispatch.c | 7 +-- 2 files changed, 11 insertions(+), 2 deletions(-) -- 2.45.0
Re: Intention to work on GSoC project
Hi, It's been a while since I last gave an update. Sorry about that. I am ready to get my hands dirty and start with the implementation. I have gone through the source of linux's drivers/virtio/virtio_ring.c [1], and QEMU's hw/virtio/virtio.c [2] and hw/virtio/vhost-shadow-virtqueue.c [3]. Before actually starting I would like to make sure I am on the right track. In vhost-shadow-virtqueue.c, there's a function "vhost_svq_add" which in turn calls "vhost_svq_add_split". Shall I start by implementing a mechanism to check if the feature bit "VIRTIO_F_RING_PACKED" is set (using "virtio_vdev_has_feature")? And if it's supported, "vhost_svq_add" should call "vhost_svq_add_packed". Following this, I can then start implementing "vhost_svq_add_packed" and progress from there. What are your thoughts on this? Thanks, Sahil [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/virtio/virtio.c [2] https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/virtio.c [3] https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/vhost-shadow-virtqueue.c
RE: [PATCH 2/4] target/hexagon: idef-parser remove undefined functions
> -Original Message- > From: Anton Johansson > Sent: Monday, May 6, 2024 1:31 PM > To: qemu-devel@nongnu.org > Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com > Subject: [PATCH 2/4] target/hexagon: idef-parser remove undefined > functions > > Signed-off-by: Anton Johansson > --- > target/hexagon/idef-parser/parser-helpers.h | 13 - > 1 file changed, 13 deletions(-) Reviewed-by: Taylor Simpson
RE: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list
> -Original Message- > From: Anton Johansson > Sent: Monday, May 6, 2024 1:31 PM > To: qemu-devel@nongnu.org > Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com > Subject: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list > > gen_inst_init_args() is called for instructions using a predicate as an rvalue. > Upon first call, the list of arguments which might need initialization init_list is > freed to indicate that they have been processed. For instructions without an > rvalue predicate, > gen_inst_init_args() isn't called and init_list will never be freed. > > Free init_list from free_instruction() if it hasn't already been freed. > > Signed-off-by: Anton Johansson > --- > target/hexagon/idef-parser/parser-helpers.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/target/hexagon/idef-parser/parser-helpers.c > b/target/hexagon/idef-parser/parser-helpers.c > index 95f2b43076..bae01c2bb8 100644 > --- a/target/hexagon/idef-parser/parser-helpers.c > +++ b/target/hexagon/idef-parser/parser-helpers.c > @@ -2121,6 +2121,13 @@ void free_instruction(Context *c) > g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE); > } > g_array_free(c->inst.strings, TRUE); > +/* > + * Free list of arguments that might need initialization, if they haven't > + * already been free'd. > + */ > +if (c->inst.init_list) { > +g_array_free(c->inst.init_list, TRUE); > +} > /* Free INAME token value */ > g_string_free(c->inst.name, TRUE); > /* Free variables and registers */ Why not do this in gen_inst_init_args just before this? /* Free argument init list once we have initialized everything */ g_array_free(c->inst.init_list, TRUE); c->inst.init_list = NULL; Taylor
RE: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init
> -Original Message- > From: Anton Johansson > Sent: Monday, May 6, 2024 1:31 PM > To: qemu-devel@nongnu.org > Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com > Subject: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init > > Only predicate instruction arguments need to be initialized by idef-parser. > This commit removes registers from the init_list and simplifies > gen_inst_init_args() slightly. > > Signed-off-by: Anton Johansson > --- > target/hexagon/idef-parser/idef-parser.y| 2 -- > target/hexagon/idef-parser/parser-helpers.c | 20 +--- > 2 files changed, 9 insertions(+), 13 deletions(-) > diff --git a/target/hexagon/idef-parser/parser-helpers.c > b/target/hexagon/idef-parser/parser-helpers.c > index bae01c2bb8..33e8f82007 100644 > --- a/target/hexagon/idef-parser/parser-helpers.c > +++ b/target/hexagon/idef-parser/parser-helpers.c > @@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname) > > void gen_inst_init_args(Context *c, YYLTYPE *locp) { > +/* If init_list is NULL arguments have already been initialized */ > if (!c->inst.init_list) { > return; > } > > for (unsigned i = 0; i < c->inst.init_list->len; i++) { > HexValue *val = _array_index(c->inst.init_list, HexValue, i); > -if (val->type == REGISTER_ARG) { > -/* Nothing to do here */ > -} else if (val->type == PREDICATE) { > -char suffix = val->is_dotnew ? 'N' : 'V'; > -EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width, > - val->pred.id, suffix); > -} else { > -yyassert(c, locp, false, "Invalid arg type!"); > -} > +yyassert(c, locp, val->type == PREDICATE, > + "Only predicates need to be initialized!"); > +char suffix = val->is_dotnew ? 'N' : 'V'; Declarations should be at the beginning of the function per QEMU coding standards. > +EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width, Since you know this is a predicate, the bit_width will always be 32. You can hard-code that instead of using %u. 
> + val->pred.id, suffix); > } > > /* Free argument init list once we have initialized everything */ Taylor
RE: [PATCH 1/4] target/hexagon: idef-parser remove unused defines
> -Original Message- > From: Anton Johansson > Sent: Monday, May 6, 2024 1:31 PM > To: qemu-devel@nongnu.org > Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com > Subject: [PATCH 1/4] target/hexagon: idef-parser remove unused defines > > Before switching to GArray/g_string_printf we used fixed size arrays for > output buffers and instructions arguments among other things. > > Macros defining the sizes of these buffers were left behind, remove them. > > Signed-off-by: Anton Johansson > --- > target/hexagon/idef-parser/idef-parser.h | 10 -- > 1 file changed, 10 deletions(-) Reviewed-by: Taylor Simpson
[PATCH 2/4] target/hexagon: idef-parser remove undefined functions
Signed-off-by: Anton Johansson --- target/hexagon/idef-parser/parser-helpers.h | 13 - 1 file changed, 13 deletions(-) diff --git a/target/hexagon/idef-parser/parser-helpers.h b/target/hexagon/idef-parser/parser-helpers.h index 7c58087169..2087d534a9 100644 --- a/target/hexagon/idef-parser/parser-helpers.h +++ b/target/hexagon/idef-parser/parser-helpers.h @@ -143,8 +143,6 @@ void commit(Context *c); #define OUT(c, locp, ...) FOR_EACH((c), (locp), OUT_IMPL, __VA_ARGS__) -const char *cmp_swap(Context *c, YYLTYPE *locp, const char *type); - /** * Temporary values creation */ @@ -236,8 +234,6 @@ HexValue gen_extract_op(Context *c, HexValue *index, HexExtract *extract); -HexValue gen_read_reg(Context *c, YYLTYPE *locp, HexValue *reg); - void gen_write_reg(Context *c, YYLTYPE *locp, HexValue *reg, HexValue *value); void gen_assign(Context *c, @@ -274,13 +270,6 @@ HexValue gen_ctpop_op(Context *c, YYLTYPE *locp, HexValue *src); HexValue gen_rotl(Context *c, YYLTYPE *locp, HexValue *src, HexValue *n); -HexValue gen_deinterleave(Context *c, YYLTYPE *locp, HexValue *mixed); - -HexValue gen_interleave(Context *c, -YYLTYPE *locp, -HexValue *odd, -HexValue *even); - HexValue gen_carry_from_add(Context *c, YYLTYPE *locp, HexValue *op1, @@ -349,8 +338,6 @@ HexValue gen_rvalue_ternary(Context *c, YYLTYPE *locp, HexValue *cond, const char *cond_to_str(TCGCond cond); -void emit_header(Context *c); - void emit_arg(Context *c, YYLTYPE *locp, HexValue *arg); void emit_footer(Context *c); -- 2.44.0
[PATCH 4/4] target/hexagon: idef-parser simplify predicate init
Only predicate instruction arguments need to be initialized by idef-parser. This commit removes registers from the init_list and simplifies gen_inst_init_args() slightly. Signed-off-by: Anton Johansson --- target/hexagon/idef-parser/idef-parser.y| 2 -- target/hexagon/idef-parser/parser-helpers.c | 20 +--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/target/hexagon/idef-parser/idef-parser.y b/target/hexagon/idef-parser/idef-parser.y index cd2612eb8c..9ffb9f9699 100644 --- a/target/hexagon/idef-parser/idef-parser.y +++ b/target/hexagon/idef-parser/idef-parser.y @@ -233,8 +233,6 @@ code : '{' statements '}' argument_decl : REG { emit_arg(c, &@1, &$1); -/* Enqueue register into initialization list */ -g_array_append_val(c->inst.init_list, $1); } | PRED { diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index bae01c2bb8..33e8f82007 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname) /* - * Initialize declared but uninitialized registers, but only for - * non-conditional instructions + * Initialize declared but uninitialized instruction arguments. Only needed for + * predicate arguments, initialization of registers is handled by the Hexagon + * frontend. */ void gen_inst_init_args(Context *c, YYLTYPE *locp) { +/* If init_list is NULL arguments have already been initialized */ if (!c->inst.init_list) { return; } for (unsigned i = 0; i < c->inst.init_list->len; i++) { HexValue *val = _array_index(c->inst.init_list, HexValue, i); -if (val->type == REGISTER_ARG) { -/* Nothing to do here */ -} else if (val->type == PREDICATE) { -char suffix = val->is_dotnew ? 
'N' : 'V'; -EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width, - val->pred.id, suffix); -} else { -yyassert(c, locp, false, "Invalid arg type!"); -} +yyassert(c, locp, val->type == PREDICATE, + "Only predicates need to be initialized!"); +char suffix = val->is_dotnew ? 'N' : 'V'; +EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width, + val->pred.id, suffix); } /* Free argument init list once we have initialized everything */ -- 2.44.0
[PATCH 3/4] target/hexagon: idef-parser fix leak of init_list
gen_inst_init_args() is called for instructions using a predicate as an rvalue. Upon first call, the list of arguments which might need initialization init_list is freed to indicate that they have been processed. For instructions without an rvalue predicate, gen_inst_init_args() isn't called and init_list will never be freed. Free init_list from free_instruction() if it hasn't already been freed. Signed-off-by: Anton Johansson --- target/hexagon/idef-parser/parser-helpers.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index 95f2b43076..bae01c2bb8 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -2121,6 +2121,13 @@ void free_instruction(Context *c) g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE); } g_array_free(c->inst.strings, TRUE); +/* + * Free list of arguments that might need initialization, if they haven't + * already been free'd. + */ +if (c->inst.init_list) { +g_array_free(c->inst.init_list, TRUE); +} /* Free INAME token value */ g_string_free(c->inst.name, TRUE); /* Free variables and registers */ -- 2.44.0
[PATCH 0/4] target/hexagon: Minor idef-parser cleanup
Was running idef-parser with valgrind and noticed we were leaking the init_list GArray, which is used to hold instruction arguments that may need initialization. This patchset fixes the leak, removes unused macros and undefined functions, and simplifies gen_inst_init_args() to only handle predicate values. Anton Johansson (4): target/hexagon: idef-parser remove unused defines target/hexagon: idef-parser remove undefined functions target/hexagon: idef-parser fix leak of init_list target/hexagon: idef-parser simplify predicate init target/hexagon/idef-parser/idef-parser.h| 10 target/hexagon/idef-parser/idef-parser.y| 2 -- target/hexagon/idef-parser/parser-helpers.c | 27 - target/hexagon/idef-parser/parser-helpers.h | 13 -- 4 files changed, 16 insertions(+), 36 deletions(-) -- 2.44.0
[PATCH 1/4] target/hexagon: idef-parser remove unused defines
Before switching to GArray/g_string_printf we used fixed size arrays for output buffers and instructions arguments among other things. Macros defining the sizes of these buffers were left behind, remove them. Signed-off-by: Anton Johansson --- target/hexagon/idef-parser/idef-parser.h | 10 -- 1 file changed, 10 deletions(-) diff --git a/target/hexagon/idef-parser/idef-parser.h b/target/hexagon/idef-parser/idef-parser.h index 3faa1deecd..8594cbe3a2 100644 --- a/target/hexagon/idef-parser/idef-parser.h +++ b/target/hexagon/idef-parser/idef-parser.h @@ -23,16 +23,6 @@ #include #include -#define TCGV_NAME_SIZE 7 -#define MAX_WRITTEN_REGS 32 -#define OFFSET_STR_LEN 32 -#define ALLOC_LIST_LEN 32 -#define ALLOC_NAME_SIZE 32 -#define INIT_LIST_LEN 32 -#define OUT_BUF_LEN (1024 * 1024) -#define SIGNATURE_BUF_LEN (128 * 1024) -#define HEADER_BUF_LEN (128 * 1024) - /* Variadic macros to wrap the buffer printing functions */ #define EMIT(c, ...) \ do { \ -- 2.44.0
Re: qemu-img cache modes with Linux cgroup v1
On Mon, May 06, 2024 at 08:10:25PM +0300, Alex Kalenyuk wrote: > Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed > O_DIRECT is now supported on tmpfs (as opposed to our CI which runs Centos > 9 Stream). > `qemu-img convert -t none -O raw tests/images/cirros-qcow2.img > /tmp/cirros.raw` > where /tmp is indeed a tmpfs. > > I might be missing something so feel free to call that out Yes, it was added by: commit e88e0d366f9cfbb810b0c8509dc5d130d5a53e02 Author: Hugh Dickins Date: Thu Aug 10 23:27:07 2023 -0700 tmpfs: trivial support for direct IO It's fairly new but great to have. Stefan signature.asc Description: PGP signature
Re: [PATCH v2 5/5] monitor: use aio_co_reschedule_self()
On Fri, May 03, 2024 at 07:33:17PM +0200, Kevin Wolf wrote: > Am 06.02.2024 um 20:06 hat Stefan Hajnoczi geschrieben: > > The aio_co_reschedule_self() API is designed to avoid the race > > condition between scheduling the coroutine in another AioContext and > > yielding. > > > > The QMP dispatch code uses the open-coded version that appears > > susceptible to the race condition at first glance: > > > > aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); > > qemu_coroutine_yield(); > > > > The code is actually safe because the iohandler and qemu_aio_context > > AioContext run under the Big QEMU Lock. Nevertheless, set a good example > > and use aio_co_reschedule_self() so it's obvious that there is no race. > > > > Suggested-by: Hanna Reitz > > Reviewed-by: Manos Pitsidianakis > > Reviewed-by: Hanna Czenczek > > Signed-off-by: Stefan Hajnoczi > > --- > > qapi/qmp-dispatch.c | 7 ++- > > 1 file changed, 2 insertions(+), 5 deletions(-) > > > > diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c > > index 176b549473..f3488afeef 100644 > > --- a/qapi/qmp-dispatch.c > > +++ b/qapi/qmp-dispatch.c > > @@ -212,8 +212,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const > > QmpCommandList *cmds, QObject *requ > > * executing the command handler so that it can make progress > > if it > > * involves an AIO_WAIT_WHILE(). > > */ > > -aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); > > -qemu_coroutine_yield(); > > +aio_co_reschedule_self(qemu_get_aio_context()); > > Turns out that this one actually causes a regression. [1] This code is > run in iohandler_ctx, aio_co_reschedule_self() looks at the new context > and compares it with qemu_get_current_aio_context() - and because both > are qemu_aio_context, it decides that it has nothing to do. So the > command handler coroutine actually still runs in iohandler_ctx now, > which is not what we want. > > We could just revert this patch because it was only meant as a cleanup > without a semantic difference. 
> > Or aio_co_reschedule_self() could look at qemu_coroutine_self()->ctx > instead of using qemu_get_current_aio_context(). That would be a little > more indirect, though, and I'm not sure if co->ctx is always up to date. > > Any opinions on what is the best way to fix this? If the commit is reverted then similar bugs may be introduced again in the future. The qemu_get_current_aio_context() API is unaware of iohandler_ctx and this can lead to unexpected results. I will send patches to revert the commit and add doc comments explaining iohandler_ctx's special behavior. This will reduce, but not eliminate, the risk of future bugs. Modifying aio_co_reschedule_self() might be better long-term fix, but I'm afraid it will create more bugs because it will expose the subtle distinction between the current coroutine AioContext and non-coroutine AioContext in new places. I think the root cause is that iohandler_ctx isn't a full-fledged AioContext with its own event loop. iohandler_ctx is a special superset of qemu_aio_context that the main loop monitors. Stefan > > Kevin > > [1] https://issues.redhat.com/browse/RHEL-34618 > signature.asc Description: PGP signature
Re: [PATCH 4/4] tests/qtest: Check STM32L4x5 clock connections
- Le 6 Mai 24, à 6:16, Thomas Huth th...@redhat.com a écrit : > On 05/05/2024 16.05, Inès Varhol wrote: >> For USART, GPIO and SYSCFG devices, check that clock frequency before >> and after enabling the peripheral clock in RCC is correct. >> >> Signed-off-by: Inès Varhol >> --- >> Hello, >> >> Should these tests be regrouped in stm32l4x5_rcc-test.c ? > > Hi, > > sounds mostly like a matter of taste at a first glance. Or what would be the > benefit of putting everything into the *rcc-test.c file? Could you maybe > consolidate the get_clock_freq_hz() function that way? (maybe that > get_clock_freq_hz() function could also be consolidated as a inline function > in a shared header instead?) > > Thomas Hello, I was indeed looking to consolidate the functions get_clock_freq_hz() and check_clock() from *usart-test.c (along with the definitions for RCC registers). Thank you for your suggestion, I'll use a header file. Inès Varhol
Re: [PATCH RFC 00/26] Multifd device state transfer support with VFIO consumer
On Mon, May 06, 2024 at 06:26:46PM +0200, Maciej S. Szmigiero wrote: > On 29.04.2024 17:09, Peter Xu wrote: > > On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote: > > > On 24.04.2024 00:35, Peter Xu wrote: > > > > On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote: > > > > > On 24.04.2024 00:20, Peter Xu wrote: > > > > > > On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote: > > > > > > > On 19.04.2024 17:31, Peter Xu wrote: > > > > > > > > On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé > > > > > > > > wrote: > > > > > > > > > On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote: > > > > > > > > > > On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. > > > > > > > > > > Szmigiero wrote: > > > > > > > > > > > I think one of the reasons for these results is that > > > > > > > > > > > mixed (RAM + device > > > > > > > > > > > state) multifd channels participate in the RAM sync > > > > > > > > > > > process > > > > > > > > > > > (MULTIFD_FLAG_SYNC) whereas device state dedicated > > > > > > > > > > > channels don't. > > > > > > > > > > > > > > > > > > > > Firstly, I'm wondering whether we can have better names for > > > > > > > > > > these new > > > > > > > > > > hooks. Currently (only comment on the async* stuff): > > > > > > > > > > > > > > > > > > > > - complete_precopy_async > > > > > > > > > > - complete_precopy > > > > > > > > > > - complete_precopy_async_wait > > > > > > > > > > > > > > > > > > > > But perhaps better: > > > > > > > > > > > > > > > > > > > > - complete_precopy_begin > > > > > > > > > > - complete_precopy > > > > > > > > > > - complete_precopy_end > > > > > > > > > > > > > > > > > > > > ? > > > > > > > > > > > > > > > > > > > > As I don't see why the device must do something with async > > > > > > > > > > in such hook. > > > > > > > > > > To me it's more like you're splitting one process into > > > > > > > > > > multiple, then > > > > > > > > > > begin/end sounds more generic. 
> > > > > > > > > > > > > > > > > > > > Then, if with that in mind, IIUC we can already split > > > > > > > > > > ram_save_complete() > > > > > > > > > > into >1 phases too. For example, I would be curious whether > > > > > > > > > > the performance > > > > > > > > > > will go back to normal if we offloading > > > > > > > > > > multifd_send_sync_main() into the > > > > > > > > > > complete_precopy_end(), because we really only need one > > > > > > > > > > shot of that, and I > > > > > > > > > > am quite surprised it already greatly affects VFIO dumping > > > > > > > > > > its own things. > > > > > > > > > > > > > > > > > > > > I would even ask one step further as what Dan was asking: > > > > > > > > > > have you thought > > > > > > > > > > about dumping VFIO states via multifd even during > > > > > > > > > > iterations? Would that > > > > > > > > > > help even more than this series (which IIUC only helps > > > > > > > > > > during the blackout > > > > > > > > > > phase)? > > > > > > > > > > > > > > > > > > To dump during RAM iteration, the VFIO device will need to > > > > > > > > > have > > > > > > > > > dirty tracking and iterate on its state, because the guest > > > > > > > > > CPUs > > > > > > > > > will still be running potentially changing VFIO state. That > > > > > > > > > seems > > > > > > > > > impractical in the general case. > > > > > > > > > > > > > > > > We already do such interations in vfio_save_iterate()? > > > > > > > > > > > > > > > > My understanding is the recent VFIO work is based on the fact > > > > > > > > that the VFIO > > > > > > > > device can track device state changes more or less (besides > > > > > > > > being able to > > > > > > > > save/load full states). E.g. 
I still remember in our QE tests > > > > > > > > some old > > > > > > > > devices report much more dirty pages than expected during the > > > > > > > > iterations > > > > > > > > when we were looking into such issue that a huge amount of > > > > > > > > dirty pages > > > > > > > > reported. But newer models seem to have fixed that and report > > > > > > > > much less. > > > > > > > > > > > > > > > > That issue was about GPU not NICs, though, and IIUC a major > > > > > > > > portion of such > > > > > > > > tracking used to be for GPU vRAMs. So maybe I was mixing up > > > > > > > > these, and > > > > > > > > maybe they work differently. > > > > > > > > > > > > > > The device which this series was developed against (Mellanox > > > > > > > ConnectX-7) > > > > > > > is already transferring its live state before the VM gets stopped > > > > > > > (via > > > > > > > save_live_iterate SaveVMHandler). > > > > > > > > > > > > > > It's just that in addition to the live state it has more than 400 > > > > > > > MiB > > > > > > > of state that cannot be transferred while the VM is still running. > > > > > > > And that fact hurts a lot with respect to the migration downtime. > > > > > > > > >
Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation
On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote: > On Fri, May 03, 2024, Mickaël Salaün wrote: > > Add an interface for user space to be notified about guests' Heki policy > > and related violations. > > > > Extend the KVM_ENABLE_CAP IOCTL with KVM_CAP_HEKI_CONFIGURE and > > KVM_CAP_HEKI_DENIAL. Each one takes a bitmask as first argument that can > > contains KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4. The > > returned value is the bitmask of known Heki exit reasons, for now: > > KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4. > > > > If KVM_CAP_HEKI_CONFIGURE is set, a VM exit will be triggered for each > > KVM_HC_LOCK_CR_UPDATE hypercalls according to the requested control > > register. This enables to enlighten the VMM with the guest > > auto-restrictions. > > > > If KVM_CAP_HEKI_DENIAL is set, a VM exit will be triggered for each > > pinned CR violation. This enables the VMM to react to a policy > > violation. > > > > Cc: Borislav Petkov > > Cc: Dave Hansen > > Cc: H. Peter Anvin > > Cc: Ingo Molnar > > Cc: Kees Cook > > Cc: Madhavan T. Venkataraman > > Cc: Paolo Bonzini > > Cc: Sean Christopherson > > Cc: Thomas Gleixner > > Cc: Vitaly Kuznetsov > > Cc: Wanpeng Li > > Signed-off-by: Mickaël Salaün > > Link: https://lore.kernel.org/r/20240503131910.307630-4-...@digikod.net > > --- > > > > Changes since v1: > > * New patch. Making user space aware of Heki properties was requested by > > Sean Christopherson. > > No, I suggested having userspace _control_ the pinning[*], not merely be > notified > of pinning. > > : IMO, manipulation of protections, both for memory (this patch) and CPU > state > : (control registers in the next patch) should come from userspace. I have > no > : objection to KVM providing plumbing if necessary, but I think userspace > needs to > : to have full control over the actual state. 
> : > : One of the things that caused Intel's control register pinning series to > stall > : out was how to handle edge cases like kexec() and reboot. Deferring to > userspace > : means the kernel doesn't need to define policy, e.g. when to unprotect > memory, > : and avoids questions like "should userspace be able to overwrite pinned > control > : registers". > : > : And like the confidential VM use case, keeping userspace in the loop is a > big > : beneifit, e.g. the guest can't circumvent protections by coercing > userspace into > : writing to protected memory. > > I stand by that suggestion, because I don't see a sane way to handle things > like > kexec() and reboot without having a _much_ more sophisticated policy than > would > ever be acceptable in KVM. > > I think that can be done without KVM having any awareness of CR pinning > whatsoever. > E.g. userspace just needs to ability to intercept CR writes and inject #GPs. > Off > the cuff, I suspect the uAPI could look very similar to MSR filtering. E.g. > I bet > userspace could enforce MSR pinning without any new KVM uAPI at all. > > [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com OK, I had concern about the control not directly coming from the guest, especially in the case of pKVM and confidential computing, but I get you point. It should indeed be quite similar to the MSR filtering on the userspace side, except that we need another interface for the guest to request such change (i.e. self-protection). Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but forward the request to userspace with a VM exit instead? That would also enable userspace to get the request and directly configure the CR pinning with the same VM exit.
Re: [PULL 00/46] Mostly build system and other cleanups patches for 2024-05-06
On 5/6/24 00:50, Paolo Bonzini wrote: The following changes since commit 4977ce198d2390bff8c71ad5cb1a5f6aa24b56fb: Merge tag 'pull-tcg-20240501' of https://gitlab.com/rth7680/qemu into staging (2024-05-01 15:15:33 -0700) are available in the Git repository at: https://gitlab.com/bonzini/qemu.git tags/for-upstream for you to fetch changes up to deb686ef0e609ceaec0daa5dc88eb5b3dd9701b0: qga/commands-posix: fix typo in qmp_guest_set_user_password (2024-05-03 19:36:51 +0200) * target/i386: Introduce SapphireRapids-v3 to add missing features * switch boards to "default y" * allow building emulators without any board * configs: list "implied" device groups in the default configs * remove unnecessary declarations from typedefs.h * target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK Applied, thanks. Please update https://wiki.qemu.org/ChangeLog/9.1 as appropriate. r~
Re: [PULL 00/15] Hexagon: simplify gen for packets w/o read-after-write
On 5/5/24 19:42, Brian Cain wrote: The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5: Merge tag 'pull-axp-20240504' of https://gitlab.com/rth7680/qemu into staging (2024-05-04 08:39:46 -0700) are available in the Git repository at: https://github.com/quic/qemu tags/pull-hex-20240505 for you to fetch changes up to a4696661491cac8c1c08e7d482d751f808ce3143: Hexagon (target/hexagon) Remove hex_common.read_attribs_file (2024-05-05 16:22:07 -0700) Short-circuit for packets w/o read-after-write Cleanup unused code in gen_*.py scripts Applied, thanks. Please update https://wiki.qemu.org/ChangeLog/9.1 as appropriate. r~
Re: [PATCH v2 17/25] target/i386: move C0-FF opcodes to new decoder (except for x87)
On 5/6/24 01:09, Paolo Bonzini wrote: The shift instructions are rewritten instead of reusing code from the old decoder. Rotates use CC_OP_ADCOX more extensively and generally rely more on the optimizer, so that the code generators are shared between the immediate-count and variable-count cases. In particular, this makes gen_RCL and gen_RCR pretty efficient for the count == 1 case, which becomes (apart from a few extra movs) something like: (compute_cc_all if needed) // save old value for OF calculation mov cc_src2, T0 // the bulk of RCL is just this! deposit T0, cc_src, T0, 1, TARGET_LONG_BITS - 1 // compute carry shr cc_dst, cc_src2, length - 1 and cc_dst, cc_dst, 1 // compute overflow xor cc_src2, cc_src2, T0 extract cc_src2, cc_src2, length - 1, 1 32-bit MUL and IMUL are also slightly more efficient on 64-bit hosts. Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.h |1 + target/i386/tcg/translate.c | 23 +- target/i386/tcg/decode-new.c.inc | 142 + target/i386/tcg/emit.c.inc | 1014 +- 4 files changed, 1169 insertions(+), 11 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 15/25] target/i386: move 60-BF opcodes to new decoder
On 5/6/24 01:09, Paolo Bonzini wrote: Compared to the old decoder, the main differences in translation are for the little-used ARPL instruction. IMUL is adjusted a bit to share more code to produce flags, but is otherwise very similar. Signed-off-by: Paolo Bonzini Reviewed-by: Richard Henderson +static void gen_POPA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_popa(s); +} ... +static void gen_PUSHA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_pusha(s); +} 3-space indent? r~
Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ
On 5/6/24 09:31, Paolo Bonzini wrote: The comment deals with the former, the removal with the latter. The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*, so in principle you may expect that you need to set CC_OP_DYNAMIC explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and CX == 0 paths join. But this is not necessary, because there is nothing after that instruction - the translation block ends. So I guess the comment could instead be placed at the end of the function? /* * Only one iteration is done at a time, so the translation * block has ended unconditionally at this point and there * is no control flow junction - no need to set CC_OP_DYNAMIC. */ What do you think? Just before gen_jmp_rel_csize? Yes, that seems good. r~
Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ
On Mon, May 6, 2024 at 6:08 PM Richard Henderson wrote: > > -gen_update_cc_op(s); > > l2 = gen_jz_ecx_string(s); > > +/* > > + * Only one iteration is done at a time, so there is > > + * no control flow junction here and cc_op is never dynamic. > > + */ > > fn(s, ot); > > gen_op_add_reg_im(s, s->aflag, R_ECX, -1); > > -gen_update_cc_op(s); > > gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2); > > if (s->repz_opt) { > > gen_op_jz_ecx(s, l2); > > Ok, but only because gen_jcc1 does the gen_update_cc_op. The comment is > neither correct > nor necessary. Yeah, it's true that gen_jcc1 does the update. On the other hand, there are two different kinds of cc_op updates: 1) at branches, if you know that only one of the sides might write cc_op - so you ensure it's up-to-date before the branch - and set CC_OP_DYNAMIC at the junction. Same if you have movcond instead of a branch. 2) at end of translation block, to spill the value lazily (because in the middle of the TB we are able to restore it from insn data). With these patches there is never a need to do this, because gen_jmp_rel() and gen_jmp_rel_csize() take care of it. The comment deals with the former, the removal with the latter. The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*, so in principle you may expect that you need to set CC_OP_DYNAMIC explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and CX == 0 paths join. But this is not necessary, because there is nothing after that instruction - the translation block ends. So I guess the comment could instead be placed at the end of the function? /* * Only one iteration is done at a time, so the translation * block has ended unconditionally at this point and there * is no control flow junction - no need to set CC_OP_DYNAMIC. */ What do you think? Paolo
Re: [PATCH RFC 00/26] Multifd device state transfer support with VFIO consumer
On 29.04.2024 17:09, Peter Xu wrote: On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote: On 24.04.2024 00:35, Peter Xu wrote: On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote: On 24.04.2024 00:20, Peter Xu wrote: On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote: On 19.04.2024 17:31, Peter Xu wrote: On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé wrote: On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote: On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. Szmigiero wrote: I think one of the reasons for these results is that mixed (RAM + device state) multifd channels participate in the RAM sync process (MULTIFD_FLAG_SYNC) whereas device state dedicated channels don't. Firstly, I'm wondering whether we can have better names for these new hooks. Currently (only comment on the async* stuff): - complete_precopy_async - complete_precopy - complete_precopy_async_wait But perhaps better: - complete_precopy_begin - complete_precopy - complete_precopy_end ? As I don't see why the device must do something with async in such hook. To me it's more like you're splitting one process into multiple, then begin/end sounds more generic. Then, if with that in mind, IIUC we can already split ram_save_complete() into >1 phases too. For example, I would be curious whether the performance will go back to normal if we offloading multifd_send_sync_main() into the complete_precopy_end(), because we really only need one shot of that, and I am quite surprised it already greatly affects VFIO dumping its own things. I would even ask one step further as what Dan was asking: have you thought about dumping VFIO states via multifd even during iterations? Would that help even more than this series (which IIUC only helps during the blackout phase)? To dump during RAM iteration, the VFIO device will need to have dirty tracking and iterate on its state, because the guest CPUs will still be running potentially changing VFIO state. 
That seems impractical in the general case. We already do such interations in vfio_save_iterate()? My understanding is the recent VFIO work is based on the fact that the VFIO device can track device state changes more or less (besides being able to save/load full states). E.g. I still remember in our QE tests some old devices report much more dirty pages than expected during the iterations when we were looking into such issue that a huge amount of dirty pages reported. But newer models seem to have fixed that and report much less. That issue was about GPU not NICs, though, and IIUC a major portion of such tracking used to be for GPU vRAMs. So maybe I was mixing up these, and maybe they work differently. The device which this series was developed against (Mellanox ConnectX-7) is already transferring its live state before the VM gets stopped (via save_live_iterate SaveVMHandler). It's just that in addition to the live state it has more than 400 MiB of state that cannot be transferred while the VM is still running. And that fact hurts a lot with respect to the migration downtime. AFAIK it's a very similar story for (some) GPUs. So during iteration phase VFIO cannot yet leverage the multifd channels when with this series, am I right? That's right. Is it possible to extend that use case too? I guess so, but since this phase (iteration while the VM is still running) doesn't impact downtime it is much less critical. But it affects the bandwidth, e.g. even with multifd enabled, the device iteration data will still bottleneck at ~15Gbps on a common system setup the best case, even if the hosts are 100Gbps direct connected. Would that be a concern in the future too, or it's known problem and it won't be fixed anyway? I think any improvements to the migration performance are good, even if they don't impact downtime. It's just that this patch set focuses on the downtime phase as the more critical thing. 
After this gets improved there's no reason why not to look at improving performance of the VM live phase too if it brings sensible improvements. I remember Avihai used to have plan to look into similar issues, I hope this is exactly what he is looking for. Otherwise changing migration protocol from time to time is cumbersome; we always need to provide a flag to make sure old systems migrates in the old ways, new systems run the new ways, and for such a relatively major change I'd want to double check on how far away we can support offload VFIO iterations data to multifd. The device state transfer is indicated by a new flag in the multifd header (MULTIFD_FLAG_DEVICE_STATE). If we are to use multifd channels for VM live phase transfers these could simply re-use the same flag type. Right, and that's also my major purpose of such request to consider both issues. If supporting iterators can be easy on top of this, I am thinking whether we should do this in one shot.
Re: [PATCH RFC 23/26] migration/multifd: Device state transfer support - send side
On 29.04.2024 22:04, Peter Xu wrote: On Tue, Apr 16, 2024 at 04:43:02PM +0200, Maciej S. Szmigiero wrote: +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) +{ +g_autoptr(GMutexLocker) locker = NULL; + +/* + * Device state submissions for shared channels can come + * from multiple threads and conflict with page submissions + * with respect to multifd_send_state access. + */ +if (!multifd_send_state->device_state_dedicated_channels) { +locker = g_mutex_locker_new(&multifd_send_state->queue_job_mutex); Haven't read the rest, but suggest to stick with QemuMutex for the whole patchset, as that's what we use in the rest migration code, along with QEMU_LOCK_GUARD(). Ack, from a quick scan of QEMU thread sync primitives it seems that QemuMutex with QemuLockable and QemuCond should fulfill the requirements to replace GMutex, GMutexLocker and GCond. Thanks, Maciej
Re: [PATCH v2 11/25] target/i386: reintroduce debugging mechanism
On 5/6/24 01:09, Paolo Bonzini wrote: Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 27 +++ target/i386/tcg/decode-new.c.inc | 3 +++ 2 files changed, 30 insertions(+) Acked-by: Richard Henderson r~
Re: [PATCH v2 10/25] target/i386: cleanup *gen_eob*
On 5/6/24 01:09, Paolo Bonzini wrote: Create a new wrapper for syscall/sysret, and do not go through multiple layers of wrappers. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 25 - 1 file changed, 12 insertions(+), 13 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 09/25] target/i386: clarify the "reg" argument of functions returning CCPrepare
On 5/6/24 01:09, Paolo Bonzini wrote: Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 08/25] target/i386: do not use s->T0 and s->T1 as scratch registers for CCPrepare
On 5/6/24 01:09, Paolo Bonzini wrote: Instead of using s->T0 or s->T1, create a scratch register when computing the C, NC, L or LE conditions. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 07/25] target/i386: extend cc_* when using them to compute flags
On 5/6/24 01:09, Paolo Bonzini wrote: Instead of using s->tmp0 or s->tmp4 as the result, just extend the cc_* registers in place. It is harmless and, if multiple setcc instructions are used, the optimizer will be able to remove the redundant ones. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 44 +++-- 1 file changed, 18 insertions(+), 26 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 06/25] target/i386: pull cc_op update to callers of gen_jmp_rel{,_csize}
On 5/6/24 01:09, Paolo Bonzini wrote: gen_update_cc_op must be called before control flow splits. Doing it in gen_jmp_rel{,_csize} may hide bugs, instead assert that cc_op is clean---even if that means a few more calls to gen_update_cc_op(). With this new invariant, setting cc_op to CC_OP_DYNAMIC is unnecessary since the caller should have done it. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ
On 5/6/24 01:09, Paolo Bonzini wrote: gen_update_cc_op must be called before control flow splits. Do it where the jump on ECX!=0 is translated. On the other hand, remove the call before gen_jcc1, which takes care of it already, and explain why REPZ/REPNZ need not use CC_OP_DYNAMIC---the translation block ends before any control-flow-dependent cc_op could be observed. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 3f1d2858fc9..6b766f5dd3f 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -1242,11 +1242,15 @@ static inline void gen_jcc1(DisasContext *s, int b, TCGLabel *l1) } /* XXX: does not work with gdbstub "ice" single step - not a - serious problem */ + serious problem. The caller can jump to the returned label + to stop the REP but, if the flags have changed, it has to call + gen_update_cc_op before doing so. */ static TCGLabel *gen_jz_ecx_string(DisasContext *s) { TCGLabel *l1 = gen_new_label(); TCGLabel *l2 = gen_new_label(); + +gen_update_cc_op(s); gen_op_jnz_ecx(s, l1); gen_set_label(l2); gen_jmp_rel_csize(s, 0, 1); @@ -1342,7 +1346,6 @@ static void gen_repz(DisasContext *s, MemOp ot, void (*fn)(DisasContext *s, MemOp ot)) { TCGLabel *l2; -gen_update_cc_op(s); l2 = gen_jz_ecx_string(s); fn(s, ot); gen_op_add_reg_im(s, s->aflag, R_ECX, -1); Ok. @@ -1364,11 +1367,13 @@ static void gen_repz2(DisasContext *s, MemOp ot, int nz, void (*fn)(DisasContext *s, MemOp ot)) { TCGLabel *l2; -gen_update_cc_op(s); l2 = gen_jz_ecx_string(s); +/* + * Only one iteration is done at a time, so there is + * no control flow junction here and cc_op is never dynamic. + */ fn(s, ot); gen_op_add_reg_im(s, s->aflag, R_ECX, -1); -gen_update_cc_op(s); gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2); if (s->repz_opt) { gen_op_jz_ecx(s, l2); Ok, but only because gen_jcc1 does the gen_update_cc_op. 
The comment is neither correct nor necessary. The reason to write cc_op before branches instead of junctions is to avoid having *two* writes of cc_op on either side of the branch. r~
Re: [PATCH 0/3] Make it possible to compile the x86 binaries without FDC
On Thu, Apr 25, 2024 at 8:43 PM Thomas Huth wrote: > OTOH, it seems > to work fine, and the FDC is only disabled when it is not available > in the binary, so I hope this patch is fine, too. We do the same for parallel so i think it should be fine---definitely for -nodefaults, and I'd say in general too. The CMOS byte already has a way to communicate no-floppy (0, see cmos_get_fd_drive_type). Paolo
Re: [PATCH v2 04/25] target/i386: cc_op is not dynamic in gen_jcc1
On 5/6/24 01:09, Paolo Bonzini wrote: Resetting cc_op to CC_OP_DYNAMIC should be done at control flow junctions, which is not the case here. This translation block is ending and the only effect of calling set_cc_op() would be a discard of s->cc_srcT. This discard is useless (it's a temporary, not a global) and in fact prevents gen_prepare_cc from returning s->cc_srcT. Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v3 1/5] accel/tcg: Move SoftMMU specific units to softmmu_specific_ss[]
On 3/5/24 14:34, Philippe Mathieu-Daudé wrote: Currently these files are only used in system emulation, but could eventually be used by user emulation. Use the "softmmu_specific_ss" to express they are related to SoftMMU. Signed-off-by: Philippe Mathieu-Daudé --- accel/tcg/meson.build | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build index aef80de967..84826f043a 100644 --- a/accel/tcg/meson.build +++ b/accel/tcg/meson.build @@ -16,12 +16,13 @@ tcg_specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_false: files('user-exec-stub. if get_option('plugins') tcg_specific_ss.add(files('plugin-gen.c')) endif -specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_specific_ss) -specific_ss.add(when: ['CONFIG_SYSTEM_ONLY', 'CONFIG_TCG'], if_true: files( +softmmu_specific_ss = ss.source_set() +softmmu_specific_ss.add(files( 'cputlb.c', 'watchpoint.c', )) +tcg_specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: softmmu_specific_ss) Should be .add_all() here. system_ss.add(when: ['CONFIG_TCG'], if_true: files( 'icount-common.c', @@ -34,3 +35,5 @@ tcg_module_ss.add(when: ['CONFIG_SYSTEM_ONLY', 'CONFIG_TCG'], if_true: files( 'tcg-accel-ops-icount.c', 'tcg-accel-ops-rr.c', )) + +specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_specific_ss)
Re: More doc updates needed for new migrate argument @channels
On Mon, May 06, 2024 at 06:58:00AM +0200, Markus Armbruster wrote: > Peter Xu writes: > > [...] > > > I also copied qemu-devel starting from now. > > My bad, I messed that up! Het, do you want to send a patch? Thanks, -- Peter Xu
Re: [PULL 00/12] qemu-sparc queue 20240506
On 6/5/24 16:50, Michael Tokarev wrote: 06.05.2024 14:44, Mark Cave-Ayland wrote: Mark Cave-Ayland (1): hw/sparc64: set iommu_platform=on for virtio devices attached to the sun4u machine Richard Henderson (7): linux-user/sparc: Add more hwcap bits for sparc64 target/sparc: Fix FEXPAND target/sparc: Fix FMUL8x16 target/sparc: Fix FMUL8x16A{U,L} target/sparc: Fix FMULD8*X16 target/sparc: Fix FPMERGE target/sparc: Split out do_ms16b Should these "Fix" changes go to stable? If the backport is clean, otherwise they fix a 17yo bug so could just go in the next release IMHO. Thomas Huth (4): target/sparc/cpu: Rename the CPU models with a "+" in their names target/sparc/cpu: Avoid spaces by default in the CPU names docs/system/target-sparc: Improve the Sparc documentation docs/about: Deprecate the old "UltraSparc" CPU names that contain a "+" Thanks, /mjt
[PATCH 5/5] hw/mips/boston: Implement multi core support
Implement multiple physical core support by passing topology to CPS subsystem and generate cpu-map fdt node to decribe new topology. Signed-off-by: Jiaxun Yang --- hw/mips/boston.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/hw/mips/boston.c b/hw/mips/boston.c index 1b44fb354c..4ed7d366fe 100644 --- a/hw/mips/boston.c +++ b/hw/mips/boston.c @@ -542,7 +542,10 @@ static const void *create_fdt(BostonState *s, qemu_fdt_setprop_cell(fdt, "/cpus", "#size-cells", 0x0); qemu_fdt_setprop_cell(fdt, "/cpus", "#address-cells", 0x1); +qemu_fdt_add_subnode(fdt, "/cpus/cpu-map"); for (cpu = 0; cpu < ms->smp.cpus; cpu++) { +char *map_path; + name = g_strdup_printf("/cpus/cpu@%d", cpu); qemu_fdt_add_subnode(fdt, name); qemu_fdt_setprop_string(fdt, name, "compatible", "img,mips"); @@ -550,6 +553,27 @@ static const void *create_fdt(BostonState *s, qemu_fdt_setprop_cell(fdt, name, "reg", cpu); qemu_fdt_setprop_string(fdt, name, "device_type", "cpu"); qemu_fdt_setprop_cells(fdt, name, "clocks", clk_ph, FDT_BOSTON_CLK_CPU); +qemu_fdt_setprop_cell(fdt, name, "phandle", qemu_fdt_alloc_phandle(fdt)); + +if (ms->smp.threads > 1) { +map_path = g_strdup_printf( +"/cpus/cpu-map/socket%d/cluster%d/core%d/thread%d", +cpu / (ms->smp.clusters * ms->smp.cores * ms->smp.threads), +(cpu / (ms->smp.cores * ms->smp.threads)) % ms->smp.clusters, +(cpu / ms->smp.threads) % ms->smp.cores, +cpu % ms->smp.threads); +} else { +map_path = g_strdup_printf( +"/cpus/cpu-map/socket%d/cluster%d/core%d", +cpu / (ms->smp.clusters * ms->smp.cores), +(cpu / ms->smp.cores) % ms->smp.clusters, +cpu % ms->smp.cores); +} + +qemu_fdt_add_path(fdt, map_path); +qemu_fdt_setprop_phandle(fdt, map_path, "cpu", name); + +g_free(map_path); g_free(name); } @@ -591,6 +615,15 @@ static const void *create_fdt(BostonState *s, g_free(name); g_free(gic_name); +/* CM node */ +name = g_strdup_printf("/soc/cm@%" HWADDR_PRIx, memmap[BOSTON_CM].base); +qemu_fdt_add_subnode(fdt, name); +qemu_fdt_setprop_string(fdt, 
name, "compatible", "mti,mips-cm"); +qemu_fdt_setprop_cells(fdt, name, "reg", memmap[BOSTON_CM].base, +memmap[BOSTON_CM].size); +g_free(name); + + /* CDMM node */ name = g_strdup_printf("/soc/cdmm@%" HWADDR_PRIx, memmap[BOSTON_CDMM].base); qemu_fdt_add_subnode(fdt, name); @@ -703,7 +736,9 @@ static void boston_mach_init(MachineState *machine) object_initialize_child(OBJECT(machine), "cps", >cps, TYPE_MIPS_CPS); object_property_set_str(OBJECT(>cps), "cpu-type", machine->cpu_type, _fatal); -object_property_set_uint(OBJECT(>cps), "num-vp", machine->smp.cpus, +object_property_set_uint(OBJECT(>cps), "num-pcore", machine->smp.cores, +_fatal); +object_property_set_uint(OBJECT(>cps), "num-vp", machine->smp.threads, _fatal); qdev_connect_clock_in(DEVICE(>cps), "clk-in", qdev_get_clock_out(dev, "cpu-refclk")); -- 2.34.1
[PATCH 2/5] hw/misc/mips_cmgcr: Implement multicore functions
We implemented following functions to allow software to probe and control VPs on secondary core - Reading out pcore count and coherence state - Two scratch GCRs for firmware - Semaphore GCR for register locking - Redirect block to other cores Signed-off-by: Jiaxun Yang --- hw/misc/mips_cmgcr.c | 168 +++ include/hw/misc/mips_cmgcr.h | 87 +++--- 2 files changed, 215 insertions(+), 40 deletions(-) diff --git a/hw/misc/mips_cmgcr.c b/hw/misc/mips_cmgcr.c index 2703040f45..8c2d184f2c 100644 --- a/hw/misc/mips_cmgcr.c +++ b/hw/misc/mips_cmgcr.c @@ -73,14 +73,19 @@ static inline void update_gic_base(MIPSGCRState *gcr, uint64_t val) static uint64_t gcr_read(void *opaque, hwaddr addr, unsigned size) { MIPSGCRState *gcr = (MIPSGCRState *) opaque; -MIPSGCRVPState *current_vps = >vps[current_cpu->cpu_index]; -MIPSGCRVPState *other_vps = >vps[current_vps->other]; +int redirect_corenum = mips_gcr_get_redirect_corenum(gcr); +int redirect_vpid = mips_gcr_get_redirect_vpid(gcr); +int current_corenum = mips_gcr_get_current_corenum(gcr); +int current_vpid = mips_gcr_get_current_vpid(gcr); +MIPSGCRPCoreState *current_pcore = >pcs[current_corenum]; +MIPSGCRVPState *current_vps = _pcore->vps[current_vpid]; +MIPSGCRPCoreState *other_pcore = >pcs[redirect_corenum]; +MIPSGCRVPState *other_vps = _pcore->vps[redirect_vpid]; switch (addr) { /* Global Control Block Register */ case GCR_CONFIG_OFS: -/* Set PCORES to 0 */ -return 0; +return gcr->num_pcores - 1; case GCR_BASE_OFS: return gcr->gcr_base; case GCR_REV_OFS: @@ -96,7 +101,19 @@ static uint64_t gcr_read(void *opaque, hwaddr addr, unsigned size) case GCR_L2_CONFIG_OFS: /* L2 BYPASS */ return GCR_L2_CONFIG_BYPASS_MSK; +case GCR_SYS_CONFIG2_OFS: +return gcr->num_vps << GCR_SYS_CONFIG2_MAXVP_SHF; +case GCR_SCRATCH0_OFS: +return gcr->scratch[0]; +case GCR_SCRATCH1_OFS: +return gcr->scratch[1]; +case GCR_SEM_OFS: +return gcr->sem; /* Core-Local and Core-Other Control Blocks */ +case MIPS_CLCB_OFS + GCR_CL_COH_EN_OFS: +return 
current_pcore->coh_en; +case MIPS_COCB_OFS + GCR_CL_COH_EN_OFS: +return other_pcore->coh_en; case MIPS_CLCB_OFS + GCR_CL_CONFIG_OFS: case MIPS_COCB_OFS + GCR_CL_CONFIG_OFS: /* Set PVP to # of VPs - 1 */ @@ -105,10 +122,18 @@ static uint64_t gcr_read(void *opaque, hwaddr addr, unsigned size) return current_vps->reset_base; case MIPS_COCB_OFS + GCR_CL_RESETBASE_OFS: return other_vps->reset_base; -case MIPS_CLCB_OFS + GCR_CL_OTHER_OFS: -return current_vps->other; -case MIPS_COCB_OFS + GCR_CL_OTHER_OFS: -return other_vps->other; +case MIPS_CLCB_OFS + GCR_CL_REDIRECT_OFS: +return current_vps->redirect; +case MIPS_COCB_OFS + GCR_CL_REDIRECT_OFS: +return other_vps->redirect; +case MIPS_CLCB_OFS + GCR_CL_ID_OFS: +return current_corenum; +case MIPS_COCB_OFS + GCR_CL_ID_OFS: +return redirect_corenum; +case MIPS_CLCB_OFS + GCR_CL_SCRATCH_OFS: +return current_vps->scratch; +case MIPS_COCB_OFS + GCR_CL_SCRATCH_OFS: +return other_vps->scratch; default: qemu_log_mask(LOG_UNIMP, "Read %d bytes at GCR offset 0x%" HWADDR_PRIx "\n", size, addr); @@ -123,12 +148,36 @@ static inline target_ulong get_exception_base(MIPSGCRVPState *vps) return (int32_t)(vps->reset_base & GCR_CL_RESET_BASE_RESETBASE_MSK); } +static inline void set_redirect(MIPSGCRState *gcr, +MIPSGCRVPState *vps, target_ulong data) +{ +int new_vpid = data & GCR_CL_REDIRECT_VP_MSK; +int new_coreid = (data & GCR_CL_REDIRECT_CORE_MSK) >> GCR_CL_REDIRECT_CORE_SHF; + +if (new_vpid >= gcr->num_vps) { +return; +} + +if (new_coreid >= gcr->num_pcores) { +return; +} + +vps->redirect = data & (GCR_CL_REDIRECT_VP_MSK | GCR_CL_REDIRECT_CORE_MSK); +} + /* Write GCR registers */ static void gcr_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { -MIPSGCRState *gcr = (MIPSGCRState *)opaque; -MIPSGCRVPState *current_vps = >vps[current_cpu->cpu_index]; -MIPSGCRVPState *other_vps = >vps[current_vps->other]; +MIPSGCRState *gcr = (MIPSGCRState *) opaque; +int redirect_corenum = mips_gcr_get_redirect_corenum(gcr); +int 
redirect_vpid = mips_gcr_get_redirect_vpid(gcr); +int redirect_vpnum = mips_gcr_get_redirect_vpnum(gcr); +int current_corenum = mips_gcr_get_current_corenum(gcr); +int current_vpid = mips_gcr_get_current_vpid(gcr); +MIPSGCRPCoreState *current_pcore = >pcs[current_corenum]; +MIPSGCRVPState *current_vps = _pcore->vps[current_vpid]; +MIPSGCRPCoreState *other_pcore = >pcs[redirect_corenum]; +MIPSGCRVPState *other_vps =
[PATCH 1/5] target/mips: Make globalnumber a CPU property
GlobalNumber marks topology information of a CPU instance. Make it a CPU property to allow CPS to override topology information. Signed-off-by: Jiaxun Yang --- target/mips/cpu.c| 16 +++- target/mips/cpu.h| 10 +- target/mips/sysemu/machine.c | 5 ++--- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/target/mips/cpu.c b/target/mips/cpu.c index bbe01d07dd..762000d09b 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -296,7 +296,6 @@ static void mips_cpu_reset_hold(Object *obj, ResetType type) env->CP0_Random = env->tlb->nb_tlb - 1; env->tlb->tlb_in_use = env->tlb->nb_tlb; env->CP0_Wired = 0; -env->CP0_GlobalNumber = (cs->cpu_index & 0xFF) << CP0GN_VPId; env->CP0_EBase = KSEG0_BASE | (cs->cpu_index & 0x3FF); if (env->CP0_Config3 & (1 << CP0C3_CMGCR)) { env->CP0_CMGCRBase = 0x1fbf8000 >> 4; @@ -484,6 +483,12 @@ static void mips_cpu_realizefn(DeviceState *dev, Error **errp) env->exception_base = (int32_t)0xBFC0; +#if !defined(CONFIG_USER_ONLY) +if (env->CP0_GlobalNumber == -1) { +env->CP0_GlobalNumber = (cs->cpu_index & 0xFF) << CP0GN_VPId; +} +#endif + #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) mmu_init(env, env->cpu_model); #endif @@ -563,6 +568,13 @@ static const TCGCPUOps mips_tcg_ops = { }; #endif /* CONFIG_TCG */ +static Property mips_cpu_properties[] = { +#if !defined(CONFIG_USER_ONLY) +DEFINE_PROP_INT32("globalnumber", MIPSCPU, env.CP0_GlobalNumber, -1), +#endif +DEFINE_PROP_END_OF_LIST(), +}; + static void mips_cpu_class_init(ObjectClass *c, void *data) { MIPSCPUClass *mcc = MIPS_CPU_CLASS(c); @@ -592,6 +604,8 @@ static void mips_cpu_class_init(ObjectClass *c, void *data) #ifdef CONFIG_TCG cc->tcg_ops = _tcg_ops; #endif /* CONFIG_TCG */ + +device_class_set_props(dc, mips_cpu_properties); } static const TypeInfo mips_cpu_type_info = { diff --git a/target/mips/cpu.h b/target/mips/cpu.h index 3e906a175a..7499608678 100644 --- a/target/mips/cpu.h +++ b/target/mips/cpu.h @@ -612,8 +612,13 @@ typedef struct CPUArchState { # define 
CP0EnLo_RI 31 # define CP0EnLo_XI 30 #endif -int32_t CP0_GlobalNumber; +/* CP0_GlobalNumber is preserved across CPU reset. */ #define CP0GN_VPId 0 +#define CP0GN_VPId_MASK (0xFFUL << CP0GN_VPId) +#define CP0GN_CoreNum 8 +#define CP0GN_CoreNum_MASK (0xFUL << CP0GN_CoreNum) +#define CP0GN_ClusterNum 16 +#define CP0GN_ClusterNum_MASK (0xFUL << CP0GN_ClusterNum) /* * CP0 Register 4 */ @@ -1175,6 +1180,9 @@ typedef struct CPUArchState { struct {} end_reset_fields; /* Fields from here on are preserved across CPU reset. */ +#if !defined(CONFIG_USER_ONLY) +int32_t CP0_GlobalNumber; +#endif CPUMIPSMVPContext *mvp; #if !defined(CONFIG_USER_ONLY) CPUMIPSTLBContext *tlb; diff --git a/target/mips/sysemu/machine.c b/target/mips/sysemu/machine.c index 213fd637fc..235d640862 100644 --- a/target/mips/sysemu/machine.c +++ b/target/mips/sysemu/machine.c @@ -218,8 +218,8 @@ static const VMStateDescription vmstate_tlb = { const VMStateDescription vmstate_mips_cpu = { .name = "cpu", -.version_id = 21, -.minimum_version_id = 21, +.version_id = 22, +.minimum_version_id = 22, .post_load = cpu_post_load, .fields = (const VMStateField[]) { /* Active TC */ @@ -257,7 +257,6 @@ const VMStateDescription vmstate_mips_cpu = { VMSTATE_INT32(env.CP0_VPEOpt, MIPSCPU), VMSTATE_UINT64(env.CP0_EntryLo0, MIPSCPU), VMSTATE_UINT64(env.CP0_EntryLo1, MIPSCPU), -VMSTATE_INT32(env.CP0_GlobalNumber, MIPSCPU), VMSTATE_UINTTL(env.CP0_Context, MIPSCPU), VMSTATE_INT32(env.CP0_MemoryMapID, MIPSCPU), VMSTATE_INT32(env.CP0_PageMask, MIPSCPU), -- 2.34.1
[PATCH 0/5] hw/mips: Proper multi core support
Hi all, This series implemented propper multiple core support for MIPS CPS systsm. Previously all CPUs are being implemented as a smt thread in a single core. Now it respects topology supplied in QEMU args. To test: Build a latest kernel with 64r6el_defconfig (tested on 6.6, next-20240506). Then run: ``` qemu-system-mips64el -M boston -cpu I6500 -kernel ~/linux-next/vmlinux -smp 4,cores=2,threads=2 -append "console=ttyS0,115200" -nographic ``` In dmesg of guest kernel: ``` [0.00] VP topology {2,2} total 4 ... [0.085190] smp: Bringing up secondary CPUs ... [0.090219] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes. [0.095461] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 bytes [0.096658] CPU1 revision is: 0001b000 (MIPS I6500) [0.096718] FPU revision is: 20f30300 [0.124711] Synchronize counters for CPU 1: done. [0.940979] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes. [0.941041] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 bytes [0.941256] CPU2 revision is: 0001b000 (MIPS I6500) [0.941289] FPU revision is: 20f30300 [0.965322] Synchronize counters for CPU 2: done. [1.260937] Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes. [1.261001] Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 bytes [1.261172] CPU3 revision is: 0001b000 (MIPS I6500) [1.261209] FPU revision is: 20f30300 [1.285390] Synchronize counters for CPU 3: done. [1.346985] smp: Brought up 1 node, 4 CPUs ``` Please review. 
Thanks To: qemu-devel@nongnu.org Cc: Philippe Mathieu-Daudé Signed-off-by: Jiaxun Yang --- Jiaxun Yang (5): target/mips: Make globalnumber a CPU property hw/msic/mips_cmgcr: Implement multicore functions hw/msic/mips_cpc: Implement multi core support hw/mips/cps: Implement multi core support hw/mips/boston: Implement multi core support hw/mips/boston.c | 37 +- hw/mips/cps.c| 66 ++--- hw/misc/mips_cmgcr.c | 168 +++ hw/misc/mips_cpc.c | 97 ++--- include/hw/mips/cps.h| 1 + include/hw/misc/mips_cmgcr.h | 87 +++--- include/hw/misc/mips_cpc.h | 15 +++- target/mips/cpu.c| 16 - target/mips/cpu.h| 10 ++- target/mips/sysemu/machine.c | 5 +- 10 files changed, 403 insertions(+), 99 deletions(-) --- base-commit: 248f6f62df073a3b4158fd0093863ab885feabb5 change-id: 20240506-mips-smp-9af9e71ad8c2 Best regards, -- Jiaxun Yang
[PATCH 4/5] hw/mips/cps: Implement multi core support
Implement multiple physical core support by creating CPU devices accorading to the new topology and passing pcore/vp information to CPC and CMGCR sub-devices. Signed-off-by: Jiaxun Yang --- hw/mips/cps.c | 66 +++ include/hw/mips/cps.h | 1 + 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/hw/mips/cps.c b/hw/mips/cps.c index 07b73b0a1f..6cf02379a9 100644 --- a/hw/mips/cps.c +++ b/hw/mips/cps.c @@ -73,31 +73,43 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) return; } -for (int i = 0; i < s->num_vp; i++) { -MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type)); -CPUMIPSState *env = >env; +object_initialize_child(OBJECT(dev), "gcr", >gcr, TYPE_MIPS_GCR); -/* All VPs are halted on reset. Leave powering up to CPC. */ -object_property_set_bool(OBJECT(cpu), "start-powered-off", true, - _abort); +for (int corenum = 0; corenum < s->num_pcore; corenum++) { +for (int vpid = 0; vpid < s->num_vp; vpid++) { +int vpnum = corenum * s->num_vp + vpid; +int32_t globalnumber = (corenum << CP0GN_CoreNum) | vpid; -/* All cores use the same clock tree */ -qdev_connect_clock_in(DEVICE(cpu), "clk-in", s->clock); +MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type)); +CPUMIPSState *env = >env; -if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { -return; -} +/* All VPs are halted on reset. Leave powering up to CPC. 
*/ +object_property_set_bool(OBJECT(cpu), "start-powered-off", true, +_abort); + +object_property_set_int(OBJECT(cpu), "globalnumber", globalnumber, +_abort); + +/* All cores use the same clock tree */ +qdev_connect_clock_in(DEVICE(cpu), "clk-in", s->clock); + +if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { +return; +} + +g_assert(vpnum == CPU(cpu)->cpu_index); -/* Init internal devices */ -cpu_mips_irq_init_cpu(cpu); -cpu_mips_clock_init(cpu); +/* Init internal devices */ +cpu_mips_irq_init_cpu(cpu); +cpu_mips_clock_init(cpu); -if (cpu_mips_itu_supported(env)) { -itu_present = true; -/* Attach ITC Tag to the VP */ -env->itc_tag = mips_itu_get_tag_region(>itu); +if (cpu_mips_itu_supported(env)) { +itu_present = true; +/* Attach ITC Tag to the VP */ +env->itc_tag = mips_itu_get_tag_region(>itu); +} +qemu_register_reset(main_cpu_reset, cpu); } -qemu_register_reset(main_cpu_reset, cpu); } /* Inter-Thread Communication Unit */ @@ -119,8 +131,12 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) object_initialize_child(OBJECT(dev), "cpc", >cpc, TYPE_MIPS_CPC); object_property_set_uint(OBJECT(>cpc), "num-vp", s->num_vp, _abort); +object_property_set_uint(OBJECT(>cpc), "num-pcore", s->num_pcore, +_abort); object_property_set_int(OBJECT(>cpc), "vp-start-running", 1, _abort); +object_property_set_link(OBJECT(>cpc), "gcr", OBJECT(>gcr), +_abort); if (!sysbus_realize(SYS_BUS_DEVICE(>cpc), errp)) { return; } @@ -130,7 +146,7 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) /* Global Interrupt Controller */ object_initialize_child(OBJECT(dev), "gic", >gic, TYPE_MIPS_GIC); -object_property_set_uint(OBJECT(>gic), "num-vp", s->num_vp, +object_property_set_uint(OBJECT(>gic), "num-vp", s->num_vp * s->num_pcore, _abort); object_property_set_uint(OBJECT(>gic), "num-irq", 128, _abort); @@ -141,16 +157,13 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(>container, 0, 
sysbus_mmio_get_region(SYS_BUS_DEVICE(>gic), 0)); -/* Global Configuration Registers */ -gcr_base = MIPS_CPU(first_cpu)->env.CP0_CMGCRBase << 4; - -object_initialize_child(OBJECT(dev), "gcr", >gcr, TYPE_MIPS_GCR); +gcr_base = GCR_BASE_ADDR; +object_property_set_uint(OBJECT(>gcr), "num-pcores", s->num_pcore, +_abort); object_property_set_uint(OBJECT(>gcr), "num-vp", s->num_vp, _abort); object_property_set_int(OBJECT(>gcr), "gcr-rev", 0x800, _abort); -object_property_set_int(OBJECT(>gcr), "gcr-base", gcr_base, -_abort); object_property_set_link(OBJECT(>gcr), "gic", OBJECT(>gic.mr),
[PATCH 3/5] hw/misc/mips_cpc: Implement multi core support
Implement multiple physical core support for MIPS CPC controller. Including some R/O configuration registers and VP bring up support on multiple cores. Signed-off-by: Jiaxun Yang --- hw/misc/mips_cpc.c | 97 ++ include/hw/misc/mips_cpc.h | 15 ++- 2 files changed, 85 insertions(+), 27 deletions(-) diff --git a/hw/misc/mips_cpc.c b/hw/misc/mips_cpc.c index 1e8fd2e699..f6a2f29088 100644 --- a/hw/misc/mips_cpc.c +++ b/hw/misc/mips_cpc.c @@ -25,9 +25,25 @@ #include "hw/sysbus.h" #include "migration/vmstate.h" +#include "hw/misc/mips_cmgcr.h" #include "hw/misc/mips_cpc.h" #include "hw/qdev-properties.h" +static inline int cpc_vpnum_to_corenum(MIPSCPCState *cpc, int vpnum) +{ +return vpnum / cpc->num_vp; +} + +static inline int cpc_vpnum_to_vpid(MIPSCPCState *cpc, int vpnum) +{ +return vpnum % cpc->num_vp; +} + +static inline MIPSCPCPCoreState *cpc_vpnum_to_pcs(MIPSCPCState *cpc, int vpnum) +{ +return >pcs[cpc_vpnum_to_corenum(cpc, vpnum)]; +} + static inline uint64_t cpc_vp_run_mask(MIPSCPCState *cpc) { return (1ULL << cpc->num_vp) - 1; @@ -39,36 +55,41 @@ static void mips_cpu_reset_async_work(CPUState *cs, run_on_cpu_data data) cpu_reset(cs); cs->halted = 0; -cpc->vp_running |= 1ULL << cs->cpu_index; +cpc_vpnum_to_pcs(cpc, cs->cpu_index)->vp_running |= +1 << cpc_vpnum_to_vpid(cpc, cs->cpu_index); } -static void cpc_run_vp(MIPSCPCState *cpc, uint64_t vp_run) +static void cpc_run_vp(MIPSCPCState *cpc, int pcore, uint64_t vp_run) { -CPUState *cs = first_cpu; +MIPSCPCPCoreState *pcs = >pcs[pcore]; -CPU_FOREACH(cs) { -uint64_t i = 1ULL << cs->cpu_index; -if (i & vp_run & ~cpc->vp_running) { +for (int vpid = 0; vpid < cpc->num_vp; vpid++) { +if ((1 << vpid) & vp_run & ~pcs->vp_running) { +int vpnum = pcore * cpc->num_vp + vpid; /* * To avoid racing with a CPU we are just kicking off. * We do the final bit of preparation for the work in * the target CPUs context. 
*/ -async_safe_run_on_cpu(cs, mips_cpu_reset_async_work, - RUN_ON_CPU_HOST_PTR(cpc)); +async_safe_run_on_cpu(qemu_get_cpu(vpnum), +mips_cpu_reset_async_work, +RUN_ON_CPU_HOST_PTR(cpc)); +pcs->vp_running |= 1 << vpid; } } } -static void cpc_stop_vp(MIPSCPCState *cpc, uint64_t vp_stop) +static void cpc_stop_vp(MIPSCPCState *cpc, int pcore, uint64_t vp_stop) { -CPUState *cs = first_cpu; +MIPSCPCPCoreState *pcs = >pcs[pcore]; + +for (int vpid = 0; vpid < cpc->num_vp; vpid++) { +if ((1 << vpid) & vp_stop & pcs->vp_running) { +int vpnum = pcore * cpc->num_vp + vpid; +CPUState *cs = qemu_get_cpu(vpnum); -CPU_FOREACH(cs) { -uint64_t i = 1ULL << cs->cpu_index; -if (i & vp_stop & cpc->vp_running) { cpu_interrupt(cs, CPU_INTERRUPT_HALT); -cpc->vp_running &= ~i; +pcs->vp_running &= ~(1 << vpid); } } } @@ -77,15 +98,20 @@ static void cpc_write(void *opaque, hwaddr offset, uint64_t data, unsigned size) { MIPSCPCState *s = opaque; +int current_corenum = cpc_vpnum_to_corenum(s, current_cpu->cpu_index); switch (offset) { case CPC_CL_BASE_OFS + CPC_VP_RUN_OFS: +cpc_run_vp(s, current_corenum, data); +break; case CPC_CO_BASE_OFS + CPC_VP_RUN_OFS: -cpc_run_vp(s, data & cpc_vp_run_mask(s)); +cpc_run_vp(s, mips_gcr_get_redirect_corenum(s->gcr), data); break; case CPC_CL_BASE_OFS + CPC_VP_STOP_OFS: +cpc_stop_vp(s, current_corenum, data); +break; case CPC_CO_BASE_OFS + CPC_VP_STOP_OFS: -cpc_stop_vp(s, data & cpc_vp_run_mask(s)); +cpc_stop_vp(s, mips_gcr_get_redirect_corenum(s->gcr), data); break; default: qemu_log_mask(LOG_UNIMP, @@ -101,9 +127,13 @@ static uint64_t cpc_read(void *opaque, hwaddr offset, unsigned size) MIPSCPCState *s = opaque; switch (offset) { +case CPC_CL_BASE_OFS + CPC_CL_STAT_CONF_OFS: +case CPC_CO_BASE_OFS + CPC_CL_STAT_CONF_OFS: +return CPC_CL_STAT_CONF_SEQ_STATE_U6 << CPC_CL_STAT_CONF_SEQ_STATE_SHF; case CPC_CL_BASE_OFS + CPC_VP_RUNNING_OFS: +return cpc_vpnum_to_pcs(s, current_cpu->cpu_index)->vp_running; case CPC_CO_BASE_OFS + CPC_VP_RUNNING_OFS: -return 
s->vp_running; +return s->pcs[mips_gcr_get_redirect_corenum(s->gcr)].vp_running; default: qemu_log_mask(LOG_UNIMP, "%s: Bad offset 0x%x\n", __func__, (int)offset); @@ -137,9 +167,11 @@ static void mips_cpc_realize(DeviceState *dev, Error **errp) if (s->vp_start_running > cpc_vp_run_mask(s)) { error_setg(errp,
Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
On Mon, May 06, 2024 at 12:08:43PM +0200, Jinpu Wang wrote: > Hi Peter, hi Daniel, Hi, Jinpu, Thanks for sharing this test results. Sounds like a great news. What's your plan next? Would it then be worthwhile / possible moving QEMU into that direction? Would that greatly simplify rdma code as Dan mentioned? Thanks, > > On Fri, May 3, 2024 at 4:33 PM Peter Xu wrote: > > > > On Fri, May 03, 2024 at 08:40:03AM +0200, Jinpu Wang wrote: > > > I had a brief check in the rsocket changelog, there seems some > > > improvement over time, > > > might be worth revisiting this. due to socket abstraction, we can't > > > use some feature like > > > ODP, it won't be a small and easy task. > > > > It'll be good to know whether Dan's suggestion would work first, without > > rewritting everything yet so far. Not sure whether some perf test could > > help with the rsocket APIs even without QEMU's involvements (or looking for > > test data supporting / invalidate such conversions). > > > I did a quick test with iperf on 100 G environment and 40 G > environment, in summary rsocket works pretty well. > > iperf tests between 2 hosts with 40 G (IB), > first a few test with different num. of threads on top of ipoib > interface, later with preload rsocket on top of same ipoib interface. 
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 > > Client connecting to 10.43.3.145, TCP port 52000 > TCP window size: 165 KByte (default) > > [ 3] local 10.43.3.146 port 55602 connected with 10.43.3.145 port 52000 > [ ID] Interval Transfer Bandwidth > [ 3] 0.-10.0001 sec 2.85 GBytes 2.44 Gbits/sec > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 2 > > Client connecting to 10.43.3.145, TCP port 52000 > TCP window size: 165 KByte (default) > > [ 4] local 10.43.3.146 port 39640 connected with 10.43.3.145 port 52000 > [ 3] local 10.43.3.146 port 39626 connected with 10.43.3.145 port 52000 > [ ID] Interval Transfer Bandwidth > [ 3] 0.-10.0012 sec 2.85 GBytes 2.45 Gbits/sec > [ 4] 0.-10.0026 sec 2.86 GBytes 2.45 Gbits/sec > [SUM] 0.-10.0026 sec 5.71 GBytes 4.90 Gbits/sec > [ CT] final connect times (min/avg/max/stdev) = > 0.281/0.300/0.318/0.318 ms (tot/err) = 2/0 > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 4 > > Client connecting to 10.43.3.145, TCP port 52000 > TCP window size: 165 KByte (default) > > [ 4] local 10.43.3.146 port 46956 connected with 10.43.3.145 port 52000 > [ 6] local 10.43.3.146 port 46978 connected with 10.43.3.145 port 52000 > [ 3] local 10.43.3.146 port 46944 connected with 10.43.3.145 port 52000 > [ 5] local 10.43.3.146 port 46962 connected with 10.43.3.145 port 52000 > [ ID] Interval Transfer Bandwidth > [ 3] 0.-10.0017 sec 2.85 GBytes 2.45 Gbits/sec > [ 4] 0.-10.0015 sec 2.85 GBytes 2.45 Gbits/sec > [ 5] 0.-10.0026 sec 2.85 GBytes 2.45 Gbits/sec > [ 6] 0.-10.0005 sec 2.85 GBytes 2.45 Gbits/sec > [SUM] 0.-10.0005 sec 11.4 GBytes 9.80 Gbits/sec > [ CT] final connect times (min/avg/max/stdev) = > 0.274/0.312/0.360/0.212 ms (tot/err) = 4/0 > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 8 > > Client connecting to 10.43.3.145, TCP port 52000 > TCP window size: 165 KByte (default) > > [ 7] local 10.43.3.146 port 35062 connected with 10.43.3.145 port 52000 > [ 6] local 10.43.3.146 port 35058 connected 
with 10.43.3.145 port 52000 > [ 8] local 10.43.3.146 port 35066 connected with 10.43.3.145 port 52000 > [ 9] local 10.43.3.146 port 35074 connected with 10.43.3.145 port 52000 > [ 3] local 10.43.3.146 port 35038 connected with 10.43.3.145 port 52000 > [ 12] local 10.43.3.146 port 35088 connected with 10.43.3.145 port 52000 > [ 5] local 10.43.3.146 port 35048 connected with 10.43.3.145 port 52000 > [ 4] local 10.43.3.146 port 35050 connected with 10.43.3.145 port 52000 > [ ID] Interval Transfer Bandwidth > [ 4] 0.-10.0005 sec 2.85 GBytes 2.44 Gbits/sec > [ 8] 0.-10.0011 sec 2.85 GBytes 2.45 Gbits/sec > [ 5] 0.-10. sec 2.85 GBytes 2.45 Gbits/sec > [ 12] 0.-10.0021 sec 2.85 GBytes 2.44 Gbits/sec > [ 3] 0.-10.0003 sec 2.85 GBytes 2.44 Gbits/sec > [ 7] 0.-10.0065 sec 2.50 GBytes 2.14 Gbits/sec > [ 9] 0.-10.0077 sec 2.52 GBytes 2.16 Gbits/sec > [ 6] 0.-10.0003 sec 2.85 GBytes 2.44 Gbits/sec > [SUM] 0.-10.0003 sec 22.1 GBytes 19.0 Gbits/sec > [ CT] final connect times (min/avg/max/stdev) = > 0.096/0.226/0.339/0.109 ms
Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
On Mon, May 06, 2024 at 02:06:28AM +, Gonglei (Arei) wrote: > Hi, Peter Hey, Lei, Happy to see you around again after years. > RDMA features high bandwidth, low latency (in non-blocking lossless > network), and direct remote memory access by bypassing the CPU (As you > know, CPU resources are expensive for cloud vendors, which is one of the > reasons why we introduced offload cards.), which TCP does not have. It's another cost to use offload cards, v.s. preparing more cpu resources? > In some scenarios where fast live migration is needed (extremely short > interruption duration and migration duration) is very useful. To this > end, we have also developed RDMA support for multifd. Will any of you upstream that work? I'm curious how intrusive would it be when adding it to multifd, if it can keep only 5 exported functions like what rdma.h does right now it'll be pretty nice. We also want to make sure it works with arbitrary sized loads and buffers, e.g. vfio is considering to add IO loads to multifd channels too. One thing to note that the question here is not about a pure performance comparison between rdma and nics only. It's about help us make a decision on whether to drop rdma, iow, even if rdma performs well, the community still has the right to drop it if nobody can actively work and maintain it. It's just that if nics can perform as good it's more a reason to drop, unless companies can help to provide good support and work together. Thanks, -- Peter Xu
Re: [PATCH 2/3] vfio/migration: Emit VFIO device migration state change QAPI event
On Mon, May 06, 2024 at 11:38:00AM -0300, Fabiano Rosas wrote: > Markus Armbruster writes: > > > Peter, Fabiano, I'd like to hear your opinion on the issue discussed > > below. > > > > Avihai Horon writes: > > > >> On 02/05/2024 13:22, Joao Martins wrote: > >>> External email: Use caution opening links or attachments > >>> > >>> > >>> On 01/05/2024 13:28, Avihai Horon wrote: > On 01/05/2024 14:50, Joao Martins wrote: > > External email: Use caution opening links or attachments > > > > > > On 30/04/2024 06:16, Avihai Horon wrote: > >> Emit VFIO device migration state change QAPI event when a VFIO device > >> changes its migration state. This can be used by management > >> applications > >> to get updates on the current state of the VFIO device for their own > >> purposes. > >> > >> A new per VFIO device capability, "migration-events", is added so > >> events > >> can be enabled only for the required devices. It is disabled by > >> default. > >> > >> Signed-off-by: Avihai Horon > >> --- > >>include/hw/vfio/vfio-common.h | 1 + > >>hw/vfio/migration.c | 44 > >> +++ > >>hw/vfio/pci.c | 2 ++ > >>3 files changed, 47 insertions(+) > >> > >> diff --git a/include/hw/vfio/vfio-common.h > >> b/include/hw/vfio/vfio-common.h > >> index b9da6c08ef..3ec5f2425e 100644 > >> --- a/include/hw/vfio/vfio-common.h > >> +++ b/include/hw/vfio/vfio-common.h > >> @@ -115,6 +115,7 @@ typedef struct VFIODevice { > >>bool no_mmap; > >>bool ram_block_discard_allowed; > >>OnOffAuto enable_migration; > >> +bool migration_events; > >>VFIODeviceOps *ops; > >>unsigned int num_irqs; > >>unsigned int num_regions; > >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > >> index 06ae40969b..6bbccf6545 100644 > >> --- a/hw/vfio/migration.c > >> +++ b/hw/vfio/migration.c > >> @@ -24,6 +24,7 @@ > >>#include "migration/register.h" > >>#include "migration/blocker.h" > >>#include "qapi/error.h" > >> +#include "qapi/qapi-events-vfio.h" > >>#include "exec/ramlist.h" > >>#include "exec/ram_addr.h" > 
>>#include "pci.h" > >> @@ -80,6 +81,46 @@ static const char *mig_state_to_str(enum > >> vfio_device_mig_state state) > >>} > >>} > >> > >> +static VFIODeviceMigState > >> +mig_state_to_qapi_state(enum vfio_device_mig_state state) > >> +{ > >> +switch (state) { > >> +case VFIO_DEVICE_STATE_STOP: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_STOP; > >> +case VFIO_DEVICE_STATE_RUNNING: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_RUNNING; > >> +case VFIO_DEVICE_STATE_STOP_COPY: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_STOP_COPY; > >> +case VFIO_DEVICE_STATE_RESUMING: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_RESUMING; > >> +case VFIO_DEVICE_STATE_RUNNING_P2P: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_RUNNING_P2P; > >> +case VFIO_DEVICE_STATE_PRE_COPY: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_PRE_COPY; > >> +case VFIO_DEVICE_STATE_PRE_COPY_P2P: > >> +return QAPI_VFIO_DEVICE_MIG_STATE_PRE_COPY_P2P; > >> +default: > >> +g_assert_not_reached(); > >> +} > >> +} > >> + > >> +static void vfio_migration_send_state_change_event(VFIODevice > >> *vbasedev) > >> +{ > >> +VFIOMigration *migration = vbasedev->migration; > >> +const char *id; > >> +Object *obj; > >> + > >> +if (!vbasedev->migration_events) { > >> +return; > >> +} > >> + > > Shouldn't this leap frog migrate_events() capability instead of > > introducing its > > vfio equivalent i.e. > > > > if (!migrate_events()) { > > return; > > } > > > > ? > > I used a per VFIO device cap so the events can be fine tuned for each > device > (maybe one device should send events while the other not). > This gives the most flexibility and I don't think it complicates the > configuration (one downside, though, is that it can't be enabled/disabled > dynamically during runtime). > > >>> Right. > >>> > I don't think events add much overhead, so if you prefer a global cap, I > can > change it. > However, I'm not sure overloading the existing migrate_events() is valid? 
> > >>> migration 'events' capability just means we will have some migration > >>> events > >>> emited via QAPI monitor for: 1) general global status and 2) for each > >>> migration > >>> pass (both with different event names=. > >> > >> Yes, it's already overloaded. > >> > >> In migration QAPI it
[PATCH 5/6] vhost, vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost devices. The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these devices ensures that the backend is capable of offering and providing support for this feature, and that it can be disabled if the backend does not support it. Tested-by: Lei Yang Acked-by: Eugenio Pérez Signed-off-by: Jonah Palmer --- hw/block/vhost-user-blk.c| 1 + hw/net/vhost_net.c | 2 ++ hw/scsi/vhost-scsi.c | 1 + hw/scsi/vhost-user-scsi.c| 1 + hw/virtio/vhost-user-fs.c| 1 + hw/virtio/vhost-user-vsock.c | 1 + net/vhost-vdpa.c | 1 + 7 files changed, 8 insertions(+) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 9e6bbc6950..1dd0a8ef63 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -51,6 +51,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index fd1a93701a..eb0b1c06e5 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VIRTIO_NET_F_HASH_REPORT, VHOST_INVALID_FEATURE_BIT }; @@ -76,6 +77,7 @@ static const int user_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_GUEST_USO4, diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index ae26bc19a4..40e7630191 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index a63b1f4948..1d59951ab7 100644 --- a/hw/scsi/vhost-user-scsi.c +++ 
b/hw/scsi/vhost-user-scsi.c @@ -36,6 +36,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index cca2cd41be..9243dbb128 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -33,6 +33,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, +VIRTIO_F_IN_ORDER, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c index 9431b9792c..cc7e4e47b4 100644 --- a/hw/virtio/vhost-user-vsock.c +++ b/hw/virtio/vhost-user-vsock.c @@ -21,6 +21,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_INDIRECT_DESC, VIRTIO_RING_F_EVENT_IDX, VIRTIO_F_NOTIFY_ON_EMPTY, +VIRTIO_F_IN_ORDER, VHOST_INVALID_FEATURE_BIT }; diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 85e73dd6a7..ed3185acfa 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, VIRTIO_F_VERSION_1, +VIRTIO_F_IN_ORDER, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, VIRTIO_NET_F_CTRL_MAC_ADDR, -- 2.39.3
[PATCH 0/6] virtio,vhost: Add VIRTIO_F_IN_ORDER support
The goal of these patches is to add support to a variety of virtio and vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature indicates that all buffers are used by the device in the same order in which they were made available by the driver. These patches attempt to implement a generalized, non-device-specific solution to support this feature. The core feature behind this solution is a buffer mechanism in the form of a VirtQueue's used_elems VirtQueueElement array. This allows devices that always use buffers in-order by default to have a minimal overhead impact. Devices that may not always use buffers in-order likely will experience a performance hit. How large that performance hit is will depend on how frequently elements are completed out-of-order. A VirtQueue whose device uses this feature will use its used_elems VirtQueueElement array to hold used VirtQueueElements. The index that used elements are placed in used_elems is the same index on the used/descriptor ring that would satisfy the in-order requirement. In other words, used elements are placed in their in-order locations on used_elems and are only written to the used/descriptor ring once the elements on used_elems are able to continue their expected order. To differentiate between a "used" and "unused" element on the used_elems array (a "used" element being an element that has returned from processing and an "unused" element being an element that has not yet been processed), we added a boolean 'filled' member to the VirtQueueElement struct. This flag is set to true when the element comes back from processing (virtqueue_ordered_fill) and then set back to false once it's been written to the used/descriptor ring (virtqueue_ordered_flush). --- v1: Move series from RFC to PATCH for submission. 
Jonah Palmer (6): virtio: Add bool to VirtQueueElement virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits virtio: Add VIRTIO_F_IN_ORDER property definition hw/block/vhost-user-blk.c| 1 + hw/net/vhost_net.c | 2 + hw/scsi/vhost-scsi.c | 1 + hw/scsi/vhost-user-scsi.c| 1 + hw/virtio/vhost-user-fs.c| 1 + hw/virtio/vhost-user-vsock.c | 1 + hw/virtio/virtio.c | 118 ++- include/hw/virtio/virtio.h | 5 +- net/vhost-vdpa.c | 1 + 9 files changed, 127 insertions(+), 4 deletions(-) -- 2.39.3
[PATCH 3/6] virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support
Add VIRTIO_F_IN_ORDER feature support for virtqueue_fill operations. The goal of the virtqueue_fill operation when the VIRTIO_F_IN_ORDER feature has been negotiated is to search for this now-used element, set its length, and mark the element as filled in the VirtQueue's used_elems array. By marking the element as filled, it will indicate that this element is ready to be flushed, so long as the element is in-order. Tested-by: Lei Yang Signed-off-by: Jonah Palmer --- hw/virtio/virtio.c | 26 +- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index e6eb1bb453..064046b5e2 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -873,6 +873,28 @@ static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem, vq->used_elems[idx].ndescs = elem->ndescs; } +static void virtqueue_ordered_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ +unsigned int i = vq->used_idx; + +/* Search for element in vq->used_elems */ +while (i != vq->last_avail_idx) { +/* Found element, set length and mark as filled */ +if (vq->used_elems[i].index == elem->index) { +vq->used_elems[i].len = len; +vq->used_elems[i].filled = true; +break; +} + +i += vq->used_elems[i].ndescs; + +if (i >= vq->vring.num) { +i -= vq->vring.num; +} +} +} + static void virtqueue_packed_fill_desc(VirtQueue *vq, const VirtQueueElement *elem, unsigned int idx, @@ -923,7 +945,9 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, return; } -if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { +if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { +virtqueue_ordered_fill(vq, elem, len); +} else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_fill(vq, elem, len, idx); } else { virtqueue_split_fill(vq, elem, len, idx); -- 2.39.3
[PATCH 1/6] virtio: Add bool to VirtQueueElement
Add the boolean 'filled' member to the VirtQueueElement structure. The use of this boolean will signify if the element has been written to the used / descriptor ring or not. This boolean is used to support the VIRTIO_F_IN_ORDER feature. Tested-by: Lei Yang Signed-off-by: Jonah Palmer --- include/hw/virtio/virtio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7d5ffdc145..9ed9c3763c 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -69,6 +69,7 @@ typedef struct VirtQueueElement unsigned int ndescs; unsigned int out_num; unsigned int in_num; +bool filled; hwaddr *in_addr; hwaddr *out_addr; struct iovec *in_sg; -- 2.39.3
[PATCH 6/6] virtio: Add VIRTIO_F_IN_ORDER property definition
Extend the virtio device property definitions to include the VIRTIO_F_IN_ORDER feature. The default state of this feature is disabled, allowing it to be explicitly enabled where it's supported. Tested-by: Lei Yang Acked-by: Eugenio Pérez Signed-off-by: Jonah Palmer --- include/hw/virtio/virtio.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 9ed9c3763c..30c23400e3 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -370,7 +370,9 @@ typedef struct VirtIORNGConf VirtIORNGConf; DEFINE_PROP_BIT64("packed", _state, _field, \ VIRTIO_F_RING_PACKED, false), \ DEFINE_PROP_BIT64("queue_reset", _state, _field, \ - VIRTIO_F_RING_RESET, true) + VIRTIO_F_RING_RESET, true), \ +DEFINE_PROP_BIT64("in_order", _state, _field, \ + VIRTIO_F_IN_ORDER, false) hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n); bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n); -- 2.39.3
[PATCH 2/6] virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support
Add VIRTIO_F_IN_ORDER feature support in virtqueue_split_pop and virtqueue_packed_pop. VirtQueueElements popped from the available/descriptor ring are added to the VirtQueue's used_elems array in-order and in the same fashion as they would be added to the used and descriptor rings, respectively. This will allow us to keep track of the current order, what elements have been written, as well as an element's essential data after being processed. Tested-by: Lei Yang Signed-off-by: Jonah Palmer --- hw/virtio/virtio.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 893a072c9d..e6eb1bb453 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1506,7 +1506,7 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) { -unsigned int i, head, max; +unsigned int i, j, head, max; VRingMemoryRegionCaches *caches; MemoryRegionCache indirect_desc_cache; MemoryRegionCache *desc_cache; @@ -1539,6 +1539,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) goto done; } +j = vq->last_avail_idx; + if (!virtqueue_get_head(vq, vq->last_avail_idx++, )) { goto done; } @@ -1630,6 +1632,12 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) elem->in_sg[i] = iov[out_num + i]; } +if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { +vq->used_elems[j].index = elem->index; +vq->used_elems[j].len = elem->len; +vq->used_elems[j].ndescs = elem->ndescs; +} + vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); @@ -1758,6 +1766,13 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) elem->index = id; elem->ndescs = (desc_cache == _desc_cache) ? 
1 : elem_entries; + +if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { +vq->used_elems[vq->last_avail_idx].index = elem->index; +vq->used_elems[vq->last_avail_idx].len = elem->len; +vq->used_elems[vq->last_avail_idx].ndescs = elem->ndescs; +} + vq->last_avail_idx += elem->ndescs; vq->inuse += elem->ndescs; -- 2.39.3
[PATCH 4/6] virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support
Add VIRTIO_F_IN_ORDER feature support for virtqueue_flush operations. The goal of the virtqueue_flush operation when the VIRTIO_F_IN_ORDER feature has been negotiated is to write elements to the used/descriptor ring in-order and then update used_idx. The function iterates through the VirtQueueElement used_elems array in-order starting at vq->used_idx. If the element is valid (filled), the element is written to the used/descriptor ring. This process continues until we find an invalid (not filled) element. If any elements were written, the used_idx is updated. Tested-by: Lei Yang Signed-off-by: Jonah Palmer --- hw/virtio/virtio.c | 75 +- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 064046b5e2..0efed2c88e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1006,6 +1006,77 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) } } +static void virtqueue_ordered_flush(VirtQueue *vq) +{ +unsigned int i = vq->used_idx; +unsigned int ndescs = 0; +uint16_t old = vq->used_idx; +bool packed; +VRingUsedElem uelem; + +packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED); + +if (packed) { +if (unlikely(!vq->vring.desc)) { +return; +} +} else if (unlikely(!vq->vring.used)) { +return; +} + +/* First expected in-order element isn't ready, nothing to do */ +if (!vq->used_elems[i].filled) { +return; +} + +/* Write first expected in-order element to used ring (split VQs) */ +if (!packed) { +uelem.id = vq->used_elems[i].index; +uelem.len = vq->used_elems[i].len; +vring_used_write(vq, , i); +} + +ndescs += vq->used_elems[i].ndescs; +i += ndescs; +if (i >= vq->vring.num) { +i -= vq->vring.num; +} + +/* Search for more filled elements in-order */ +while (vq->used_elems[i].filled) { +if (packed) { +virtqueue_packed_fill_desc(vq, >used_elems[i], ndescs, false); +} else { +uelem.id = vq->used_elems[i].index; +uelem.len = vq->used_elems[i].len; +vring_used_write(vq, , i); +} + 
+vq->used_elems[i].filled = false; +ndescs += vq->used_elems[i].ndescs; +i += ndescs; +if (i >= vq->vring.num) { +i -= vq->vring.num; +} +} + +if (packed) { +virtqueue_packed_fill_desc(vq, >used_elems[vq->used_idx], 0, true); +vq->used_idx += ndescs; +if (vq->used_idx >= vq->vring.num) { +vq->used_idx -= vq->vring.num; +vq->used_wrap_counter ^= 1; +vq->signalled_used_valid = false; +} +} else { +vring_used_idx_set(vq, i); +if (unlikely((int16_t)(i - vq->signalled_used) < (uint16_t)(i - old))) { +vq->signalled_used_valid = false; +} +} +vq->inuse -= ndescs; +} + void virtqueue_flush(VirtQueue *vq, unsigned int count) { if (virtio_device_disabled(vq->vdev)) { @@ -1013,7 +1084,9 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) return; } -if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { +if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { +virtqueue_ordered_flush(vq); +} else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_flush(vq, count); } else { virtqueue_split_flush(vq, count); -- 2.39.3