Re: [PATCH v3 8/9] mips/loongson3_virt: do not require CONFIG_USB

2024-02-14 Thread Philippe Mathieu-Daudé

On 13/2/24 16:50, Paolo Bonzini wrote:

Once the Kconfig for hw/mips is cleaned up, it will be possible to build a
binary that does not include any USB host controller and therefore that
does not include the code guarded by CONFIG_USB.  While the simpler
creation functions such as usb_create_simple can be inlined, this is not
true of usb_bus_find().  Remove it, replacing it with a search of the
single USB bus created by loongson3_virt_devices_init().

Signed-off-by: Paolo Bonzini 
---
  hw/mips/loongson3_virt.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/mips/loongson3_virt.c b/hw/mips/loongson3_virt.c
index caedde2df00..bedd3d496bd 100644
--- a/hw/mips/loongson3_virt.c
+++ b/hw/mips/loongson3_virt.c
@@ -447,8 +447,9 @@ static inline void loongson3_virt_devices_init(MachineState *machine,
  
  if (defaults_enabled() && object_class_by_name("pci-ohci")) {

  pci_create_simple(pci_bus, -1, "pci-ohci");
-usb_create_simple(usb_bus_find(-1), "usb-kbd");
-usb_create_simple(usb_bus_find(-1), "usb-tablet");
+Object *usb_bus = object_resolve_path_type("", TYPE_USB_BUS, NULL);
+usb_create_simple(USB_BUS(usb_bus), "usb-kbd");
+usb_create_simple(USB_BUS(usb_bus), "usb-tablet");
  }
  
  pci_init_nic_devices(pci_bus, mc->default_nic);


Can we remove usb_bus_find() completely instead?

$ git grep -w usb_bus_find
hw/hppa/machine.c:401:usb_create_simple(usb_bus_find(-1), "usb-kbd");
hw/hppa/machine.c:402:usb_create_simple(usb_bus_find(-1), "usb-mouse");
hw/mips/loongson3_virt.c:450:usb_create_simple(usb_bus_find(-1), "usb-kbd");
hw/mips/loongson3_virt.c:451:usb_create_simple(usb_bus_find(-1), "usb-tablet");

hw/ppc/mac_newworld.c:434:USBBus *usb_bus = usb_bus_find(-1);
hw/ppc/sam460ex.c:423:usb_create_simple(usb_bus_find(-1), "usb-kbd");
hw/ppc/sam460ex.c:424:usb_create_simple(usb_bus_find(-1), "usb-mouse");
hw/ppc/spapr.c:3027:USBBus *usb_bus = usb_bus_find(-1);
hw/sh4/r2d.c:315:usb_create_simple(usb_bus_find(-1), "usb-kbd");
hw/usb/bus.c:103:USBBus *usb_bus_find(int busnr)
hw/usb/bus.c:669:USBBus *bus = usb_bus_find(-1 /* any */);
include/hw/usb.h:500:USBBus *usb_bus_find(int busnr);
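
Each of these call sites runs on a machine that creates a single USB bus, so
they could presumably all switch to the same QOM lookup this patch open-codes
for loongson3_virt. A rough, untested sketch of the per-caller pattern (not
part of this patch):

    /* Resolve the single USB bus by QOM type instead of usb_bus_find().
     * Assumes the machine creates exactly one TYPE_USB_BUS instance. */
    Object *obj = object_resolve_path_type("", TYPE_USB_BUS, NULL);
    USBBus *usb_bus = obj ? USB_BUS(obj) : NULL;

    if (usb_bus) {
        usb_create_simple(usb_bus, "usb-kbd");
    }

Once every caller is converted, usb_bus_find() and its prototype in
include/hw/usb.h could be dropped as well.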




[PATCH v3 0/2] UART0 device name and fix hardcode boot address 0

2024-02-14 Thread Jamin Lin via
v1:
1. support UART controllers with both 0-based and 1-based numbering
2. fix hardcode boot address 0

v2:
1. introduce a new UART0 device name
2. remove the ASPEED_SOC_SPI_BOOT_ADDR macro

v3:
1. add UART helper functions to get the index, first and last UART.
2. add more description in commit log

Jamin Lin (2):
  aspeed: introduce a new UART0 device name
  aspeed: fix hardcode boot address 0

 hw/arm/aspeed.c | 17 +++--
 hw/arm/aspeed_ast10x0.c |  1 +
 hw/arm/aspeed_ast2400.c |  6 --
 hw/arm/aspeed_ast2600.c |  3 ++-
 hw/arm/aspeed_soc_common.c  | 10 ++
 include/hw/arm/aspeed_soc.h | 19 +--
 6 files changed, 41 insertions(+), 15 deletions(-)

-- 
2.25.1




[PATCH v3 2/2] aspeed: fix hardcode boot address 0

2024-02-14 Thread Jamin Lin via
In the previous design of the ASPEED SoC QEMU models, the boot address
was hardcoded to "0" for ast10x0, ast2600, ast2500 and ast2400.

The ast2700 design has a boot MCU (riscv-32) which executes the SPL,
initializes the DRAM and copies the u-boot image from SPI/flash to DRAM
at address 0x4 during the SPL boot stage.
Then the CPUs (cortex-a35) execute u-boot, the kernel and rofs.

Currently, QEMU cannot emulate two CPU architectures in the same
machine. Therefore, QEMU will only emulate the CPU (cortex-a35) side of
the ast2700, whose boot address is "0x4 ".

Drop the hardcoded boot address "0" so that future models can use a
different mapping address.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/arm/aspeed.c | 4 +++-
 hw/arm/aspeed_ast2400.c | 4 ++--
 hw/arm/aspeed_ast2600.c | 2 +-
 include/hw/arm/aspeed_soc.h | 2 --
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index aa165d583b..9fec245e4e 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -289,12 +289,14 @@ static void aspeed_install_boot_rom(AspeedMachineState *bmc, BlockBackend *blk,
 uint64_t rom_size)
 {
 AspeedSoCState *soc = bmc->soc;
+AspeedSoCClass *sc = ASPEED_SOC_GET_CLASS(soc);
 
 memory_region_init_rom(&soc->boot_rom, NULL, "aspeed.boot_rom", rom_size,
&error_abort);
 memory_region_add_subregion_overlap(&soc->spi_boot_container, 0,
 &soc->boot_rom, 1);
-write_boot_rom(blk, ASPEED_SOC_SPI_BOOT_ADDR, rom_size, &error_abort);
+write_boot_rom(blk, sc->memmap[ASPEED_DEV_SPI_BOOT],
+   rom_size, &error_abort);
 }
 
 void aspeed_board_init_flashes(AspeedSMCState *s, const char *flashtype,
diff --git a/hw/arm/aspeed_ast2400.c b/hw/arm/aspeed_ast2400.c
index 95da85fee0..d125886207 100644
--- a/hw/arm/aspeed_ast2400.c
+++ b/hw/arm/aspeed_ast2400.c
@@ -26,7 +26,7 @@
 #define ASPEED_SOC_IOMEM_SIZE   0x0020
 
 static const hwaddr aspeed_soc_ast2400_memmap[] = {
-[ASPEED_DEV_SPI_BOOT]  =  ASPEED_SOC_SPI_BOOT_ADDR,
+[ASPEED_DEV_SPI_BOOT]  = 0x,
 [ASPEED_DEV_IOMEM]  = 0x1E60,
 [ASPEED_DEV_FMC]= 0x1E62,
 [ASPEED_DEV_SPI1]   = 0x1E63,
@@ -61,7 +61,7 @@ static const hwaddr aspeed_soc_ast2400_memmap[] = {
 };
 
 static const hwaddr aspeed_soc_ast2500_memmap[] = {
-[ASPEED_DEV_SPI_BOOT]  = ASPEED_SOC_SPI_BOOT_ADDR,
+[ASPEED_DEV_SPI_BOOT]  = 0x,
 [ASPEED_DEV_IOMEM]  = 0x1E60,
 [ASPEED_DEV_FMC]= 0x1E62,
 [ASPEED_DEV_SPI1]   = 0x1E63,
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index f74561ecdc..174be53770 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -22,7 +22,7 @@
 #define ASPEED_SOC_DPMCU_SIZE   0x0004
 
 static const hwaddr aspeed_soc_ast2600_memmap[] = {
-[ASPEED_DEV_SPI_BOOT]  = ASPEED_SOC_SPI_BOOT_ADDR,
+[ASPEED_DEV_SPI_BOOT]  = 0x,
 [ASPEED_DEV_SRAM]  = 0x1000,
 [ASPEED_DEV_DPMCU] = 0x1800,
 /* 0x1600 0x17FF : AHB BUS do LPC Bus bridge */
diff --git a/include/hw/arm/aspeed_soc.h b/include/hw/arm/aspeed_soc.h
index e1a023be53..c60fac900a 100644
--- a/include/hw/arm/aspeed_soc.h
+++ b/include/hw/arm/aspeed_soc.h
@@ -224,8 +224,6 @@ enum {
 ASPEED_DEV_FSI2,
 };
 
-#define ASPEED_SOC_SPI_BOOT_ADDR 0x0
-
 qemu_irq aspeed_soc_get_irq(AspeedSoCState *s, int dev);
 bool aspeed_soc_uart_realize(AspeedSoCState *s, Error **errp);
 void aspeed_soc_uart_set_chr(AspeedSoCState *s, int dev, Chardev *chr);
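
With the boot address now taken from sc->memmap[], a future SoC model only
has to provide its own ASPEED_DEV_SPI_BOOT entry. A hypothetical sketch of
an ast2700-style memmap (the address below is an assumption based on the
commit message, not part of this patch):

    /* Hypothetical example: CPUs that boot from a non-zero address. */
    static const hwaddr aspeed_soc_ast2700_memmap[] = {
        [ASPEED_DEV_SPI_BOOT] = 0x400000000ULL, /* assumed, see commit log */
        /* ... */
    };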
-- 
2.25.1




[PATCH v3 1/2] aspeed: introduce a new UART0 device name

2024-02-14 Thread Jamin Lin via
The Aspeed datasheet refers to the UART controllers
as UART1 - UART13 for the ast10x0, ast2600, ast2500
and ast2400 SoCs. The Aspeed ast2700 introduces a UART0
and numbers its UART controllers UART0 - UART12.

To keep the naming in the QEMU models in sync with the
datasheet, let's introduce a new UART0 device name and
do the required adjustments.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/arm/aspeed.c | 13 -
 hw/arm/aspeed_ast10x0.c |  1 +
 hw/arm/aspeed_ast2400.c |  2 ++
 hw/arm/aspeed_ast2600.c |  1 +
 hw/arm/aspeed_soc_common.c  | 10 ++
 include/hw/arm/aspeed_soc.h | 17 +
 6 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 09b1e823ba..aa165d583b 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -342,7 +342,7 @@ static void connect_serial_hds_to_uarts(AspeedMachineState *bmc)
 int uart_chosen = bmc->uart_chosen ? bmc->uart_chosen : amc->uart_default;
 
 aspeed_soc_uart_set_chr(s, uart_chosen, serial_hd(0));
-for (int i = 1, uart = ASPEED_DEV_UART1; i < sc->uarts_num; i++, uart++) {
+for (int i = 0, uart = sc->uarts_base; i < sc->uarts_num; i++, uart++) {
 if (uart == uart_chosen) {
 continue;
 }
@@ -1094,7 +1094,7 @@ static char *aspeed_get_bmc_console(Object *obj, Error **errp)
 AspeedMachineClass *amc = ASPEED_MACHINE_GET_CLASS(bmc);
 int uart_chosen = bmc->uart_chosen ? bmc->uart_chosen : amc->uart_default;
 
-return g_strdup_printf("uart%d", uart_chosen - ASPEED_DEV_UART1 + 1);
+return g_strdup_printf("uart%d", aspeed_uart_index(uart_chosen));
 }
 
 static void aspeed_set_bmc_console(Object *obj, const char *value, Error **errp)
@@ -1103,6 +1103,8 @@ static void aspeed_set_bmc_console(Object *obj, const char *value, Error **errp)
 AspeedMachineClass *amc = ASPEED_MACHINE_GET_CLASS(bmc);
 AspeedSoCClass *sc = ASPEED_SOC_CLASS(object_class_by_name(amc->soc_name));
 int val;
+int uart_first = aspeed_uart_first(sc);
+int uart_last = aspeed_uart_last(sc);
 
 if (sscanf(value, "uart%u", &val) != 1) {
 error_setg(errp, "Bad value for \"uart\" property");
@@ -1110,11 +1112,12 @@ static void aspeed_set_bmc_console(Object *obj, const char *value, Error **errp)
 }
 
 /* The number of UART depends on the SoC */
-if (val < 1 || val > sc->uarts_num) {
-error_setg(errp, "\"uart\" should be in range [1 - %d]", sc->uarts_num);
+if (val < uart_first || val > uart_last) {
+error_setg(errp, "\"uart\" should be in range [%d - %d]",
+   uart_first, uart_last);
 return;
 }
-bmc->uart_chosen = ASPEED_DEV_UART1 + val - 1;
+bmc->uart_chosen = val + ASPEED_DEV_UART0;
 }
 
 static void aspeed_machine_class_props_init(ObjectClass *oc)
diff --git a/hw/arm/aspeed_ast10x0.c b/hw/arm/aspeed_ast10x0.c
index c3b5116a6a..2634e0f654 100644
--- a/hw/arm/aspeed_ast10x0.c
+++ b/hw/arm/aspeed_ast10x0.c
@@ -436,6 +436,7 @@ static void aspeed_soc_ast1030_class_init(ObjectClass *klass, void *data)
 sc->wdts_num = 4;
 sc->macs_num = 1;
 sc->uarts_num = 13;
+sc->uarts_base = ASPEED_DEV_UART1;
 sc->irqmap = aspeed_soc_ast1030_irqmap;
 sc->memmap = aspeed_soc_ast1030_memmap;
 sc->num_cpus = 1;
diff --git a/hw/arm/aspeed_ast2400.c b/hw/arm/aspeed_ast2400.c
index 8829561bb6..95da85fee0 100644
--- a/hw/arm/aspeed_ast2400.c
+++ b/hw/arm/aspeed_ast2400.c
@@ -523,6 +523,7 @@ static void aspeed_soc_ast2400_class_init(ObjectClass *oc, void *data)
 sc->wdts_num = 2;
 sc->macs_num = 2;
 sc->uarts_num= 5;
+sc->uarts_base   = ASPEED_DEV_UART1;
 sc->irqmap   = aspeed_soc_ast2400_irqmap;
 sc->memmap   = aspeed_soc_ast2400_memmap;
 sc->num_cpus = 1;
@@ -551,6 +552,7 @@ static void aspeed_soc_ast2500_class_init(ObjectClass *oc, void *data)
 sc->wdts_num = 3;
 sc->macs_num = 2;
 sc->uarts_num= 5;
+sc->uarts_base   = ASPEED_DEV_UART1;
 sc->irqmap   = aspeed_soc_ast2500_irqmap;
 sc->memmap   = aspeed_soc_ast2500_memmap;
 sc->num_cpus = 1;
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index 4ee32ea99d..f74561ecdc 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -666,6 +666,7 @@ static void aspeed_soc_ast2600_class_init(ObjectClass *oc, void *data)
 sc->wdts_num = 4;
 sc->macs_num = 4;
 sc->uarts_num= 13;
+sc->uarts_base   = ASPEED_DEV_UART1;
 sc->irqmap   = aspeed_soc_ast2600_irqmap;
 sc->memmap   = aspeed_soc_ast2600_memmap;
 sc->num_cpus = 2;
diff --git a/hw/arm/aspeed_soc_common.c b/hw/arm/aspeed_soc_common.c
index 123a0c432c..95d0c0aba9 100644
--- a/hw/arm/aspeed_soc_common.c
+++ b/hw/arm/aspeed_soc_common.c
@@ -36,7 +36,7 @@ bool aspeed_soc_uart_realize(AspeedSoCState *s, Error **errp)
 AspeedSoCClass *sc = ASPEED_SOC_GET_CLASS(s);
 

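For reference, the helpers used above (aspeed_uart_index(), aspeed_uart_first()
and aspeed_uart_last()) are thin wrappers around the new uarts_base field and
the existing uarts_num; a rough sketch of their expected shape, assuming the
ASPEED_DEV_UART0 enum value and the AspeedSoCClass fields from this patch
(a sketch only, not the literal hunk):

    static inline int aspeed_uart_index(int uart_dev)
    {
        return uart_dev - ASPEED_DEV_UART0;
    }

    static inline int aspeed_uart_first(AspeedSoCClass *sc)
    {
        return aspeed_uart_index(sc->uarts_base);
    }

    static inline int aspeed_uart_last(AspeedSoCClass *sc)
    {
        return aspeed_uart_first(sc) + sc->uarts_num - 1;
    }
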

Re: [PATCH v4 36/36] linux-user: Remove pgb_dynamic alignment assertion

2024-02-14 Thread Richard Henderson

On 2/14/24 21:07, Philippe Mathieu-Daudé wrote:

On 15/2/24 07:20, Richard Henderson wrote:

The assertion was never correct, because the alignment is a composite
of the image alignment and SHMLBA.  Even if the alignment didn't match
the image an assertion would not be correct -- more appropriate would
be an error message about an ill formed image.  But the image cannot
be held to SHMLBA under any circumstances.

Fixes: ee94743034b ("linux-user: completely re-write init_guest_space")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2157
Signed-off-by: Richard Henderson 
---
  linux-user/elfload.c | 2 --
  1 file changed, 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index f3f1ab4f69..d92d66ca1e 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3022,8 +3022,6 @@ static void pgb_dynamic(const char *image_name, uintptr_t guest_loaddr,

  uintptr_t brk, ret;
  PGBAddrs ga;
-    assert(QEMU_IS_ALIGNED(guest_loaddr, align));
-
  /* Try the identity map first. */
   if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) {
  brk = (uintptr_t)sbrk(0);


I suppose this isn't part of this series since posted as
https://lore.kernel.org/qemu-devel/20240214045413.541677-1-richard.hender...@linaro.org/

Still:
Reported-by: Alexey Sheplyakov 
Reviewed-by: Philippe Mathieu-Daudé 


Whoops, no.  But I did need it for testing on ppc64 64k page host.


r~



Re: [PATCH v2] hw/arm/smmuv3: add support for stage 1 access fault

2024-02-14 Thread Eric Auger
Hi Luc,
On 2/13/24 09:22, Luc Michel wrote:
> An access fault is raised when the Access Flag is not set in the
> looked-up PTE and the AFFD field is not set in the corresponding context
> descriptor. This was already implemented for stage 2. Implement it for
> stage 1 as well.
>
> Signed-off-by: Luc Michel 
> ---
>
> v2: drop erroneous submodule modification
>
> ---
>
>  hw/arm/smmuv3-internal.h |  1 +
>  include/hw/arm/smmu-common.h |  1 +
>  hw/arm/smmu-common.c | 10 ++
>  hw/arm/smmuv3.c  |  1 +
>  4 files changed, 13 insertions(+)
>
> diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
> index e987bc4686b..e4dd11e1e62 100644
> --- a/hw/arm/smmuv3-internal.h
> +++ b/hw/arm/smmuv3-internal.h
> @@ -622,10 +622,11 @@ static inline int pa_range(STE *ste)
>  #define CD_TSZ(x, sel)   extract32((x)->word[0], (16 * (sel)) + 0, 6)
>  #define CD_TG(x, sel)extract32((x)->word[0], (16 * (sel)) + 6, 2)
>  #define CD_EPD(x, sel)   extract32((x)->word[0], (16 * (sel)) + 14, 1)
>  #define CD_ENDI(x)   extract32((x)->word[0], 15, 1)
>  #define CD_IPS(x)extract32((x)->word[1], 0 , 3)
> +#define CD_AFFD(x)   extract32((x)->word[1], 3 , 1)
>  #define CD_TBI(x)extract32((x)->word[1], 6 , 2)
>  #define CD_HD(x) extract32((x)->word[1], 10 , 1)
>  #define CD_HA(x) extract32((x)->word[1], 11 , 1)
>  #define CD_S(x)  extract32((x)->word[1], 12, 1)
>  #define CD_R(x)  extract32((x)->word[1], 13, 1)
> diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
> index fd8d772da11..5ec2e6c1a43 100644
> --- a/include/hw/arm/smmu-common.h
> +++ b/include/hw/arm/smmu-common.h
> @@ -90,10 +90,11 @@ typedef struct SMMUTransCfg {
>  /* Shared fields between stage-1 and stage-2. */
>  int stage; /* translation stage */
>  bool disabled; /* smmu is disabled */
>  bool bypassed; /* translation is bypassed */
>  bool aborted;  /* translation is aborted */
> +bool affd; /* AF fault disable */
>  uint32_t iotlb_hits;   /* counts IOTLB hits */
>  uint32_t iotlb_misses; /* counts IOTLB misses*/
>  /* Used by stage-1 only. */
>  bool aa64; /* arch64 or aarch32 translation table */
>  bool record_faults;/* record fault events */
> diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> index 9a8ac45431a..09ff72e55f5 100644
> --- a/hw/arm/smmu-common.c
> +++ b/hw/arm/smmu-common.c
> @@ -362,10 +362,20 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
>  &block_size);
>  trace_smmu_ptw_block_pte(stage, level, baseaddr,
>   pte_addr, pte, iova, gpa,
>   block_size >> 20);
>  }
> +
> +/*
> + * If AFFD and PTE.AF are 0 => fault. (5.4. Context Descriptor)
> + * An Access fault takes priority over a Permission fault.
nit: you may want to spell out that this holds because HTTU is not currently supported
> + */
> +if (!PTE_AF(pte) && !cfg->affd) {
> +info->type = SMMU_PTW_ERR_ACCESS;
> +goto error;
> +}
> +
>  ap = PTE_AP(pte);
>  if (is_permission_fault(ap, perm)) {
>  info->type = SMMU_PTW_ERR_PERMISSION;
>  goto error;
>  }
> diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> index 68eeef3e1d4..c416b8c0030 100644
> --- a/hw/arm/smmuv3.c
> +++ b/hw/arm/smmuv3.c
> @@ -682,10 +682,11 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
>  
>  cfg->oas = oas2bits(CD_IPS(cd));
>  cfg->oas = MIN(oas2bits(SMMU_IDR5_OAS), cfg->oas);
>  cfg->tbi = CD_TBI(cd);
>  cfg->asid = CD_ASID(cd);
> +cfg->affd = CD_AFFD(cd);
>  
>  trace_smmuv3_decode_cd(cfg->oas);
>  
>  /* decode data dependent on TT */
>  for (i = 0; i <= 1; i++) {
Besides, looks good to me
Reviewed-by: Eric Auger 
Eric
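
If it helps, a possible expanded wording for that comment, folding in the HTTU
point from the nit above (a suggestion only, not taken from the patch):

    /*
     * If AFFD and PTE.AF are 0 => fault. (5.4. Context Descriptor)
     * An Access fault takes priority over a Permission fault here;
     * this holds because HTTU (hardware update of the Access flag)
     * is not currently supported.
     */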





Re: [PATCH v4 36/36] linux-user: Remove pgb_dynamic alignment assertion

2024-02-14 Thread Philippe Mathieu-Daudé

On 15/2/24 07:20, Richard Henderson wrote:

The assertion was never correct, because the alignment is a composite
of the image alignment and SHMLBA.  Even if the alignment didn't match
the image an assertion would not be correct -- more appropriate would
be an error message about an ill formed image.  But the image cannot
be held to SHMLBA under any circumstances.

Fixes: ee94743034b ("linux-user: completely re-write init_guest_space")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2157
Signed-off-by: Richard Henderson 
---
  linux-user/elfload.c | 2 --
  1 file changed, 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index f3f1ab4f69..d92d66ca1e 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3022,8 +3022,6 @@ static void pgb_dynamic(const char *image_name, uintptr_t guest_loaddr,
  uintptr_t brk, ret;
  PGBAddrs ga;
  
-assert(QEMU_IS_ALIGNED(guest_loaddr, align));

-
  /* Try the identity map first. */
   if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) {
  brk = (uintptr_t)sbrk(0);


I suppose this isn't part of this series since posted as
https://lore.kernel.org/qemu-devel/20240214045413.541677-1-richard.hender...@linaro.org/

Still:
Reported-by: Alexey Sheplyakov 
Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH v4 22/36] linux-user: Use do_munmap for target_mmap failure

2024-02-14 Thread Philippe Mathieu-Daudé

On 15/2/24 07:20, Richard Henderson wrote:

For the cases for which the host mmap succeeds, but does
not yield the desired address, use do_munmap to restore
the reserved_va memory reservation.

Signed-off-by: Richard Henderson 
---
  linux-user/mmap.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH v4 21/36] linux-user: Split out do_munmap

2024-02-14 Thread Philippe Mathieu-Daudé

On 15/2/24 07:20, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  linux-user/mmap.c | 23 ---
  1 file changed, 16 insertions(+), 7 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] linux-user: Remove pgb_dynamic alignment assertion

2024-02-14 Thread Philippe Mathieu-Daudé

On 14/2/24 05:54, Richard Henderson wrote:

The assertion was never correct, because the alignment is a composite
of the image alignment and SHMLBA.  Even if the alignment didn't match
the image an assertion would not be correct -- more appropriate would
be an error message about an ill formed image.  But the image cannot
be held to SHMLBA under any circumstances.


Reported-by: Alexey Sheplyakov 
Reviewed-by: Philippe Mathieu-Daudé 


Fixes: ee94743034b ("linux-user: completely re-write init_guest_space")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2157
Signed-off-by: Richard Henderson 
---
  linux-user/elfload.c | 2 --
  1 file changed, 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index f3f1ab4f69..d92d66ca1e 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3022,8 +3022,6 @@ static void pgb_dynamic(const char *image_name, uintptr_t guest_loaddr,
  uintptr_t brk, ret;
  PGBAddrs ga;
  
-assert(QEMU_IS_ALIGNED(guest_loaddr, align));

-
  /* Try the identity map first. */
   if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) {
  brk = (uintptr_t)sbrk(0);





Re: [PATCH 04/12] vdpa: factor out vhost_vdpa_net_get_nc_vdpa

2024-02-14 Thread Eugenio Perez Martin
On Wed, Feb 14, 2024 at 9:59 PM Si-Wei Liu  wrote:
>
>
>
> On 2/14/2024 10:54 AM, Eugenio Perez Martin wrote:
> > On Wed, Feb 14, 2024 at 1:39 PM Si-Wei Liu  wrote:
> >> Introduce new API. No functional change on existing API.
> >>
> >> Acked-by: Jason Wang 
> >> Signed-off-by: Si-Wei Liu 
> > I'm ok with the new function, but doesn't the compiler complain
> > if an added static function is not used?
> Hmmm, which one? vhost_vdpa_net_get_nc_vdpa is used by
> vhost_vdpa_net_first_nc_vdpa internally, and
> vhost_vdpa_net_first_nc_vdpa is used by vhost_vdpa_net_cvq_start (Patch
> 01). I think we should be fine?
>

Ouch, you're totally right.

Reviewed-by: Eugenio Pérez 

Thanks!

> -Siwei
> >
> >> ---
> >>   net/vhost-vdpa.c | 13 +
> >>   1 file changed, 9 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> >> index 06c83b4..4168cad 100644
> >> --- a/net/vhost-vdpa.c
> >> +++ b/net/vhost-vdpa.c
> >> @@ -281,13 +281,18 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
> >>   }
> >>
> >>
> >> -/** From any vdpa net client, get the netclient of the first queue pair */
> >> -static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> >> +/** From any vdpa net client, get the netclient of the i-th queue pair */
> >> +static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
> >>   {
> >>   NICState *nic = qemu_get_nic(s->nc.peer);
> >> -NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
> >> +NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
> >> +
> >> +return DO_UPCAST(VhostVDPAState, nc, nc_i);
> >> +}
> >>
> >> -return DO_UPCAST(VhostVDPAState, nc, nc0);
> >> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> >> +{
> >> +return vhost_vdpa_net_get_nc_vdpa(s, 0);
> >>   }
> >>
> >>   static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
> >> --
> >> 1.8.3.1
> >>
>




[PATCH v4 17/36] linux-user: Move some mmap checks outside the lock

2024-02-14 Thread Richard Henderson
Basic validation of operands does not require the lock.
Hoist them from target_mmap__locked back into target_mmap.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-18-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 107 +++---
 1 file changed, 53 insertions(+), 54 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index b4c3cc65aa..fbaea832c5 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -491,52 +491,14 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align)
 }
 
 static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
-int target_prot, int flags,
+int target_prot, int flags, int page_flags,
 int fd, off_t offset)
 {
 int host_page_size = qemu_real_host_page_size();
 abi_ulong ret, last, real_start, real_last, retaddr, host_len;
 abi_ulong passthrough_start = -1, passthrough_last = 0;
-int page_flags;
 off_t host_offset;
 
-if (!len) {
-errno = EINVAL;
-return -1;
-}
-
-page_flags = validate_prot_to_pageflags(target_prot);
-if (!page_flags) {
-errno = EINVAL;
-return -1;
-}
-
-/* Also check for overflows... */
-len = TARGET_PAGE_ALIGN(len);
-if (!len) {
-errno = ENOMEM;
-return -1;
-}
-
-if (offset & ~TARGET_PAGE_MASK) {
-errno = EINVAL;
-return -1;
-}
-
-/*
- * If we're mapping shared memory, ensure we generate code for parallel
- * execution and flush old translations.  This will work up to the level
- * supported by the host -- anything that requires EXCP_ATOMIC will not
- * be atomic with respect to an external process.
- */
-if (flags & MAP_SHARED) {
-CPUState *cpu = thread_cpu;
-if (!(cpu->tcg_cflags & CF_PARALLEL)) {
-cpu->tcg_cflags |= CF_PARALLEL;
-tb_flush(cpu);
-}
-}
-
 real_start = start & -host_page_size;
 host_offset = offset & -host_page_size;
 
@@ -616,23 +578,9 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 passthrough_start = start;
 passthrough_last = last;
 } else {
-if (start & ~TARGET_PAGE_MASK) {
-errno = EINVAL;
-return -1;
-}
 last = start + len - 1;
 real_last = ROUND_UP(last, host_page_size) - 1;
 
-/*
- * Test if requested memory area fits target address space
- * It can fail only on 64-bit host with 32-bit target.
- * On any other target/host host mmap() handles this error correctly.
- */
-if (last < start || !guest_range_valid_untagged(start, len)) {
-errno = ENOMEM;
-return -1;
-}
-
 if (flags & MAP_FIXED_NOREPLACE) {
 /* Validate that the chosen range is empty. */
 if (!page_check_range_empty(start, last)) {
@@ -778,13 +726,64 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
  int flags, int fd, off_t offset)
 {
 abi_long ret;
+int page_flags;
 
 trace_target_mmap(start, len, target_prot, flags, fd, offset);
+
+if (!len) {
+errno = EINVAL;
+return -1;
+}
+
+page_flags = validate_prot_to_pageflags(target_prot);
+if (!page_flags) {
+errno = EINVAL;
+return -1;
+}
+
+/* Also check for overflows... */
+len = TARGET_PAGE_ALIGN(len);
+if (!len || len != (size_t)len) {
+errno = ENOMEM;
+return -1;
+}
+
+if (offset & ~TARGET_PAGE_MASK) {
+errno = EINVAL;
+return -1;
+}
+if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+if (start & ~TARGET_PAGE_MASK) {
+errno = EINVAL;
+return -1;
+}
+if (!guest_range_valid_untagged(start, len)) {
+errno = ENOMEM;
+return -1;
+}
+}
+
 mmap_lock();
 
-ret = target_mmap__locked(start, len, target_prot, flags, fd, offset);
+ret = target_mmap__locked(start, len, target_prot, flags,
+  page_flags, fd, offset);
 
 mmap_unlock();
+
+/*
+ * If we're mapping shared memory, ensure we generate code for parallel
+ * execution and flush old translations.  This will work up to the level
+ * supported by the host -- anything that requires EXCP_ATOMIC will not
+ * be atomic with respect to an external process.
+ */
+if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) {
+CPUState *cpu = thread_cpu;
+if (!(cpu->tcg_cflags & CF_PARALLEL)) {
+cpu->tcg_cflags |= CF_PARALLEL;
+tb_flush(cpu);
+}
+}
+
 return ret;
 }
 
-- 
2.34.1




[PATCH v4 22/36] linux-user: Use do_munmap for target_mmap failure

2024-02-14 Thread Richard Henderson
For the cases for which the host mmap succeeds, but does
not yield the desired address, use do_munmap to restore
the reserved_va memory reservation.

Signed-off-by: Richard Henderson 
---
 linux-user/mmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 8ebcca..cbcd31e941 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -326,7 +326,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
flags | MAP_ANONYMOUS, -1, 0);
 if (p != host_start) {
 if (p != MAP_FAILED) {
-munmap(p, host_page_size);
+do_munmap(p, host_page_size);
 errno = EEXIST;
 }
 return false;
@@ -622,7 +622,7 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 p = mmap(g2h_untagged(start), len, host_prot,
  flags | MAP_FIXED, fd, host_offset);
 if (p == MAP_FAILED) {
-munmap(g2h_untagged(start), host_len);
+do_munmap(g2h_untagged(start), host_len);
 return -1;
 }
 host_start += offset - host_offset;
@@ -735,7 +735,7 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
  flags, fd, offset1);
 if (p != want_p) {
 if (p != MAP_FAILED) {
-munmap(p, len1);
+do_munmap(p, len1);
 errno = EEXIST;
 }
 return -1;
-- 
2.34.1




[PATCH v4 15/36] linux-user: Remove qemu_host_page_size from main

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size() instead.

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-16-richard.hender...@linaro.org>
---
 linux-user/main.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index 74b2fbb393..e540acb84a 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -781,7 +781,7 @@ int main(int argc, char **argv, char **envp)
 }
 cpu_type = parse_cpu_option(cpu_model);
 
-/* init tcg before creating CPUs and to get qemu_host_page_size */
+/* init tcg before creating CPUs */
 {
 AccelState *accel = current_accel();
 AccelClass *ac = ACCEL_GET_CLASS(accel);
@@ -804,8 +804,10 @@ int main(int argc, char **argv, char **envp)
  */
 max_reserved_va = MAX_RESERVED_VA(cpu);
 if (reserved_va != 0) {
-if ((reserved_va + 1) % qemu_host_page_size) {
-char *s = size_to_str(qemu_host_page_size);
+int host_page_size = qemu_real_host_page_size();
+
+if ((reserved_va + 1) % host_page_size) {
+char *s = size_to_str(host_page_size);
 fprintf(stderr, "Reserved virtual address not aligned mod %s\n", 
s);
 g_free(s);
 exit(EXIT_FAILURE);
@@ -902,7 +904,7 @@ int main(int argc, char **argv, char **envp)
  * If we're in a chroot with no /proc, fall back to 1 page.
  */
 if (mmap_min_addr == 0) {
-mmap_min_addr = qemu_host_page_size;
+mmap_min_addr = qemu_real_host_page_size();
 qemu_log_mask(CPU_LOG_PAGE,
   "host mmap_min_addr=0x%lx (fallback)\n",
   mmap_min_addr);
-- 
2.34.1




[PATCH v4 11/36] migration: Remove qemu_host_page_size

2024-02-14 Thread Richard Henderson
Replace with the maximum of the real host page size
and the target page size.  This is an exact replacement.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-12-richard.hender...@linaro.org>
---
 migration/ram.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4649a81204..61c1488352 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2935,7 +2935,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 {
 RAMState **rsp = opaque;
 RAMBlock *block;
-int ret;
+int ret, max_hg_page_size;
 
 if (compress_threads_save_setup()) {
 return -1;
@@ -2950,6 +2950,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 }
 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
 
+/*
+ * ??? Mirrors the previous value of qemu_host_page_size,
+ * but is this really what was intended for the migration?
+ */
+max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
+
 WITH_RCU_READ_LOCK_GUARD() {
 qemu_put_be64(f, ram_bytes_total_with_ignored()
  | RAM_SAVE_FLAG_MEM_SIZE);
@@ -2958,8 +2964,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 qemu_put_byte(f, strlen(block->idstr));
 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
 qemu_put_be64(f, block->used_length);
-if (migrate_postcopy_ram() && block->page_size !=
-  qemu_host_page_size) {
+if (migrate_postcopy_ram() &&
+block->page_size != max_hg_page_size) {
 qemu_put_be64(f, block->page_size);
 }
 if (migrate_ignore_shared()) {
@@ -3792,6 +3798,7 @@ static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
 int ret = 0;
 /* ADVISE is earlier, it shows the source has the postcopy capability on */
 bool postcopy_advised = migration_incoming_postcopy_advised();
+int max_hg_page_size;
 
 assert(block);
 
@@ -3809,9 +3816,16 @@ static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
 return ret;
 }
 }
+
+/*
+ * ??? Mirrors the previous value of qemu_host_page_size,
+ * but is this really what was intended for the migration?
+ */
+max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
+
 /* For postcopy we need to check hugepage sizes match */
 if (postcopy_advised && migrate_postcopy_ram() &&
-block->page_size != qemu_host_page_size) {
+block->page_size != max_hg_page_size) {
 uint64_t remote_page_size = qemu_get_be64(f);
 if (remote_page_size != block->page_size) {
 error_report("Mismatched RAM page size %s "
-- 
2.34.1




[PATCH v4 20/36] linux-user: Do early mmap placement only for reserved_va

2024-02-14 Thread Richard Henderson
For reserved_va, place all non-fixed maps then proceed
as for MAP_FIXED.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-21-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index cc983bedbd..1bbfeb25b1 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -540,17 +540,19 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 host_offset = offset & -host_page_size;
 
 /*
- * If the user is asking for the kernel to find a location, do that
- * before we truncate the length for mapping files below.
+ * For reserved_va, we are in full control of the allocation.
+ * Find a suitable hole and convert to MAP_FIXED.
  */
-if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
+if (reserved_va && !(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
 host_len = len + offset - host_offset;
-host_len = ROUND_UP(host_len, host_page_size);
-start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
+start = mmap_find_vma(real_start, host_len,
+  MAX(host_page_size, TARGET_PAGE_SIZE));
 if (start == (abi_ulong)-1) {
 errno = ENOMEM;
 return -1;
 }
+start += offset - host_offset;
+flags |= MAP_FIXED;
 }
 
 /*
-- 
2.34.1




[PATCH v4 12/36] hw/tpm: Remove HOST_PAGE_ALIGN from tpm_ppi_init

2024-02-14 Thread Richard Henderson
The size of the allocation need not match the alignment.

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-13-richard.hender...@linaro.org>
---
 hw/tpm/tpm_ppi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c
index 7f74e26ec6..91eeafd53a 100644
--- a/hw/tpm/tpm_ppi.c
+++ b/hw/tpm/tpm_ppi.c
@@ -47,8 +47,7 @@ void tpm_ppi_reset(TPMPPI *tpmppi)
 void tpm_ppi_init(TPMPPI *tpmppi, MemoryRegion *m,
   hwaddr addr, Object *obj)
 {
-tpmppi->buf = qemu_memalign(qemu_real_host_page_size(),
-HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE));
+tpmppi->buf = qemu_memalign(qemu_real_host_page_size(), TPM_PPI_ADDR_SIZE);
 memory_region_init_ram_device_ptr(&tpmppi->ram, obj, "tpm-ppi",
   TPM_PPI_ADDR_SIZE, tpmppi->buf);
 vmstate_register_ram(&tpmppi->ram, DEVICE(obj));
-- 
2.34.1




[PATCH v4 18/36] linux-user: Fix sub-host-page mmap

2024-02-14 Thread Richard Henderson
We cannot skip over the_end1 to the_end, because we fail to
record the validity of the guest page with the interval tree.
Remove "the_end" and rename "the_end1" to "the_end".

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-19-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index fbaea832c5..48fcdd4a32 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -643,7 +643,7 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
target_prot, flags, fd, offset)) {
 return -1;
 }
-goto the_end1;
+goto the_end;
 }
 if (!mmap_frag(real_start, start,
real_start + host_page_size - 1,
@@ -690,7 +690,7 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 passthrough_last = real_last;
 }
 }
- the_end1:
+ the_end:
 if (flags & MAP_ANONYMOUS) {
 page_flags |= PAGE_ANON;
 }
@@ -708,7 +708,6 @@ static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 }
 }
 shm_region_rm_complete(start, last);
- the_end:
 trace_target_mmap_complete(start);
 if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
 FILE *f = qemu_log_trylock();
-- 
2.34.1




[PATCH v4 14/36] softmmu/physmem: Remove HOST_PAGE_ALIGN

2024-02-14 Thread Richard Henderson
Align allocation sizes to the maximum of host and target page sizes.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-15-richard.hender...@linaro.org>
---
 system/physmem.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/system/physmem.c b/system/physmem.c
index 508dcb7494..6b7c5747da 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1680,7 +1680,8 @@ int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
 
 assert(block);
 
-newsize = HOST_PAGE_ALIGN(newsize);
+newsize = TARGET_PAGE_ALIGN(newsize);
+newsize = REAL_HOST_PAGE_ALIGN(newsize);
 
 if (block->used_length == newsize) {
 /*
@@ -1916,7 +1917,9 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
 return NULL;
 }
 
-size = HOST_PAGE_ALIGN(size);
+size = TARGET_PAGE_ALIGN(size);
+size = REAL_HOST_PAGE_ALIGN(size);
+
 file_size = get_file_size(fd);
 if (file_size > offset && file_size < (offset + size)) {
 error_setg(errp, "backing store size 0x%" PRIx64
@@ -2014,13 +2017,17 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
 {
 RAMBlock *new_block;
 Error *local_err = NULL;
+int align;
 
 assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
   RAM_NORESERVE)) == 0);
 assert(!host ^ (ram_flags & RAM_PREALLOC));
 
-size = HOST_PAGE_ALIGN(size);
-max_size = HOST_PAGE_ALIGN(max_size);
+align = qemu_real_host_page_size();
+align = MAX(align, TARGET_PAGE_SIZE);
+size = ROUND_UP(size, align);
+max_size = ROUND_UP(max_size, align);
+
 new_block = g_malloc0(sizeof(*new_block));
 new_block->mr = mr;
 new_block->resized = resized;
-- 
2.34.1




[PATCH v4 30/36] accel/tcg: Disconnect TargetPageDataNode from page size

2024-02-14 Thread Richard Henderson
Dynamically size the node for the runtime target page size.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-29-richard.hender...@linaro.org>
---
 accel/tcg/user-exec.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 69b7429e31..3cac3a78c4 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -864,7 +864,7 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, vaddr addr,
 typedef struct TargetPageDataNode {
 struct rcu_head rcu;
 IntervalTreeNode itree;
-char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
+char data[] __attribute__((aligned));
 } TargetPageDataNode;
 
 static IntervalTreeRoot targetdata_root;
@@ -902,7 +902,8 @@ void page_reset_target_data(target_ulong start, target_ulong last)
 n_last = MIN(last, n->last);
 p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
 
-memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
+memset(t->data + p_ofs * TARGET_PAGE_DATA_SIZE, 0,
+   p_len * TARGET_PAGE_DATA_SIZE);
 }
 }
 
@@ -910,7 +911,7 @@ void *page_get_target_data(target_ulong address)
 {
 IntervalTreeNode *n;
 TargetPageDataNode *t;
-target_ulong page, region;
+target_ulong page, region, p_ofs;
 
 page = address & TARGET_PAGE_MASK;
 region = address & TBD_MASK;
@@ -926,7 +927,8 @@ void *page_get_target_data(target_ulong address)
 mmap_lock();
 n = interval_tree_iter_first(&targetdata_root, page, page);
 if (!n) {
-t = g_new0(TargetPageDataNode, 1);
+t = g_malloc0(sizeof(TargetPageDataNode)
+  + TPD_PAGES * TARGET_PAGE_DATA_SIZE);
 n = &t->itree;
 n->start = region;
 n->last = region | ~TBD_MASK;
@@ -936,7 +938,8 @@ void *page_get_target_data(target_ulong address)
 }
 
 t = container_of(n, TargetPageDataNode, itree);
-return t->data[(page - region) >> TARGET_PAGE_BITS];
+p_ofs = (page - region) >> TARGET_PAGE_BITS;
+return t->data + p_ofs * TARGET_PAGE_DATA_SIZE;
 }
 #else
 void page_reset_target_data(target_ulong start, target_ulong last) { }
-- 
2.34.1




[PATCH v4 13/36] softmmu/physmem: Remove qemu_host_page_size

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size() instead.

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-14-richard.hender...@linaro.org>
---
 system/physmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/system/physmem.c b/system/physmem.c
index 5e66d9ae36..508dcb7494 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3515,7 +3515,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
  *fallocate works on hugepages and shmem
  *shared anonymous memory requires madvise REMOVE
  */
-need_madvise = (rb->page_size == qemu_host_page_size);
+need_madvise = (rb->page_size == qemu_real_host_page_size());
 need_fallocate = rb->fd != -1;
 if (need_fallocate) {
 /* For a file, this causes the area of the file to be zero'd
-- 
2.34.1




[PATCH v4 29/36] cpu: Remove page_size_init

2024-02-14 Thread Richard Henderson
Move qemu_host_page_{size,mask} and HOST_PAGE_ALIGN into bsd-user.
It should be removed from bsd-user as well, but defer that cleanup.

Reviewed-by: Warner Losh 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Tested-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-28-richard.hender...@linaro.org>
---
 bsd-user/qemu.h   |  7 +++
 include/exec/cpu-common.h |  7 ---
 include/hw/core/cpu.h |  2 --
 accel/tcg/translate-all.c |  1 -
 bsd-user/main.c   | 12 
 cpu-target.c  | 13 -
 system/vl.c   |  1 -
 7 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h
index dc842fffa7..c05c512767 100644
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -39,6 +39,13 @@ extern char **environ;
 #include "qemu/clang-tsa.h"
 
 #include "qemu-os.h"
+/*
+ * TODO: Remove these and rely only on qemu_real_host_page_size().
+ */
+extern uintptr_t qemu_host_page_size;
+extern intptr_t qemu_host_page_mask;
+#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size)
+
 /*
  * This struct is used to hold certain information about the image.  Basically,
  * it replicates in user space what would be certain task_struct fields in the
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9ead1be100..6346df17ce 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -20,13 +20,6 @@
 void cpu_exec_init_all(void);
 void cpu_exec_step_atomic(CPUState *cpu);
 
-/* Using intptr_t ensures that qemu_*_page_mask is sign-extended even
- * when intptr_t is 32-bit and we are aligning a long long.
- */
-extern uintptr_t qemu_host_page_size;
-extern intptr_t qemu_host_page_mask;
-
-#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size)
 #define REAL_HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_real_host_page_size())
 
 /* The CPU list lock nests outside page_(un)lock or mmap_(un)lock */
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 4385ce54c9..5c2d55f6d2 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -1179,8 +1179,6 @@ bool target_words_bigendian(void);
 
 const char *target_name(void);
 
-void page_size_init(void);
-
 #ifdef NEED_CPU_H
 
 #ifndef CONFIG_USER_ONLY
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 1c695efe02..c1f57e894a 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -256,7 +256,6 @@ bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
 
 void page_init(void)
 {
-page_size_init();
 page_table_config_init();
 }
 
diff --git a/bsd-user/main.c b/bsd-user/main.c
index 521b58b880..4d6ce59af4 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -49,6 +49,13 @@
 #include "host-os.h"
 #include "target_arch_cpu.h"
 
+
+/*
+ * TODO: Remove these and rely only on qemu_real_host_page_size().
+ */
+uintptr_t qemu_host_page_size;
+intptr_t qemu_host_page_mask;
+
 static bool opt_one_insn_per_tb;
 uintptr_t guest_base;
 bool have_guest_base;
@@ -307,6 +314,9 @@ int main(int argc, char **argv)
 (void) envlist_setenv(envlist, *wrk);
 }
 
+qemu_host_page_size = getpagesize();
+qemu_host_page_size = MAX(qemu_host_page_size, TARGET_PAGE_SIZE);
+
 cpu_model = NULL;
 
 qemu_add_opts(&qemu_trace_opts);
@@ -403,6 +413,8 @@ int main(int argc, char **argv)
 }
 }
 
+qemu_host_page_mask = -qemu_host_page_size;
+
 /* init debug */
 {
 int mask = 0;
diff --git a/cpu-target.c b/cpu-target.c
index 86444cc2c6..8763da51ee 100644
--- a/cpu-target.c
+++ b/cpu-target.c
@@ -474,16 +474,3 @@ const char *target_name(void)
 {
 return TARGET_NAME;
 }
-
-void page_size_init(void)
-{
-/* NOTE: we can always suppose that qemu_host_page_size >=
-   TARGET_PAGE_SIZE */
-if (qemu_host_page_size == 0) {
-qemu_host_page_size = qemu_real_host_page_size();
-}
-if (qemu_host_page_size < TARGET_PAGE_SIZE) {
-qemu_host_page_size = TARGET_PAGE_SIZE;
-}
-qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
-}
diff --git a/system/vl.c b/system/vl.c
index a82555ae15..3a2586a04d 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2118,7 +2118,6 @@ static void qemu_create_machine(QDict *qdict)
 }
 
 cpu_exec_init_all();
-page_size_init();
 
 if (machine_class->hw_version) {
 qemu_set_hw_version(machine_class->hw_version);
-- 
2.34.1




[PATCH v4 32/36] target/arm: Enable TARGET_PAGE_BITS_VARY for AArch64 user-only

2024-02-14 Thread Richard Henderson
Since aarch64 binaries are generally built for multiple
page sizes, it is trivial to allow the page size to vary.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-31-richard.hender...@linaro.org>
---
 target/arm/cpu-param.h |  6 -
 target/arm/cpu.c   | 51 --
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/target/arm/cpu-param.h b/target/arm/cpu-param.h
index f9b462a98f..da3243ab21 100644
--- a/target/arm/cpu-param.h
+++ b/target/arm/cpu-param.h
@@ -19,9 +19,13 @@
 #endif
 
 #ifdef CONFIG_USER_ONLY
-#define TARGET_PAGE_BITS 12
 # ifdef TARGET_AARCH64
 #  define TARGET_TAGGED_ADDRESSES
+/* Allow user-only to vary page size from 4k */
+#  define TARGET_PAGE_BITS_VARY
+#  define TARGET_PAGE_BITS_MIN  12
+# else
+#  define TARGET_PAGE_BITS 12
 # endif
 #else
 /*
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 5fa86bc8d5..2325d4007f 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1809,7 +1809,6 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
 ARMCPU *cpu = ARM_CPU(dev);
 ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev);
 CPUARMState *env = &cpu->env;
-int pagebits;
 Error *local_err = NULL;
 
 #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
@@ -2100,28 +2099,36 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
!cpu_isar_feature(aa32_vfp_simd, cpu) ||
!arm_feature(env, ARM_FEATURE_XSCALE));
 
-if (arm_feature(env, ARM_FEATURE_V7) &&
-!arm_feature(env, ARM_FEATURE_M) &&
-!arm_feature(env, ARM_FEATURE_PMSA)) {
-/* v7VMSA drops support for the old ARMv5 tiny pages, so we
- * can use 4K pages.
- */
-pagebits = 12;
-} else {
-/* For CPUs which might have tiny 1K pages, or which have an
- * MPU and might have small region sizes, stick with 1K pages.
- */
-pagebits = 10;
-}
-if (!set_preferred_target_page_bits(pagebits)) {
-/* This can only ever happen for hotplugging a CPU, or if
- * the board code incorrectly creates a CPU which it has
- * promised via minimum_page_size that it will not.
- */
-error_setg(errp, "This CPU requires a smaller page size than the "
-   "system is using");
-return;
+#ifndef CONFIG_USER_ONLY
+{
+int pagebits;
+if (arm_feature(env, ARM_FEATURE_V7) &&
+!arm_feature(env, ARM_FEATURE_M) &&
+!arm_feature(env, ARM_FEATURE_PMSA)) {
+/*
+ * v7VMSA drops support for the old ARMv5 tiny pages,
+ * so we can use 4K pages.
+ */
+pagebits = 12;
+} else {
+/*
+ * For CPUs which might have tiny 1K pages, or which have an
+ * MPU and might have small region sizes, stick with 1K pages.
+ */
+pagebits = 10;
+}
+if (!set_preferred_target_page_bits(pagebits)) {
+/*
+ * This can only ever happen for hotplugging a CPU, or if
+ * the board code incorrectly creates a CPU which it has
+ * promised via minimum_page_size that it will not.
+ */
+error_setg(errp, "This CPU requires a smaller page size "
+   "than the system is using");
+return;
+}
 }
+#endif
 
 /* This cpu-id-to-MPIDR affinity is used only for TCG; KVM will override 
it.
  * We don't support setting cluster ID ([16..23]) (known as Aff2
-- 
2.34.1




[PATCH v4 27/36] tests/tcg: Extend file in linux-madvise.c

2024-02-14 Thread Richard Henderson
When guest page size > host page size, this test can fail
due to the SIGBUS protection hack.  Avoid this by making
sure that the file size is at least one guest page.

Visible with alpha guest on x86_64 host.
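
The host-level behaviour behind that hack is standard mmap semantics, independent
of QEMU: mapping past end-of-file succeeds, but touching a page that lies entirely
beyond EOF raises SIGBUS.  A minimal stand-alone illustration (not part of the
patch; the temporary file name is arbitrary):

/* Illustrative only: a 1-byte file mapped over two pages.  The first
 * page is readable (the EOF tail reads as zeros), but touching the
 * second page raises SIGBUS unless the file is extended first. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    int fd = open("madvise-demo.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);
    char c = 'x';

    (void)write(fd, &c, sizeof(c));            /* file is one byte long */
    char *p = mmap(NULL, 2 * page, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED) {
        return 1;
    }
    printf("first byte: %c\n", p[0]);          /* fine: inside the first page */
    /* p[page] would SIGBUS: that page lies entirely past EOF.           */
    /* ftruncate(fd, 2 * page) before the access makes it read as zero.  */
    munmap(p, 2 * page);
    close(fd);
    unlink("madvise-demo.tmp");
    return 0;
}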

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-26-richard.hender...@linaro.org>
---
 tests/tcg/multiarch/linux/linux-madvise.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/tcg/multiarch/linux/linux-madvise.c 
b/tests/tcg/multiarch/linux/linux-madvise.c
index 29d0997e68..539fb3b772 100644
--- a/tests/tcg/multiarch/linux/linux-madvise.c
+++ b/tests/tcg/multiarch/linux/linux-madvise.c
@@ -42,6 +42,8 @@ static void test_file(void)
 assert(ret == 0);
written = write(fd, &c, sizeof(c));
 assert(written == sizeof(c));
+ret = ftruncate(fd, pagesize);
+assert(ret == 0);
 page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE, fd, 0);
 assert(page != MAP_FAILED);
 
-- 
2.34.1




[PATCH v4 02/36] linux-user: Adjust SVr4 NULL page mapping

2024-02-14 Thread Richard Henderson
Use TARGET_PAGE_SIZE and MAP_FIXED_NOREPLACE.

We really should be attending to this earlier, during
probe_guest_base, along with better detection and
emulation of various Linux personalities.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-3-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index b8eef893d0..e918a13748 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3912,8 +3912,9 @@ int load_elf_binary(struct linux_binprm *bprm, struct 
image_info *info)
and some applications "depend" upon this behavior.  Since
we do not have the power to recompile these, we emulate
the SVr4 behavior.  Sigh.  */
-target_mmap(0, qemu_host_page_size, PROT_READ | PROT_EXEC,
-MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+target_mmap(0, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC,
+MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS,
+-1, 0);
 }
 #ifdef TARGET_MIPS
 info->interp_fp_abi = interp_info.fp_abi;
-- 
2.34.1




[PATCH v4 19/36] linux-user: Split out mmap_end

2024-02-14 Thread Richard Henderson
Use a subroutine instead of a goto within target_mmap__locked.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-20-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 71 +++
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 48fcdd4a32..cc983bedbd 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -490,6 +490,43 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 }
 }
 
+/*
+ * Record a successful mmap within the user-exec interval tree.
+ */
+static abi_long mmap_end(abi_ulong start, abi_ulong last,
+ abi_ulong passthrough_start,
+ abi_ulong passthrough_last,
+ int flags, int page_flags)
+{
+if (flags & MAP_ANONYMOUS) {
+page_flags |= PAGE_ANON;
+}
+page_flags |= PAGE_RESET;
+if (passthrough_start > passthrough_last) {
+page_set_flags(start, last, page_flags);
+} else {
+if (start < passthrough_start) {
+page_set_flags(start, passthrough_start - 1, page_flags);
+}
+page_set_flags(passthrough_start, passthrough_last,
+   page_flags | PAGE_PASSTHROUGH);
+if (passthrough_last < last) {
+page_set_flags(passthrough_last + 1, last, page_flags);
+}
+}
+shm_region_rm_complete(start, last);
+trace_target_mmap_complete(start);
+if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
+FILE *f = qemu_log_trylock();
+if (f) {
+fprintf(f, "page layout changed following mmap\n");
+page_dump(f);
+qemu_log_unlock(f);
+}
+}
+return start;
+}
+
 static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 int target_prot, int flags, int page_flags,
 int fd, off_t offset)
@@ -632,7 +669,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 ret = target_mprotect(start, len, target_prot);
 assert(ret == 0);
 }
-goto the_end;
+return mmap_end(start, last, -1, 0, flags, page_flags);
 }
 
 /* handle the start of the mapping */
@@ -643,7 +680,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
target_prot, flags, fd, offset)) {
 return -1;
 }
-goto the_end;
+return mmap_end(start, last, -1, 0, flags, page_flags);
 }
 if (!mmap_frag(real_start, start,
real_start + host_page_size - 1,
@@ -690,34 +727,8 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 passthrough_last = real_last;
 }
 }
- the_end:
-if (flags & MAP_ANONYMOUS) {
-page_flags |= PAGE_ANON;
-}
-page_flags |= PAGE_RESET;
-if (passthrough_start > passthrough_last) {
-page_set_flags(start, last, page_flags);
-} else {
-if (start < passthrough_start) {
-page_set_flags(start, passthrough_start - 1, page_flags);
-}
-page_set_flags(passthrough_start, passthrough_last,
-   page_flags | PAGE_PASSTHROUGH);
-if (passthrough_last < last) {
-page_set_flags(passthrough_last + 1, last, page_flags);
-}
-}
-shm_region_rm_complete(start, last);
-trace_target_mmap_complete(start);
-if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
-FILE *f = qemu_log_trylock();
-if (f) {
-fprintf(f, "page layout changed following mmap\n");
-page_dump(f);
-qemu_log_unlock(f);
-}
-}
-return start;
+return mmap_end(start, last, passthrough_start, passthrough_last,
+flags, page_flags);
 }
 
 /* NOTE: all the constants are the HOST ones */
-- 
2.34.1




[PATCH v4 35/36] target/alpha: Enable TARGET_PAGE_BITS_VARY for user-only

2024-02-14 Thread Richard Henderson
Since alpha binaries are generally built for multiple
page sizes, it is trivial to allow the page size to vary.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-34-richard.hender...@linaro.org>
---
 target/alpha/cpu-param.h | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/target/alpha/cpu-param.h b/target/alpha/cpu-param.h
index 68c46f7998..c969cb016b 100644
--- a/target/alpha/cpu-param.h
+++ b/target/alpha/cpu-param.h
@@ -9,10 +9,22 @@
 #define ALPHA_CPU_PARAM_H
 
 #define TARGET_LONG_BITS 64
-#define TARGET_PAGE_BITS 13
 
 /* ??? EV4 has 34 phys addr bits, EV5 has 40, EV6 has 44.  */
 #define TARGET_PHYS_ADDR_SPACE_BITS  44
-#define TARGET_VIRT_ADDR_SPACE_BITS  (30 + TARGET_PAGE_BITS)
+
+#ifdef CONFIG_USER_ONLY
+/*
+ * Allow user-only to vary page size.  Real hardware allows only 8k and 64k,
+ * but since any variance means guests cannot assume a fixed value, allow
+ * a 4k minimum to match x86 host, which can minimize emulation issues.
+ */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+# define TARGET_VIRT_ADDR_SPACE_BITS  63
+#else
+# define TARGET_PAGE_BITS 13
+# define TARGET_VIRT_ADDR_SPACE_BITS  (30 + TARGET_PAGE_BITS)
+#endif
 
 #endif
-- 
2.34.1




[PATCH v4 08/36] linux-user: Remove qemu_host_page_{size, mask} from mmap.c

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size instead.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-9-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 66 +++
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 96c9433e27..4d3c8717b9 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -165,6 +165,7 @@ static int target_to_host_prot(int prot)
 /* NOTE: all the constants are the HOST ones, but addresses are target. */
 int target_mprotect(abi_ulong start, abi_ulong len, int target_prot)
 {
+int host_page_size = qemu_real_host_page_size();
 abi_ulong starts[3];
 abi_ulong lens[3];
 int prots[3];
@@ -189,13 +190,13 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 }
 
 last = start + len - 1;
-host_start = start & qemu_host_page_mask;
+host_start = start & -host_page_size;
 host_last = HOST_PAGE_ALIGN(last) - 1;
 nranges = 0;
 
 mmap_lock();
 
-if (host_last - host_start < qemu_host_page_size) {
+if (host_last - host_start < host_page_size) {
 /* Single host page contains all guest pages: sum the prot. */
 prot1 = target_prot;
 for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
@@ -205,7 +206,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 prot1 |= page_get_flags(a + 1);
 }
 starts[nranges] = host_start;
-lens[nranges] = qemu_host_page_size;
+lens[nranges] = host_page_size;
 prots[nranges] = prot1;
 nranges++;
 } else {
@@ -218,10 +219,10 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 /* If the resulting sum differs, create a new range. */
 if (prot1 != target_prot) {
 starts[nranges] = host_start;
-lens[nranges] = qemu_host_page_size;
+lens[nranges] = host_page_size;
 prots[nranges] = prot1;
 nranges++;
-host_start += qemu_host_page_size;
+host_start += host_page_size;
 }
 }
 
@@ -233,9 +234,9 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 }
 /* If the resulting sum differs, create a new range. */
 if (prot1 != target_prot) {
-host_last -= qemu_host_page_size;
+host_last -= host_page_size;
 starts[nranges] = host_last + 1;
-lens[nranges] = qemu_host_page_size;
+lens[nranges] = host_page_size;
 prots[nranges] = prot1;
 nranges++;
 }
@@ -270,6 +271,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
   int prot, int flags, int fd, off_t offset)
 {
+int host_page_size = qemu_real_host_page_size();
 abi_ulong real_last;
 void *host_start;
 int prot_old, prot_new;
@@ -286,7 +288,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong 
start, abi_ulong last,
 return false;
 }
 
-real_last = real_start + qemu_host_page_size - 1;
+real_last = real_start + host_page_size - 1;
 host_start = g2h_untagged(real_start);
 
 /* Get the protection of the target pages outside the mapping. */
@@ -304,12 +306,12 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong 
start, abi_ulong last,
  * outside of the fragment we need to map.  Allocate a new host
  * page to cover, discarding whatever else may have been present.
  */
-void *p = mmap(host_start, qemu_host_page_size,
+void *p = mmap(host_start, host_page_size,
target_to_host_prot(prot),
flags | MAP_ANONYMOUS, -1, 0);
 if (p != host_start) {
 if (p != MAP_FAILED) {
-munmap(p, qemu_host_page_size);
+munmap(p, host_page_size);
 errno = EEXIST;
 }
 return false;
@@ -324,7 +326,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong 
start, abi_ulong last,
 /* Adjust protection to be able to write. */
 if (!(host_prot_old & PROT_WRITE)) {
 host_prot_old |= PROT_WRITE;
-mprotect(host_start, qemu_host_page_size, host_prot_old);
+mprotect(host_start, host_page_size, host_prot_old);
 }
 
 /* Read or zero the new guest pages. */
@@ -338,7 +340,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong 
start, abi_ulong last,
 
 /* Put final protection */
 if (host_prot_new != host_prot_old) {
-mprotect(host_start, qemu_host_page_size, host_prot_new);
+mprotect(host_start, host_page_size, host_prot_new);
  

[PATCH v4 10/36] linux-user: Remove HOST_PAGE_ALIGN from mmap.c

2024-02-14 Thread Richard Henderson
This removes a hidden use of qemu_host_page_size, using instead
the existing host_page_size local within each function.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-11-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 53e5486cc8..d11f758d07 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -191,7 +191,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 
 last = start + len - 1;
 host_start = start & -host_page_size;
-host_last = HOST_PAGE_ALIGN(last) - 1;
+host_last = ROUND_UP(last, host_page_size) - 1;
 nranges = 0;
 
 mmap_lock();
@@ -389,8 +389,7 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 start &= -host_page_size;
 }
 start = ROUND_UP(start, align);
-
-size = HOST_PAGE_ALIGN(size);
+size = ROUND_UP(size, host_page_size);
 
 if (reserved_va) {
 return mmap_find_vma_reserved(start, size, align);
@@ -550,7 +549,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  */
 if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
 host_len = len + offset - host_offset;
-host_len = HOST_PAGE_ALIGN(host_len);
+host_len = ROUND_UP(host_len, host_page_size);
 start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
 if (start == (abi_ulong)-1) {
 errno = ENOMEM;
@@ -595,7 +594,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 void *p;
 
 host_len = len + offset - host_offset;
-host_len = HOST_PAGE_ALIGN(host_len);
+host_len = ROUND_UP(host_len, host_page_size);
 host_prot = target_to_host_prot(target_prot);
 
 /* Note: we prefer to control the mapping address. */
@@ -625,7 +624,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 goto fail;
 }
 last = start + len - 1;
-real_last = HOST_PAGE_ALIGN(last) - 1;
+real_last = ROUND_UP(last, host_page_size) - 1;
 
 /*
  * Test if requested memory area fits target address space
@@ -794,7 +793,7 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong 
len)
 
 last = start + len - 1;
 real_start = start & -host_page_size;
-real_last = HOST_PAGE_ALIGN(last) - 1;
+real_last = ROUND_UP(last, host_page_size) - 1;
 
 /*
  * If guest pages remain on the first or last host pages,
-- 
2.34.1




[PATCH v4 26/36] tests/tcg: Remove run-test-mmap-*

2024-02-14 Thread Richard Henderson
These tests are confused, because -p does not change
the guest page size, but the host page size.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-25-richard.hender...@linaro.org>
---
 tests/tcg/alpha/Makefile.target |  3 ---
 tests/tcg/arm/Makefile.target   |  3 ---
 tests/tcg/hppa/Makefile.target  |  3 ---
 tests/tcg/i386/Makefile.target  |  3 ---
 tests/tcg/m68k/Makefile.target  |  3 ---
 tests/tcg/multiarch/Makefile.target |  9 -
 tests/tcg/ppc/Makefile.target   | 12 
 tests/tcg/sh4/Makefile.target   |  3 ---
 tests/tcg/sparc64/Makefile.target   |  6 --
 9 files changed, 45 deletions(-)
 delete mode 100644 tests/tcg/ppc/Makefile.target
 delete mode 100644 tests/tcg/sparc64/Makefile.target

diff --git a/tests/tcg/alpha/Makefile.target b/tests/tcg/alpha/Makefile.target
index b94500a7d9..fdd7ddf64e 100644
--- a/tests/tcg/alpha/Makefile.target
+++ b/tests/tcg/alpha/Makefile.target
@@ -13,6 +13,3 @@ test-cmov: test-cond.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
 
 run-test-cmov: test-cmov
-
-# On Alpha Linux only supports 8k pages
-EXTRA_RUNS+=run-test-mmap-8192
diff --git a/tests/tcg/arm/Makefile.target b/tests/tcg/arm/Makefile.target
index 3473f4619e..0a1965fce7 100644
--- a/tests/tcg/arm/Makefile.target
+++ b/tests/tcg/arm/Makefile.target
@@ -79,6 +79,3 @@ sha512-vector: sha512.c
 ARM_TESTS += sha512-vector
 
 TESTS += $(ARM_TESTS)
-
-# On ARM Linux only supports 4k pages
-EXTRA_RUNS+=run-test-mmap-4096
diff --git a/tests/tcg/hppa/Makefile.target b/tests/tcg/hppa/Makefile.target
index cdd0d572a7..ea5ae2186d 100644
--- a/tests/tcg/hppa/Makefile.target
+++ b/tests/tcg/hppa/Makefile.target
@@ -2,9 +2,6 @@
 #
 # HPPA specific tweaks - specifically masking out broken tests
 
-# On parisc Linux supports 4K/16K/64K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-16384 run-test-mmap-65536
-
 # This triggers failures for hppa-linux about 1% of the time
 # HPPA is the odd target that can't use the sigtramp page;
 # it requires the full vdso with dwarf2 unwind info.
diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
index 9906f9e116..bbe2c44b2a 100644
--- a/tests/tcg/i386/Makefile.target
+++ b/tests/tcg/i386/Makefile.target
@@ -71,9 +71,6 @@ endif
 I386_TESTS:=$(filter-out $(SKIP_I386_TESTS), $(ALL_X86_TESTS))
 TESTS=$(MULTIARCH_TESTS) $(I386_TESTS)
 
-# On i386 and x86_64 Linux only supports 4k pages (large pages are a different 
hack)
-EXTRA_RUNS+=run-test-mmap-4096
-
 sha512-sse: CFLAGS=-msse4.1 -O3
 sha512-sse: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
diff --git a/tests/tcg/m68k/Makefile.target b/tests/tcg/m68k/Makefile.target
index 6ff214e60a..33f7b1b127 100644
--- a/tests/tcg/m68k/Makefile.target
+++ b/tests/tcg/m68k/Makefile.target
@@ -5,6 +5,3 @@
 
 VPATH += $(SRC_PATH)/tests/tcg/m68k
 TESTS += trap denormal
-
-# On m68k Linux supports 4k and 8k pages (but 8k is currently broken)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192
diff --git a/tests/tcg/multiarch/Makefile.target 
b/tests/tcg/multiarch/Makefile.target
index e10951a801..f11f3b084d 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -51,18 +51,9 @@ run-plugin-vma-pthread-with-%: vma-pthread
$(call skip-test, $<, "flaky on CI?")
 endif
 
-# We define the runner for test-mmap after the individual
-# architectures have defined their supported pages sizes. If no
-# additional page sizes are defined we only run the default test.
-
-# default case (host page size)
 run-test-mmap: test-mmap
$(call run-test, test-mmap, $(QEMU) $<, $< (default))
 
-# additional page sizes (defined by each architecture adding to EXTRA_RUNS)
-run-test-mmap-%: test-mmap
-   $(call run-test, test-mmap-$*, $(QEMU) -p $* $<, $< ($* byte pages))
-
 ifneq ($(GDB),)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
diff --git a/tests/tcg/ppc/Makefile.target b/tests/tcg/ppc/Makefile.target
deleted file mode 100644
index f5e08c7376..00
--- a/tests/tcg/ppc/Makefile.target
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- Mode: makefile -*-
-#
-# PPC - included from tests/tcg/Makefile
-#
-
-ifneq (,$(findstring 64,$(TARGET_NAME)))
-# On PPC64 Linux can be configured with 4k (default) or 64k pages (currently 
broken)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-65536
-else
-# On PPC32 Linux supports 4K/16K/64K/256K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-16384 run-test-mmap-65536 
run-test-mmap-262144
-endif
diff --git a/tests/tcg/sh4/Makefile.target b/tests/tcg/sh4/Makefile.target
index 47c39a44b6..16eaa850a8 100644
--- a/tests/tcg/sh4/Makefile.target
+++ b/tests/tcg/sh4/Makefile.target
@@ -3,9 +3,6 @@
 # SuperH specific tweaks
 #
 
-# On sh Linux supports 4k, 8k, 16k and 64k pages (but only 4k currently works)

[PATCH v4 34/36] target/ppc: Enable TARGET_PAGE_BITS_VARY for user-only

2024-02-14 Thread Richard Henderson
Since ppc binaries are generally built for multiple
page sizes, it is trivial to allow the page size to vary.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-33-richard.hender...@linaro.org>
---
 target/ppc/cpu-param.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/target/ppc/cpu-param.h b/target/ppc/cpu-param.h
index 0a0416e0a8..b7ad52de03 100644
--- a/target/ppc/cpu-param.h
+++ b/target/ppc/cpu-param.h
@@ -31,6 +31,13 @@
 # define TARGET_PHYS_ADDR_SPACE_BITS 36
 # define TARGET_VIRT_ADDR_SPACE_BITS 32
 #endif
-#define TARGET_PAGE_BITS 12
+
+#ifdef CONFIG_USER_ONLY
+/* Allow user-only to vary page size from 4k */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+#else
+# define TARGET_PAGE_BITS 12
+#endif
 
 #endif
-- 
2.34.1




[PATCH v4 04/36] linux-user: Remove qemu_host_page_size from create_elf_tables

2024-02-14 Thread Richard Henderson
AT_PAGESZ is supposed to advertise the guest page size.
The random adjustment made here using qemu_host_page_size
does not match anything else within linux-user.

The idea here is good, but should be done more systematically
via adjustment to TARGET_PAGE_SIZE.
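
For context, the value chosen here is what guest code reads back through the
auxiliary vector.  A small illustrative guest program (not part of the patch)
that consumes AT_PAGESZ and derives alignment from it, which is why the entry
must reflect the guest's TARGET_PAGE_SIZE rather than the host page size:

/* Illustrative only: how a guest binary typically uses AT_PAGESZ. */
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
    unsigned long page = getauxval(AT_PAGESZ);
    unsigned long len = 100;                                 /* arbitrary request */
    unsigned long aligned = (len + page - 1) & ~(page - 1);  /* round up to a page */

    printf("AT_PAGESZ=%lu, aligned length=%lu\n", page, aligned);
    return 0;
}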

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-5-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index e84a201448..dfb152bfcb 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2679,13 +2679,7 @@ static abi_ulong create_elf_tables(abi_ulong p, int 
argc, int envc,
 NEW_AUX_ENT(AT_PHDR, (abi_ulong)(info->load_addr + exec->e_phoff));
 NEW_AUX_ENT(AT_PHENT, (abi_ulong)(sizeof (struct elf_phdr)));
 NEW_AUX_ENT(AT_PHNUM, (abi_ulong)(exec->e_phnum));
-if ((info->alignment & ~qemu_host_page_mask) != 0) {
-/* Target doesn't support host page size alignment */
-NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
-} else {
-NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(MAX(TARGET_PAGE_SIZE,
-   qemu_host_page_size)));
-}
+NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
 NEW_AUX_ENT(AT_BASE, (abi_ulong)(interp_info ? interp_info->load_addr : 
0));
 NEW_AUX_ENT(AT_FLAGS, (abi_ulong)0);
 NEW_AUX_ENT(AT_ENTRY, info->entry);
-- 
2.34.1




[PATCH v4 16/36] linux-user: Split out target_mmap__locked

2024-02-14 Thread Richard Henderson
All "goto fail" may be transformed to "return -1".

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-17-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 62 ++-
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index d11f758d07..b4c3cc65aa 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -490,9 +490,9 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 }
 }
 
-/* NOTE: all the constants are the HOST ones */
-abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
- int flags, int fd, off_t offset)
+static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
+int target_prot, int flags,
+int fd, off_t offset)
 {
 int host_page_size = qemu_real_host_page_size();
 abi_ulong ret, last, real_start, real_last, retaddr, host_len;
@@ -500,30 +500,27 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 int page_flags;
 off_t host_offset;
 
-mmap_lock();
-trace_target_mmap(start, len, target_prot, flags, fd, offset);
-
 if (!len) {
 errno = EINVAL;
-goto fail;
+return -1;
 }
 
 page_flags = validate_prot_to_pageflags(target_prot);
 if (!page_flags) {
 errno = EINVAL;
-goto fail;
+return -1;
 }
 
 /* Also check for overflows... */
 len = TARGET_PAGE_ALIGN(len);
 if (!len) {
 errno = ENOMEM;
-goto fail;
+return -1;
 }
 
 if (offset & ~TARGET_PAGE_MASK) {
 errno = EINVAL;
-goto fail;
+return -1;
 }
 
 /*
@@ -553,7 +550,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
 if (start == (abi_ulong)-1) {
 errno = ENOMEM;
-goto fail;
+return -1;
 }
 }
 
@@ -574,7 +571,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 struct stat sb;
 
if (fstat(fd, &sb) == -1) {
-goto fail;
+return -1;
 }
 
 /* Are we trying to create a map beyond EOF?.  */
@@ -601,7 +598,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 p = mmap(g2h_untagged(start), host_len, host_prot,
  flags | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
 if (p == MAP_FAILED) {
-goto fail;
+return -1;
 }
 /* update start so that it points to the file position at 'offset' */
 host_start = (uintptr_t)p;
@@ -610,7 +607,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  flags | MAP_FIXED, fd, host_offset);
 if (p == MAP_FAILED) {
 munmap(g2h_untagged(start), host_len);
-goto fail;
+return -1;
 }
 host_start += offset - host_offset;
 }
@@ -621,7 +618,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 } else {
 if (start & ~TARGET_PAGE_MASK) {
 errno = EINVAL;
-goto fail;
+return -1;
 }
 last = start + len - 1;
 real_last = ROUND_UP(last, host_page_size) - 1;
@@ -633,14 +630,14 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  */
 if (last < start || !guest_range_valid_untagged(start, len)) {
 errno = ENOMEM;
-goto fail;
+return -1;
 }
 
 if (flags & MAP_FIXED_NOREPLACE) {
 /* Validate that the chosen range is empty. */
 if (!page_check_range_empty(start, last)) {
 errno = EEXIST;
-goto fail;
+return -1;
 }
 
 /*
@@ -671,17 +668,17 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 if ((flags & MAP_TYPE) == MAP_SHARED
 && (target_prot & PROT_WRITE)) {
 errno = EINVAL;
-goto fail;
+return -1;
 }
 retaddr = target_mmap(start, len, target_prot | PROT_WRITE,
   (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))
   | MAP_PRIVATE | MAP_ANONYMOUS,
   -1, 0);
 if (retaddr == -1) {
-goto fail;
+return -1;
 }
 if (pread(fd, g2h_untagged(start), len, offset) == -1) {
-goto fail;
+return -1;
 }
 if (!(target_prot & PROT_WRITE)) {
   

[PATCH v4 23/36] linux-user: Split out mmap_h_eq_g

2024-02-14 Thread Richard Henderson
Move the MAP_FIXED_NOREPLACE check for reserved_va earlier.
Move the computation of host_prot earlier.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-22-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 68 ++-
 1 file changed, 55 insertions(+), 13 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index cbcd31e941..d3556bcc14 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -542,6 +542,33 @@ static abi_long mmap_end(abi_ulong start, abi_ulong last,
 return start;
 }
 
+/*
+ * Special case host page size == target page size,
+ * where there are no edge conditions.
+ */
+static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong len,
+int host_prot, int flags, int page_flags,
+int fd, off_t offset)
+{
+void *p, *want_p = g2h_untagged(start);
+abi_ulong last;
+
+p = mmap(want_p, len, host_prot, flags, fd, offset);
+if (p == MAP_FAILED) {
+return -1;
+}
+/* If the host kernel does not support MAP_FIXED_NOREPLACE, emulate. */
+if ((flags & MAP_FIXED_NOREPLACE) && p != want_p) {
+do_munmap(p, len);
+errno = EEXIST;
+return -1;
+}
+
+start = h2g(p);
+last = start + len - 1;
+return mmap_end(start, last, start, last, flags, page_flags);
+}
+
 static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 int target_prot, int flags, int page_flags,
 int fd, off_t offset)
@@ -550,6 +577,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 abi_ulong ret, last, real_start, real_last, retaddr, host_len;
 abi_ulong passthrough_start = -1, passthrough_last = 0;
 off_t host_offset;
+int host_prot;
 
 real_start = start & -host_page_size;
 host_offset = offset & -host_page_size;
@@ -558,16 +586,33 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
  * For reserved_va, we are in full control of the allocation.
  * Find a suitable hole and convert to MAP_FIXED.
  */
-if (reserved_va && !(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
-host_len = len + offset - host_offset;
-start = mmap_find_vma(real_start, host_len,
-  MAX(host_page_size, TARGET_PAGE_SIZE));
-if (start == (abi_ulong)-1) {
-errno = ENOMEM;
-return -1;
+if (reserved_va) {
+if (flags & MAP_FIXED_NOREPLACE) {
+/* Validate that the chosen range is empty. */
+if (!page_check_range_empty(start, start + len - 1)) {
+errno = EEXIST;
+return -1;
+}
+flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
+} else if (!(flags & MAP_FIXED)) {
+size_t real_len = len + offset - host_offset;
+abi_ulong align = MAX(host_page_size, TARGET_PAGE_SIZE);
+
+start = mmap_find_vma(real_start, real_len, align);
+if (start == (abi_ulong)-1) {
+errno = ENOMEM;
+return -1;
+}
+start += offset - host_offset;
+flags |= MAP_FIXED;
 }
-start += offset - host_offset;
-flags |= MAP_FIXED;
+}
+
+host_prot = target_to_host_prot(target_prot);
+
+if (host_page_size == TARGET_PAGE_SIZE) {
+return mmap_h_eq_g(start, len, host_prot, flags,
+   page_flags, fd, offset);
 }
 
 /*
@@ -603,12 +648,10 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 
 if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
 uintptr_t host_start;
-int host_prot;
 void *p;
 
 host_len = len + offset - host_offset;
 host_len = ROUND_UP(host_len, host_page_size);
-host_prot = target_to_host_prot(target_prot);
 
 /* Note: we prefer to control the mapping address. */
 p = mmap(g2h_untagged(start), host_len, host_prot,
@@ -731,8 +774,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 len1 = real_last - real_start + 1;
 want_p = g2h_untagged(real_start);
 
-p = mmap(want_p, len1, target_to_host_prot(target_prot),
- flags, fd, offset1);
+p = mmap(want_p, len1, host_prot, flags, fd, offset1);
 if (p != want_p) {
 if (p != MAP_FAILED) {
 do_munmap(p, len1);
-- 
2.34.1




[PATCH v4 28/36] *-user: Deprecate and disable -p pagesize

2024-02-14 Thread Richard Henderson
This option controls the host page size.  Judging from the misuse in
our own testsuite, it is easily confused with the guest page size.

The only thing that occurs when changing the host page size is
that stuff breaks, because one cannot actually change the host
page size.  Therefore reject all but the no-op setting as part
of the deprecation process.

Reviewed-by: Warner Losh 
Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-27-richard.hender...@linaro.org>
---
 docs/about/deprecated.rst | 10 ++
 docs/user/main.rst|  3 ---
 bsd-user/main.c   | 10 +-
 linux-user/main.c | 12 ++--
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 5a2305ccd6..3074303b9c 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -63,6 +63,16 @@ as short-form boolean values, and passed to plugins as 
``arg_name=on``.
 However, short-form booleans are deprecated and full explicit ``arg_name=on``
 form is preferred.
 
+User-mode emulator command line arguments
+-
+
+``-p`` (since 9.0)
+''
+
+The ``-p`` option pretends to control the host page size.  However,
+it is not possible to change the host page size, and using the
+option only causes failures.
+
 QEMU Machine Protocol (QMP) commands
 
 
diff --git a/docs/user/main.rst b/docs/user/main.rst
index 7e7ad07409..d5fbb78d3c 100644
--- a/docs/user/main.rst
+++ b/docs/user/main.rst
@@ -87,9 +87,6 @@ Debug options:
Activate logging of the specified items (use '-d help' for a list of
log items)
 
-``-p pagesize``
-   Act as if the host page size was 'pagesize' bytes
-
 ``-g port``
Wait gdb connection to port
 
diff --git a/bsd-user/main.c b/bsd-user/main.c
index e5efb7b845..521b58b880 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -364,11 +364,11 @@ int main(int argc, char **argv)
 } else if (!strcmp(r, "L")) {
 interp_prefix = argv[optind++];
 } else if (!strcmp(r, "p")) {
-qemu_host_page_size = atoi(argv[optind++]);
-if (qemu_host_page_size == 0 ||
-(qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
-fprintf(stderr, "page size must be a power of two\n");
-exit(1);
+unsigned size, want = qemu_real_host_page_size();
+
+if (qemu_strtoui(arg, NULL, 10, &size) || size != want) {
+warn_report("Deprecated page size option cannot "
+"change host page size (%u)", want);
 }
 } else if (!strcmp(r, "g")) {
 gdbstub = g_strdup(argv[optind++]);
diff --git a/linux-user/main.c b/linux-user/main.c
index e540acb84a..bad03f06d3 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -332,11 +332,11 @@ static void handle_arg_ld_prefix(const char *arg)
 
 static void handle_arg_pagesize(const char *arg)
 {
-qemu_host_page_size = atoi(arg);
-if (qemu_host_page_size == 0 ||
-(qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
-fprintf(stderr, "page size must be a power of two\n");
-exit(EXIT_FAILURE);
+unsigned size, want = qemu_real_host_page_size();
+
+if (qemu_strtoui(arg, NULL, 10, &size) || size != want) {
+warn_report("Deprecated page size option cannot "
+"change host page size (%u)", want);
 }
 }
 
@@ -496,7 +496,7 @@ static const struct qemu_argument arg_table[] = {
 {"D",  "QEMU_LOG_FILENAME", true, handle_arg_log_filename,
  "logfile", "write logs to 'logfile' (default stderr)"},
 {"p",  "QEMU_PAGESIZE",true,  handle_arg_pagesize,
- "pagesize",   "set the host page size to 'pagesize'"},
+ "pagesize",   "deprecated change to host page size"},
 {"one-insn-per-tb",
"QEMU_ONE_INSN_PER_TB",  false, handle_arg_one_insn_per_tb,
  "",   "run with one guest instruction per emulated TB"},
-- 
2.34.1




[PATCH v4 36/36] linux-user: Remove pgb_dynamic alignment assertion

2024-02-14 Thread Richard Henderson
The assertion was never correct, because the alignment is a composite
of the image alignment and SHMLBA.  Even if the alignment didn't match
the image, an assertion would not be correct -- more appropriate would
be an error message about an ill-formed image.  But the image cannot
be held to SHMLBA under any circumstances.

Fixes: ee94743034b ("linux-user: completely re-write init_guest_space")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2157
Signed-off-by: Richard Henderson 
---
 linux-user/elfload.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index f3f1ab4f69..d92d66ca1e 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3022,8 +3022,6 @@ static void pgb_dynamic(const char *image_name, uintptr_t 
guest_loaddr,
 uintptr_t brk, ret;
 PGBAddrs ga;
 
-assert(QEMU_IS_ALIGNED(guest_loaddr, align));
-
 /* Try the identity map first. */
 if (pgb_addr_set(, guest_loaddr, guest_hiaddr, true)) {
 brk = (uintptr_t)sbrk(0);
-- 
2.34.1




[PATCH v4 06/36] linux-user/nios2: Remove qemu_host_page_size from init_guest_commpage

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size.
If !reserved_va, use MAP_FIXED_NOREPLACE.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-7-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 1893b3c192..a9f1077861 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1532,10 +1532,14 @@ static bool init_guest_commpage(void)
  0x3a, 0x68, 0x3b, 0x00,  /* trap 0 */
 };
 
-void *want = g2h_untagged(LO_COMMPAGE & -qemu_host_page_size);
-void *addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE,
-  MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+int host_page_size = qemu_real_host_page_size();
+void *want, *addr;
 
+want = g2h_untagged(LO_COMMPAGE & -host_page_size);
+addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE,
+MAP_ANONYMOUS | MAP_PRIVATE |
+(reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE),
+-1, 0);
 if (addr == MAP_FAILED) {
 perror("Allocating guest commpage");
 exit(EXIT_FAILURE);
@@ -1544,9 +1548,9 @@ static bool init_guest_commpage(void)
 return false;
 }
 
-memcpy(addr, kuser_page, sizeof(kuser_page));
+memcpy(g2h_untagged(LO_COMMPAGE), kuser_page, sizeof(kuser_page));
 
-if (mprotect(addr, qemu_host_page_size, PROT_READ)) {
+if (mprotect(addr, host_page_size, PROT_READ)) {
 perror("Protecting guest commpage");
 exit(EXIT_FAILURE);
 }
-- 
2.34.1




[PATCH v4 31/36] linux-user: Allow TARGET_PAGE_BITS_VARY

2024-02-14 Thread Richard Henderson
If set, match the host and guest page sizes.
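
The page-bits value passed to set_preferred_target_page_bits() in the hunk below
is simply the base-2 logarithm of the host page size; a tiny stand-alone
illustration (not part of the patch):

/* ctz of a power-of-two page size yields the page-bits count:
 * 4096 -> 12, 16384 -> 14, 65536 -> 16. */
#include <stdio.h>

int main(void)
{
    unsigned sizes[] = { 4096, 16384, 65536 };

    for (int i = 0; i < 3; i++) {
        printf("page size %u -> page bits %d\n",
               sizes[i], __builtin_ctz(sizes[i]));
    }
    return 0;
}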

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-30-richard.hender...@linaro.org>
---
 linux-user/main.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index bad03f06d3..12bb839982 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -55,6 +55,7 @@
 #include "loader.h"
 #include "user-mmap.h"
 #include "tcg/perf.h"
+#include "exec/page-vary.h"
 
 #ifdef CONFIG_SEMIHOSTING
 #include "semihosting/semihost.h"
@@ -680,6 +681,7 @@ int main(int argc, char **argv, char **envp)
 int i;
 int ret;
 int execfd;
+int host_page_size;
 unsigned long max_reserved_va;
 bool preserve_argv0;
 
@@ -791,6 +793,16 @@ int main(int argc, char **argv, char **envp)
 opt_one_insn_per_tb, &error_abort);
 ac->init_machine(NULL);
 }
+
+/*
+ * Finalize page size before creating CPUs.
+ * This will do nothing if !TARGET_PAGE_BITS_VARY.
+ * The most efficient setting is to match the host.
+ */
+host_page_size = qemu_real_host_page_size();
+set_preferred_target_page_bits(ctz32(host_page_size));
+finalize_target_page_bits();
+
 cpu = cpu_create(cpu_type);
 env = cpu_env(cpu);
 cpu_reset(cpu);
@@ -804,8 +816,6 @@ int main(int argc, char **argv, char **envp)
  */
 max_reserved_va = MAX_RESERVED_VA(cpu);
 if (reserved_va != 0) {
-int host_page_size = qemu_real_host_page_size();
-
 if ((reserved_va + 1) % host_page_size) {
 char *s = size_to_str(host_page_size);
 fprintf(stderr, "Reserved virtual address not aligned mod %s\n", 
s);
@@ -904,7 +914,7 @@ int main(int argc, char **argv, char **envp)
  * If we're in a chroot with no /proc, fall back to 1 page.
  */
 if (mmap_min_addr == 0) {
-mmap_min_addr = qemu_real_host_page_size();
+mmap_min_addr = host_page_size;
 qemu_log_mask(CPU_LOG_PAGE,
   "host mmap_min_addr=0x%lx (fallback)\n",
   mmap_min_addr);
-- 
2.34.1




[PATCH v4 07/36] linux-user/arm: Remove qemu_host_page_size from init_guest_commpage

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size.
If the commpage is not within reserved_va, use MAP_FIXED_NOREPLACE.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-8-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index a9f1077861..f3f1ab4f69 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -460,6 +460,7 @@ enum {
 static bool init_guest_commpage(void)
 {
 ARMCPU *cpu = ARM_CPU(thread_cpu);
+int host_page_size = qemu_real_host_page_size();
 abi_ptr commpage;
 void *want;
 void *addr;
@@ -472,10 +473,12 @@ static bool init_guest_commpage(void)
 return true;
 }
 
-commpage = HI_COMMPAGE & -qemu_host_page_size;
+commpage = HI_COMMPAGE & -host_page_size;
 want = g2h_untagged(commpage);
-addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE,
-MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE,
+MAP_ANONYMOUS | MAP_PRIVATE |
+(commpage < reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE),
+-1, 0);
 
 if (addr == MAP_FAILED) {
 perror("Allocating guest commpage");
@@ -488,12 +491,12 @@ static bool init_guest_commpage(void)
 /* Set kernel helper versions; rest of page is 0.  */
 __put_user(5, (uint32_t *)g2h_untagged(0x0ffcu));
 
-if (mprotect(addr, qemu_host_page_size, PROT_READ)) {
+if (mprotect(addr, host_page_size, PROT_READ)) {
 perror("Protecting guest commpage");
 exit(EXIT_FAILURE);
 }
 
-page_set_flags(commpage, commpage | ~qemu_host_page_mask,
+page_set_flags(commpage, commpage | (host_page_size - 1),
PAGE_READ | PAGE_EXEC | PAGE_VALID);
 return true;
 }
-- 
2.34.1




[PATCH v4 01/36] accel/tcg: Remove qemu_host_page_size from page_protect/page_unprotect

2024-02-14 Thread Richard Henderson
Use qemu_real_host_page_size instead.  Except for the final mprotect
within page_protect, we already handled host < target page size.
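
A note on the idiom used in the hunks below: because qemu_real_host_page_size()
is always a power of two, "address & -host_page_size" rounds down to the start of
the containing host page.  A stand-alone check (illustrative only, not part of
the patch):

/* The masking idiom, assuming a power-of-two page size. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uintptr_t host_page_size = 4096;
    uintptr_t address = 0x12345;
    uintptr_t start = address & -host_page_size;    /* round down to page start */
    uintptr_t last = start + host_page_size - 1;    /* last byte of that page */

    assert(start == 0x12000 && last == 0x12fff);
    return 0;
}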

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-2-richard.hender...@linaro.org>
---
 accel/tcg/user-exec.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 68b252cb8e..69b7429e31 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -651,16 +651,17 @@ void page_protect(tb_page_addr_t address)
 {
 PageFlagsNode *p;
 target_ulong start, last;
+int host_page_size = qemu_real_host_page_size();
 int prot;
 
 assert_memory_lock();
 
-if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+if (host_page_size <= TARGET_PAGE_SIZE) {
 start = address & TARGET_PAGE_MASK;
 last = start + TARGET_PAGE_SIZE - 1;
 } else {
-start = address & qemu_host_page_mask;
-last = start + qemu_host_page_size - 1;
+start = address & -host_page_size;
+last = start + host_page_size - 1;
 }
 
 p = pageflags_find(start, last);
@@ -671,7 +672,7 @@ void page_protect(tb_page_addr_t address)
 
 if (unlikely(p->itree.last < last)) {
 /* More than one protection region covers the one host page. */
-assert(TARGET_PAGE_SIZE < qemu_host_page_size);
+assert(TARGET_PAGE_SIZE < host_page_size);
 while ((p = pageflags_next(p, start, last)) != NULL) {
 prot |= p->flags;
 }
@@ -679,7 +680,7 @@ void page_protect(tb_page_addr_t address)
 
 if (prot & PAGE_WRITE) {
 pageflags_set_clear(start, last, 0, PAGE_WRITE);
-mprotect(g2h_untagged(start), qemu_host_page_size,
+mprotect(g2h_untagged(start), last - start + 1,
  prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
 }
 }
@@ -725,18 +726,19 @@ int page_unprotect(target_ulong address, uintptr_t pc)
 }
 #endif
 } else {
+int host_page_size = qemu_real_host_page_size();
 target_ulong start, len, i;
 int prot;
 
-if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+if (host_page_size <= TARGET_PAGE_SIZE) {
 start = address & TARGET_PAGE_MASK;
 len = TARGET_PAGE_SIZE;
 prot = p->flags | PAGE_WRITE;
 pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
 current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
 } else {
-start = address & qemu_host_page_mask;
-len = qemu_host_page_size;
+start = address & -host_page_size;
+len = host_page_size;
 prot = 0;
 
 for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
-- 
2.34.1




[PATCH v4 05/36] linux-user/hppa: Simplify init_guest_commpage

2024-02-14 Thread Richard Henderson
If reserved_va, then we have already reserved the entire
guest virtual address space; no need to remap page.
If !reserved_va, then use MAP_FIXED_NOREPLACE.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-6-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index dfb152bfcb..1893b3c192 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1970,16 +1970,20 @@ static inline void init_thread(struct target_pt_regs 
*regs,
 
 static bool init_guest_commpage(void)
 {
-void *want = g2h_untagged(LO_COMMPAGE);
-void *addr = mmap(want, qemu_host_page_size, PROT_NONE,
-  MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+/* If reserved_va, then we have already mapped 0 page on the host. */
+if (!reserved_va) {
+void *want, *addr;
 
-if (addr == MAP_FAILED) {
-perror("Allocating guest commpage");
-exit(EXIT_FAILURE);
-}
-if (addr != want) {
-return false;
+want = g2h_untagged(LO_COMMPAGE);
+addr = mmap(want, TARGET_PAGE_SIZE, PROT_NONE,
+MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED_NOREPLACE, -1, 0);
+if (addr == MAP_FAILED) {
+perror("Allocating guest commpage");
+exit(EXIT_FAILURE);
+}
+if (addr != want) {
+return false;
+}
 }
 
 /*
-- 
2.34.1




[PATCH v4 24/36] linux-user: Split out mmap_h_lt_g

2024-02-14 Thread Richard Henderson
Work much harder to get alignment and mapping beyond the end
of the file correct.  Both are exercised by our test-mmap
for alpha (8k pages) on any 4k page host.
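
The alignment half of the new code can be pictured in isolation: when the guest
page is larger than the host page, the host's mmap() only guarantees host-page
alignment, so the code over-allocates anonymously and keeps a guest-aligned slice
before remapping the file on top with MAP_FIXED.  A minimal sketch with
hypothetical sizes (8k guest page on a 4k host; not part of the patch):

/* Over-allocate by the worst-case slack, then take the aligned slice. */
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t guest_page = 8192, host_page = 4096, len = 8192;
    size_t host_len = len + guest_page - host_page;     /* worst-case slack */
    uint8_t *p = mmap(NULL, host_len, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        return 1;
    }
    uintptr_t aligned = ((uintptr_t)p + guest_page - 1) & -(uintptr_t)guest_page;
    /* [aligned, aligned + len) is guest-page aligned and lies within the
     * reservation; the real code maps the file there with MAP_FIXED. */
    printf("raw %p -> guest-aligned %#lx\n", (void *)p, (unsigned long)aligned);
    return 0;
}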

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-23-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 184 ++
 1 file changed, 153 insertions(+), 31 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index d3556bcc14..ff8f9f7ed0 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -569,6 +569,156 @@ static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong 
len,
 return mmap_end(start, last, start, last, flags, page_flags);
 }
 
+/*
+ * Special case host page size < target page size.
+ *
+ * The two special cases are increased guest alignment, and mapping
+ * past the end of a file.
+ *
+ * When mapping files into a memory area larger than the file,
+ * accesses to pages beyond the file size will cause a SIGBUS.
+ *
+ * For example, if mmaping a file of 100 bytes on a host with 4K
+ * pages emulating a target with 8K pages, the target expects to
+ * be able to access the first 8K. But the host will trap us on
+ * any access beyond 4K.
+ *
+ * When emulating a target with a larger page-size than the hosts,
+ * we may need to truncate file maps at EOF and add extra anonymous
+ * pages up to the targets page boundary.
+ *
+ * This workaround only works for files that do not change.
+ * If the file is later extended (e.g. ftruncate), the SIGBUS
+ * vanishes and the proper behaviour is that changes within the
+ * anon page should be reflected in the file.
+ *
+ * However, this case is rather common with executable images,
+ * so the workaround is important for even trivial tests, whereas
+ * the mmap of a file being extended is less common.
+ */
+static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong len, int host_prot,
+int mmap_flags, int page_flags, int fd,
+off_t offset, int host_page_size)
+{
+void *p, *want_p = g2h_untagged(start);
+off_t fileend_adj = 0;
+int flags = mmap_flags;
+abi_ulong last, pass_last;
+
+if (!(flags & MAP_ANONYMOUS)) {
+struct stat sb;
+
+if (fstat(fd, &sb) == -1) {
+return -1;
+}
+if (offset >= sb.st_size) {
+/*
+ * The entire map is beyond the end of the file.
+ * Transform it to an anonymous mapping.
+ */
+flags |= MAP_ANONYMOUS;
+fd = -1;
+offset = 0;
+} else if (offset + len > sb.st_size) {
+/*
+ * A portion of the map is beyond the end of the file.
+ * Truncate the file portion of the allocation.
+ */
+fileend_adj = offset + len - sb.st_size;
+}
+}
+
+if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+if (fileend_adj) {
+p = mmap(want_p, len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
+} else {
+p = mmap(want_p, len, host_prot, flags, fd, offset);
+}
+if (p != want_p) {
+if (p != MAP_FAILED) {
+/* Host does not support MAP_FIXED_NOREPLACE: emulate. */
+do_munmap(p, len);
+errno = EEXIST;
+}
+return -1;
+}
+
+if (fileend_adj) {
+void *t = mmap(p, len - fileend_adj, host_prot,
+   (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED,
+   fd, offset);
+
+if (t == MAP_FAILED) {
+int save_errno = errno;
+
+/*
+ * We failed a map over the top of the successful anonymous
+ * mapping above. The only failure mode is running out of VMAs,
+ * and there's nothing that we can do to detect that earlier.
+ * If we have replaced an existing mapping with MAP_FIXED,
+ * then we cannot properly recover.  It's a coin toss whether
+ * it would be better to exit or continue here.
+ */
+if (!(flags & MAP_FIXED_NOREPLACE) &&
+!page_check_range_empty(start, start + len - 1)) {
+qemu_log("QEMU target_mmap late failure: %s",
+ strerror(save_errno));
+}
+
+do_munmap(want_p, len);
+errno = save_errno;
+return -1;
+}
+}
+} else {
+size_t host_len, part_len;
+
+/*
+ * Take care to align the host memory.  Perform a larger anonymous
+ * allocation and extract the aligned portion.  Remap the file on
+ * top of that.
+ */
+host_len = len + TARGET_PAGE_SIZE - host_page_size;
+p = mmap(want_p, host_len, host_prot, flags | MAP_ANONYMOUS, 

[PATCH v4 09/36] linux-user: Remove REAL_HOST_PAGE_ALIGN from mmap.c

2024-02-14 Thread Richard Henderson
We already have qemu_real_host_page_size() in a local variable.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-10-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 4d3c8717b9..53e5486cc8 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -585,7 +585,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  * the hosts real pagesize. Additional anonymous maps
  * will be created beyond EOF.
  */
-len = REAL_HOST_PAGE_ALIGN(sb.st_size - offset);
+len = ROUND_UP(sb.st_size - offset, host_page_size);
 }
 }
 
-- 
2.34.1




[PATCH v4 33/36] linux-user: Bound mmap_min_addr by host page size

2024-02-14 Thread Richard Henderson
Bizarrely, it is possible to set /proc/sys/vm/mmap_min_addr
to a value below the host page size.  Fix that.

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-32-richard.hender...@linaro.org>
---
 linux-user/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index 12bb839982..551acf1661 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -901,7 +901,7 @@ int main(int argc, char **argv, char **envp)
 if ((fp = fopen("/proc/sys/vm/mmap_min_addr", "r")) != NULL) {
 unsigned long tmp;
if (fscanf(fp, "%lu", &tmp) == 1 && tmp != 0) {
-mmap_min_addr = tmp;
+mmap_min_addr = MAX(tmp, host_page_size);
 qemu_log_mask(CPU_LOG_PAGE, "host mmap_min_addr=0x%lx\n",
   mmap_min_addr);
 }
-- 
2.34.1




[PATCH v4 21/36] linux-user: Split out do_munmap

2024-02-14 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 linux-user/mmap.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 1bbfeb25b1..8ebcca 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -267,6 +267,21 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 return ret;
 }
 
+/*
+ * Perform munmap on behalf of the target, with host parameters.
+ * If reserved_va, we must replace the memory reservation.
+ */
+static int do_munmap(void *addr, size_t len)
+{
+if (reserved_va) {
+void *ptr = mmap(addr, len, PROT_NONE,
+ MAP_FIXED | MAP_ANONYMOUS
+ | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
+return ptr == addr ? 0 : -1;
+}
+return munmap(addr, len);
+}
+
 /* map an incomplete host page */
 static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
   int prot, int flags, int fd, off_t offset)
@@ -854,13 +869,7 @@ static int mmap_reserve_or_unmap(abi_ulong start, 
abi_ulong len)
 real_len = real_last - real_start + 1;
 host_start = g2h_untagged(real_start);
 
-if (reserved_va) {
-void *ptr = mmap(host_start, real_len, PROT_NONE,
- MAP_FIXED | MAP_ANONYMOUS
- | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
-return ptr == host_start ? 0 : -1;
-}
-return munmap(host_start, real_len);
+return do_munmap(host_start, real_len);
 }
 
 int target_munmap(abi_ulong start, abi_ulong len)
-- 
2.34.1




[PATCH v4 00/36] linux-user: Improve host and guest page size handling

2024-02-14 Thread Richard Henderson
Changes for v4:
  * Split out do_munmap.
  * Incorporate review feedback.

Blurb from v1:

While working on mmap issues for 8.1, I noticed a lot of corner
cases of host != guest page size that we implement poorly.
This seems to be particularly visible on Apple M1 with 16k pages,
more so than Power with 64k pages for some reason.

Objective 1 is to deprecate and (essentially) disable the -p option.

The effect of -p is apparently confusing, so much so that our own
testsuite misuses it.  One cannot really change the host page size,
and pretending otherwise means that we don't treat the host memory
system correctly, and stuff breaks.

I have not yet done the same work for bsd-user.

Objective 2 is to allow the guest page size to change to match the host.

There are corner cases of host != guest page size that will fail in odd ways.
For the case of host > guest page size, the issues could be solved with
softmmu, allowing a non-linear mapping between host and guest addresses
and also disconnecting host and guest page permissions.

However, host < guest page has issues with SIGBUS which I believe to be
totally unfixable.  At minimum one would need to monitor changes to all
files mapped in the address space, but I'm sure there is much more.

But as always the best behaviour is obtained when the host and guest
page sizes match -- there are no corner cases to contend with.

There is a set of guests which can be configured to use multiple page
sizes, and therefore software developed for those guests (usually) does
not hard-code a particular page size.  For those, we can allow the
page size to vary and let the guest match the host.

I have only changed aarch64, alpha and ppc guests so far, as those
are both easy to test and, especially for the case of alpha's default
8k page size, prone to failure.


r~


Richard Henderson (36):
  accel/tcg: Remove qemu_host_page_size from page_protect/page_unprotect
  linux-user: Adjust SVr4 NULL page mapping
  linux-user: Remove qemu_host_page_{size, mask} in probe_guest_base
  linux-user: Remove qemu_host_page_size from create_elf_tables
  linux-user/hppa: Simplify init_guest_commpage
  linux-user/nios2: Remove qemu_host_page_size from init_guest_commpage
  linux-user/arm: Remove qemu_host_page_size from init_guest_commpage
  linux-user: Remove qemu_host_page_{size, mask} from mmap.c
  linux-user: Remove REAL_HOST_PAGE_ALIGN from mmap.c
  linux-user: Remove HOST_PAGE_ALIGN from mmap.c
  migration: Remove qemu_host_page_size
  hw/tpm: Remove HOST_PAGE_ALIGN from tpm_ppi_init
  softmmu/physmem: Remove qemu_host_page_size
  softmmu/physmem: Remove HOST_PAGE_ALIGN
  linux-user: Remove qemu_host_page_size from main
  linux-user: Split out target_mmap__locked
  linux-user: Move some mmap checks outside the lock
  linux-user: Fix sub-host-page mmap
  linux-user: Split out mmap_end
  linux-user: Do early mmap placement only for reserved_va
  linux-user: Split out do_munmap
  linux-user: Use do_munmap for target_mmap failure
  linux-user: Split out mmap_h_eq_g
  linux-user: Split out mmap_h_lt_g
  linux-user: Split out mmap_h_gt_g
  tests/tcg: Remove run-test-mmap-*
  tests/tcg: Extend file in linux-madvise.c
  *-user: Deprecate and disable -p pagesize
  cpu: Remove page_size_init
  accel/tcg: Disconnect TargetPageDataNode from page size
  linux-user: Allow TARGET_PAGE_BITS_VARY
  target/arm: Enable TARGET_PAGE_BITS_VARY for AArch64 user-only
  linux-user: Bound mmap_min_addr by host page size
  target/ppc: Enable TARGET_PAGE_BITS_VARY for user-only
  target/alpha: Enable TARGET_PAGE_BITS_VARY for user-only
  linux-user: Remove pgb_dynamic alignment assertion

 docs/about/deprecated.rst |  10 +
 docs/user/main.rst|   3 -
 bsd-user/qemu.h   |   7 +
 include/exec/cpu-common.h |   7 -
 include/hw/core/cpu.h |   2 -
 target/alpha/cpu-param.h  |  16 +-
 target/arm/cpu-param.h|   6 +-
 target/ppc/cpu-param.h|   9 +-
 accel/tcg/translate-all.c |   1 -
 accel/tcg/user-exec.c |  31 +-
 bsd-user/main.c   |  22 +-
 cpu-target.c  |  13 -
 hw/tpm/tpm_ppi.c  |   3 +-
 linux-user/elfload.c  |  68 +-
 linux-user/main.c |  34 +-
 linux-user/mmap.c | 767 +-
 migration/ram.c   |  22 +-
 system/physmem.c  |  17 +-
 system/vl.c   |   1 -
 target/arm/cpu.c  |  51 +-
 tests/tcg/multiarch/linux/linux-madvise.c |   2 +
 tests/tcg/alpha/Makefile.target   |   3 -
 tests/tcg/arm/Makefile.target |   3 -
 tests/tcg/hppa/Makefile.target|   3 -
 tests/tcg/i386/Makefile.target|   3 -
 tests/tcg/m68k/Makefile.target|   3 -
 

[PATCH v4 25/36] linux-user: Split out mmap_h_gt_g

2024-02-14 Thread Richard Henderson
Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-24-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 288 ++
 1 file changed, 139 insertions(+), 149 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index ff8f9f7ed0..82f4026283 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -282,7 +282,16 @@ static int do_munmap(void *addr, size_t len)
 return munmap(addr, len);
 }
 
-/* map an incomplete host page */
+/*
+ * Map an incomplete host page.
+ *
+ * Here be dragons.  This case will not work if there is an existing
+ * overlapping host page, which is file mapped, and for which the mapping
+ * is beyond the end of the file.  In that case, we will see SIGBUS when
+ * trying to write a portion of this page.
+ *
+ * FIXME: Work around this with a temporary signal handler and longjmp.
+ */
 static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
   int prot, int flags, int fd, off_t offset)
 {
@@ -719,19 +728,138 @@ static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong 
len, int host_prot,
 return mmap_end(start, last, start, pass_last, mmap_flags, page_flags);
 }
 
+/*
+ * Special case host page size > target page size.
+ *
+ * The two special cases are address and file offsets that are valid
+ * for the guest that cannot be directly represented by the host.
+ */
+static abi_long mmap_h_gt_g(abi_ulong start, abi_ulong len,
+int target_prot, int host_prot,
+int flags, int page_flags, int fd,
+off_t offset, int host_page_size)
+{
+void *p, *want_p = g2h_untagged(start);
+off_t host_offset = offset & -host_page_size;
+abi_ulong last, real_start, real_last;
+bool misaligned_offset = false;
+size_t host_len;
+
+if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
+/*
+ * Adjust the offset to something representable on the host.
+ */
+host_len = len + offset - host_offset;
+p = mmap(want_p, host_len, host_prot, flags, fd, host_offset);
+if (p == MAP_FAILED) {
+return -1;
+}
+
+/* Update start to the file position at offset. */
+p += offset - host_offset;
+
+start = h2g(p);
+last = start + len - 1;
+return mmap_end(start, last, start, last, flags, page_flags);
+}
+
+if (!(flags & MAP_ANONYMOUS)) {
+misaligned_offset = (start ^ offset) & (host_page_size - 1);
+
+/*
+ * The fallback for misalignment is a private mapping + read.
+ * This carries none of semantics required of MAP_SHARED.
+ */
+if (misaligned_offset && (flags & MAP_TYPE) != MAP_PRIVATE) {
+errno = EINVAL;
+return -1;
+}
+}
+
+last = start + len - 1;
+real_start = start & -host_page_size;
+real_last = ROUND_UP(last, host_page_size) - 1;
+
+/*
+ * Handle the start and end of the mapping.
+ */
+if (real_start < start) {
+abi_ulong real_page_last = real_start + host_page_size - 1;
+if (last <= real_page_last) {
+/* Entire allocation a subset of one host page. */
+if (!mmap_frag(real_start, start, last, target_prot,
+   flags, fd, offset)) {
+return -1;
+}
+return mmap_end(start, last, -1, 0, flags, page_flags);
+}
+
+if (!mmap_frag(real_start, start, real_page_last, target_prot,
+   flags, fd, offset)) {
+return -1;
+}
+real_start = real_page_last + 1;
+}
+
+if (last < real_last) {
+abi_ulong real_page_start = real_last - host_page_size + 1;
+if (!mmap_frag(real_page_start, real_page_start, last,
+   target_prot, flags, fd,
+   offset + real_page_start - start)) {
+return -1;
+}
+real_last = real_page_start - 1;
+}
+
+if (real_start > real_last) {
+return mmap_end(start, last, -1, 0, flags, page_flags);
+}
+
+/*
+ * Handle the middle of the mapping.
+ */
+
+host_len = real_last - real_start + 1;
+want_p += real_start - start;
+
+if (flags & MAP_ANONYMOUS) {
+p = mmap(want_p, host_len, host_prot, flags, -1, 0);
+} else if (!misaligned_offset) {
+p = mmap(want_p, host_len, host_prot, flags, fd,
+ offset + real_start - start);
+} else {
+p = mmap(want_p, host_len, host_prot | PROT_WRITE,
+ flags | MAP_ANONYMOUS, -1, 0);
+}
+if (p != want_p) {
+if (p != MAP_FAILED) {
+do_munmap(p, host_len);
+errno = EEXIST;
+}
+return -1;
+}
+
+if (misaligned_offset) {
+/* TODO: The read could be short. */
+if 

[PATCH v4 03/36] linux-user: Remove qemu_host_page_{size, mask} in probe_guest_base

2024-02-14 Thread Richard Henderson
The host SHMLBA is by definition a multiple of the host page size.
Thus the remaining component of qemu_host_page_size is the
target page size.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-4-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index e918a13748..e84a201448 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2893,7 +2893,7 @@ static bool pgb_addr_set(PGBAddrs *ga, abi_ulong 
guest_loaddr,
 
 /* Add any HI_COMMPAGE not covered by reserved_va. */
 if (reserved_va < HI_COMMPAGE) {
-ga->bounds[n][0] = HI_COMMPAGE & qemu_host_page_mask;
+ga->bounds[n][0] = HI_COMMPAGE & qemu_real_host_page_mask();
 ga->bounds[n][1] = HI_COMMPAGE + TARGET_PAGE_SIZE - 1;
 n++;
 }
@@ -3075,7 +3075,7 @@ void probe_guest_base(const char *image_name, abi_ulong 
guest_loaddr,
   abi_ulong guest_hiaddr)
 {
 /* In order to use host shmat, we must be able to honor SHMLBA.  */
-uintptr_t align = MAX(SHMLBA, qemu_host_page_size);
+uintptr_t align = MAX(SHMLBA, TARGET_PAGE_SIZE);
 
 /* Sanity check the guest binary. */
 if (reserved_va) {
-- 
2.34.1
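
To spell out the arithmetic behind the commit message, under the assumption
that qemu_host_page_size was historically computed as the maximum of the
real host page size and TARGET_PAGE_SIZE (an illustrative derivation, not
part of the patch):

    MAX(SHMLBA, qemu_host_page_size)
        = MAX(SHMLBA, MAX(real_host_page_size, TARGET_PAGE_SIZE))
        = MAX(SHMLBA, TARGET_PAGE_SIZE)
          /* SHMLBA is a multiple of the host page size, so it already
             covers real_host_page_size */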




Re: [PATCH v3 5/6] target/riscv: Gate hardware A/D PTE bit updating

2024-02-14 Thread Alistair Francis
On Sat, Feb 3, 2024 at 1:22 AM Daniel Henrique Barboza
 wrote:
>
> From: Andrew Jones 
>
> Gate hardware A/D PTE bit updating on {m,h}envcfg.ADUE and only
> enable menvcfg.ADUE on reset if svade has not been selected. Now
> that we also consider svade, we have four possible configurations:
>
>  1) !svade && !svadu
> use hardware updating and there's no way to disable it
> (the default, which maintains past behavior. Maintaining
>  the default, even with !svadu is a change that fixes [1])
>
>  2) !svade && svadu
> use hardware updating, but also provide {m,h}envcfg.ADUE,
> allowing software to switch to exception mode
> (being able to switch is a change which fixes [1])
>
>  3) svade && !svadu
> use exception mode and there's no way to switch to hardware
> updating
> (this behavior change fixes [2])
>
>  4) svade && svadu
> use exception mode, but also provide {m,h}envcfg.ADUE,
> allowing software to switch to hardware updating
> (this behavior change fixes [2])
>
> Fixes: 0af3f115e68e ("target/riscv: Add *envcfg.HADE related check in address 
> translation") [1]
> Fixes: 48531f5adb2a ("target/riscv: implement svade") [2]
> Reviewed-by: Daniel Henrique Barboza 
> Signed-off-by: Andrew Jones 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c |  3 ++-
>  target/riscv/cpu_helper.c  | 19 +++
>  target/riscv/tcg/tcg-cpu.c | 15 +--
>  3 files changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 9045f87481..50ac7845a8 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -960,7 +960,8 @@ static void riscv_cpu_reset_hold(Object *obj)
>  env->two_stage_lookup = false;
>
>  env->menvcfg = (cpu->cfg.ext_svpbmt ? MENVCFG_PBMTE : 0) |
> -   (cpu->cfg.ext_svadu ? MENVCFG_ADUE : 0);
> +   (!cpu->cfg.ext_svade && cpu->cfg.ext_svadu ?
> +MENVCFG_ADUE : 0);
>  env->henvcfg = 0;
>
>  /* Initialized default priorities of local interrupts. */
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index 8da9104da4..3a440833f8 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -907,7 +907,9 @@ static int get_physical_address(CPURISCVState *env, 
> hwaddr *physical,
>  }
>
>  bool pbmte = env->menvcfg & MENVCFG_PBMTE;
> -bool adue = env->menvcfg & MENVCFG_ADUE;
> +bool svade = riscv_cpu_cfg(env)->ext_svade;
> +bool svadu = riscv_cpu_cfg(env)->ext_svadu;
> +bool adue = svadu ? env->menvcfg & MENVCFG_ADUE : !svade;
>
>  if (first_stage && two_stage && env->virt_enabled) {
>  pbmte = pbmte && (env->henvcfg & HENVCFG_PBMTE);
> @@ -1082,9 +1084,18 @@ restart:
>  return TRANSLATE_FAIL;
>  }
>
> -/* If necessary, set accessed and dirty bits. */
> -target_ulong updated_pte = pte | PTE_A |
> -(access_type == MMU_DATA_STORE ? PTE_D : 0);
> +target_ulong updated_pte = pte;
> +
> +/*
> + * If ADUE is enabled, set accessed and dirty bits.
> + * Otherwise raise an exception if necessary.
> + */
> +if (adue) {
> +updated_pte |= PTE_A | (access_type == MMU_DATA_STORE ? PTE_D : 0);
> +} else if (!(pte & PTE_A) ||
> +   (access_type == MMU_DATA_STORE && !(pte & PTE_D))) {
> +return TRANSLATE_FAIL;
> +}
>
>  /* Page table updates need to be atomic with MTTCG enabled */
>  if (updated_pte != pte && !is_debug) {
> diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
> index 673097c6e4..43c32b4a15 100644
> --- a/target/riscv/tcg/tcg-cpu.c
> +++ b/target/riscv/tcg/tcg-cpu.c
> @@ -196,17 +196,14 @@ static bool cpu_cfg_offset_is_named_feat(uint32_t 
> ext_offset)
>
>  static void riscv_cpu_enable_named_feat(RISCVCPU *cpu, uint32_t feat_offset)
>  {
> -switch (feat_offset) {
> -case CPU_CFG_OFFSET(ext_zic64b):
> + /*
> +  * All other named features are already enabled
> +  * in riscv_tcg_cpu_instance_init().
> +  */
> +if (feat_offset == CPU_CFG_OFFSET(ext_zic64b)) {
>  cpu->cfg.cbom_blocksize = 64;
>  cpu->cfg.cbop_blocksize = 64;
>  cpu->cfg.cboz_blocksize = 64;
> -break;
> -case CPU_CFG_OFFSET(ext_svade):
> -cpu->cfg.ext_svadu = false;
> -break;
> -default:
> -g_assert_not_reached();
>  }
>  }
>
> @@ -348,8 +345,6 @@ static void riscv_cpu_update_named_features(RISCVCPU *cpu)
>  cpu->cfg.ext_zic64b = cpu->cfg.cbom_blocksize == 64 &&
>cpu->cfg.cbop_blocksize == 64 &&
>cpu->cfg.cboz_blocksize == 64;
> -
> -cpu->cfg.ext_svade = !cpu->cfg.ext_svadu;
>  }
>
>  static void riscv_cpu_validate_g(RISCVCPU *cpu)
> --
> 2.43.0
>
>
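
For readers mapping the four configurations listed in the commit message
onto the hunks quoted above, the effective behaviour can be summarized as
follows (an illustrative table only, not part of the patch):

    svade  svadu   hardware A/D updating (adue)
    -----  -----   ----------------------------
      0      0     always on  (menvcfg.ADUE not writable)
      0      1     controlled by menvcfg.ADUE, reset value 1
      1      0     always off (menvcfg.ADUE not writable)
      1      1     controlled by menvcfg.ADUE, reset value 0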



Re: [PATCH v3 6/6] target/riscv: Promote svade to a normal extension

2024-02-14 Thread Alistair Francis
On Sat, Feb 3, 2024 at 1:24 AM Daniel Henrique Barboza
 wrote:
>
> From: Andrew Jones 
>
> Named features are extensions which don't make sense for users to
> control and are therefore not exposed on the command line. However,
> svade is an extension which makes sense for users to control, so treat
> it like a "normal" extension. The default is false, even for the max
> cpu type, since QEMU has always implemented hardware A/D PTE bit
> updating, so users must opt into svade (or get it from a CPU type
> which enables it by default).
>
> Reviewed-by: Daniel Henrique Barboza 
> Signed-off-by: Andrew Jones 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c | 9 ++---
>  target/riscv/tcg/tcg-cpu.c | 6 ++
>  2 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 50ac7845a8..f036b153a1 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -1422,6 +1422,7 @@ const RISCVCPUMultiExtConfig riscv_cpu_extensions[] = {
>
>  MULTI_EXT_CFG_BOOL("smepmp", ext_smepmp, false),
>  MULTI_EXT_CFG_BOOL("smstateen", ext_smstateen, false),
> +MULTI_EXT_CFG_BOOL("svade", ext_svade, false),
>  MULTI_EXT_CFG_BOOL("svadu", ext_svadu, true),
>  MULTI_EXT_CFG_BOOL("svinval", ext_svinval, false),
>  MULTI_EXT_CFG_BOOL("svnapot", ext_svnapot, false),
> @@ -1534,7 +1535,6 @@ const RISCVCPUMultiExtConfig 
> riscv_cpu_experimental_exts[] = {
>   * and priv_ver like regular extensions.
>   */
>  const RISCVCPUMultiExtConfig riscv_cpu_named_features[] = {
> -MULTI_EXT_CFG_BOOL("svade", ext_svade, true),
>  MULTI_EXT_CFG_BOOL("zic64b", ext_zic64b, true),
>
>  /*
> @@ -2182,8 +2182,6 @@ static RISCVCPUProfile RVA22U64 = {
>   * Other named features that we already implement: Sstvecd, Sstvala,
>   * Sscounterenw
>   *
> - * Named features that we need to enable: svade
> - *
>   * The remaining features/extensions comes from RVA22U64.
>   */
>  static RISCVCPUProfile RVA22S64 = {
> @@ -2195,10 +2193,7 @@ static RISCVCPUProfile RVA22S64 = {
>  .ext_offsets = {
>  /* rva22s64 exts */
>  CPU_CFG_OFFSET(ext_zifencei), CPU_CFG_OFFSET(ext_svpbmt),
> -CPU_CFG_OFFSET(ext_svinval),
> -
> -/* rva22s64 named features */
> -CPU_CFG_OFFSET(ext_svade),
> +CPU_CFG_OFFSET(ext_svinval), CPU_CFG_OFFSET(ext_svade),
>
>  RISCV_PROFILE_EXT_LIST_END
>  }
> diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
> index 43c32b4a15..9fc64979f1 100644
> --- a/target/riscv/tcg/tcg-cpu.c
> +++ b/target/riscv/tcg/tcg-cpu.c
> @@ -1314,6 +1314,12 @@ static void riscv_init_max_cpu_extensions(Object *obj)
>  isa_ext_update_enabled(cpu, prop->offset, true);
>  }
>
> +/*
> + * Some extensions can't be added without backward compatibilty concerns.
> + * Disable those, the user can still opt in to them on the command line.
> + */
> +cpu->cfg.ext_svade = false;
> +
>  /* set vector version */
>  env->vext_ver = VEXT_VERSION_1_00_0;
>
> --
> 2.43.0
>
>



Re: [PATCH v3 4/6] target/riscv: Reset henvcfg to zero

2024-02-14 Thread Alistair Francis
On Sat, Feb 3, 2024 at 1:23 AM Daniel Henrique Barboza
 wrote:
>
> From: Andrew Jones 
>
> The hypervisor should decide what it wants to enable. Zero all
> configuration enable bits on reset.
>
> Also, commit ed67d63798f2 ("target/riscv: Update CSR bits name for
> svadu extension") missed one reference to 'hade'. Change it now.
>
> Fixes: 0af3f115e68e ("target/riscv: Add *envcfg.HADE related check in address 
> translation")
> Fixes: ed67d63798f2 ("target/riscv: Update CSR bits name for svadu extension")
> Reviewed-by: Daniel Henrique Barboza 
> Signed-off-by: Andrew Jones 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c | 3 +--
>  target/riscv/csr.c | 2 +-
>  2 files changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 94843c4f6e..9045f87481 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -961,8 +961,7 @@ static void riscv_cpu_reset_hold(Object *obj)
>
>  env->menvcfg = (cpu->cfg.ext_svpbmt ? MENVCFG_PBMTE : 0) |
> (cpu->cfg.ext_svadu ? MENVCFG_ADUE : 0);
> -env->henvcfg = (cpu->cfg.ext_svpbmt ? HENVCFG_PBMTE : 0) |
> -   (cpu->cfg.ext_svadu ? HENVCFG_ADUE : 0);
> +env->henvcfg = 0;
>
>  /* Initialized default priorities of local interrupts. */
>  for (i = 0; i < ARRAY_SIZE(env->miprio); i++) {
> diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> index d9a010387f..93f7bc2cb4 100644
> --- a/target/riscv/csr.c
> +++ b/target/riscv/csr.c
> @@ -2115,7 +2115,7 @@ static RISCVException read_henvcfg(CPURISCVState *env, 
> int csrno,
>  /*
>   * henvcfg.pbmte is read_only 0 when menvcfg.pbmte = 0
>   * henvcfg.stce is read_only 0 when menvcfg.stce = 0
> - * henvcfg.hade is read_only 0 when menvcfg.hade = 0
> + * henvcfg.adue is read_only 0 when menvcfg.adue = 0
>   */
>  *val = env->henvcfg & (~(HENVCFG_PBMTE | HENVCFG_STCE | HENVCFG_ADUE) |
> env->menvcfg);
> --
> 2.43.0
>
>



Re: [PATCH 6/6] tests/libqos: add riscv/virt machine nodes

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:18 AM Daniel Henrique Barboza
 wrote:
>
> Add a RISC-V 'virt' machine to the graph. This implementation is a
> modified copy of the existing arm machine in arm-virt-machine.c
>
> It contains a virtio-mmio and a generic-pcihost controller. The
> generic-pcihost controller hardcodes assumptions from the ARM 'virt'
> machine, like ecam and pio_base addresses, so we'll add an extra step to
> set its parameters after creating it.
>
> Our command line is extended with 'aclint' parameters to allow the
> machine to run MSI tests.
>
> Signed-off-by: Daniel Henrique Barboza 

Acked-by: Alistair Francis 

Alistair

> ---
>  tests/qtest/libqos/meson.build  |   1 +
>  tests/qtest/libqos/riscv-virt-machine.c | 137 
>  2 files changed, 138 insertions(+)
>  create mode 100644 tests/qtest/libqos/riscv-virt-machine.c
>
> diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build
> index 90aae42a22..3aed6efcb8 100644
> --- a/tests/qtest/libqos/meson.build
> +++ b/tests/qtest/libqos/meson.build
> @@ -60,6 +60,7 @@ libqos_srcs = files(
>  'arm-xilinx-zynq-a9-machine.c',
>  'ppc64_pseries-machine.c',
>  'x86_64_pc-machine.c',
> +'riscv-virt-machine.c',
>  )
>
>  if have_virtfs
> diff --git a/tests/qtest/libqos/riscv-virt-machine.c 
> b/tests/qtest/libqos/riscv-virt-machine.c
> new file mode 100644
> index 00..c4364c9c5d
> --- /dev/null
> +++ b/tests/qtest/libqos/riscv-virt-machine.c
> @@ -0,0 +1,137 @@
> +/*
> + * libqos driver framework for risc-v
> + *
> + * Initial version based on arm-virt-machine.c
> + *
> + * Copyright (c) 2024 Ventana Micro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License version 2.1 as published by the Free Software Foundation.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see 
> 
> + */
> +
> +#include "qemu/osdep.h"
> +#include "../libqtest.h"
> +#include "qemu/module.h"
> +#include "libqos-malloc.h"
> +#include "qgraph.h"
> +#include "virtio-mmio.h"
> +#include "generic-pcihost.h"
> +#include "hw/pci/pci_regs.h"
> +
> +#define RISCV_PAGE_SIZE4096
> +
> +/* VIRT_DRAM */
> +#define RISCV_VIRT_RAM_ADDR0x8000
> +#define RISCV_VIRT_RAM_SIZE0x2000
> +
> +/*
> + * VIRT_VIRTIO. BASE_ADDR  points to the last
> + * virtio_mmio device.
> + */
> +#define VIRTIO_MMIO_BASE_ADDR  0x10008000
> +#define VIRTIO_MMIO_SIZE   0x1000
> +
> +/* VIRT_PCIE_PIO  */
> +#define RISCV_GPEX_PIO_BASE0x300
> +#define RISCV_BUS_PIO_LIMIT0x1
> +
> +/* VIRT_PCIE_MMIO */
> +#define RISCV_BUS_MMIO_ALLOC_PTR   0x4000
> +#define RISCV_BUS_MMIO_LIMIT   0x8000
> +
> +/* VIRT_PCIE_ECAM */
> +#define RISCV_ECAM_ALLOC_PTR   0x3000
> +
> +typedef struct QVirtMachine QVirtMachine;
> +
> +struct QVirtMachine {
> +QOSGraphObject obj;
> +QGuestAllocator alloc;
> +QVirtioMMIODevice virtio_mmio;
> +QGenericPCIHost bridge;
> +};
> +
> +static void virt_destructor(QOSGraphObject *obj)
> +{
> +QVirtMachine *machine = (QVirtMachine *) obj;
> +alloc_destroy(&machine->alloc);
> +}
> +
> +static void *virt_get_driver(void *object, const char *interface)
> +{
> +QVirtMachine *machine = object;
> +if (!g_strcmp0(interface, "memory")) {
> +return >alloc;
> +}
> +
> +fprintf(stderr, "%s not present in riscv/virtio\n", interface);
> +g_assert_not_reached();
> +}
> +
> +static QOSGraphObject *virt_get_device(void *obj, const char *device)
> +{
> +QVirtMachine *machine = obj;
> +if (!g_strcmp0(device, "generic-pcihost")) {
> +return &machine->bridge.obj;
> +} else if (!g_strcmp0(device, "virtio-mmio")) {
> +return &machine->virtio_mmio.obj;
> +}
> +
> +fprintf(stderr, "%s not present in riscv/virt\n", device);
> +g_assert_not_reached();
> +}
> +
> +static void riscv_config_qpci_bus(QGenericPCIBus *qpci)
> +{
> +qpci->gpex_pio_base = RISCV_GPEX_PIO_BASE;
> +qpci->bus.pio_limit = RISCV_BUS_PIO_LIMIT;
> +
> +qpci->bus.mmio_alloc_ptr = RISCV_BUS_MMIO_ALLOC_PTR;
> +qpci->bus.mmio_limit = RISCV_BUS_MMIO_LIMIT;
> +
> +qpci->ecam_alloc_ptr = RISCV_ECAM_ALLOC_PTR;
> +}
> +
> +static void *qos_create_machine_riscv_virt(QTestState *qts)
> +{
> +QVirtMachine *machine = g_new0(QVirtMachine, 1);
> +
> +alloc_init(&machine->alloc, 0,
> +   RISCV_VIRT_RAM_ADDR,
> +   RISCV_VIRT_RAM_ADDR + RISCV_VIRT_RAM_SIZE,
> +   RISCV_PAGE_SIZE);
> +qvirtio_mmio_init_device(&machine->virtio_mmio, 

Re: [PATCH 5/6] hw/riscv/virt.c: make aclint compatible with 'qtest' accel

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:18 AM Daniel Henrique Barboza
 wrote:
>
> The 'virt' machine makes assumptions about the Advanced Core-Local
> Interruptor, or aclint, based on 'tcg_enabled()' conditionals.  This
> will impact MSI-related test support when adding a RISC-V 'virt' libqos
> machine. The accelerator used in that case, 'qtest', isn't being
> accounted for and we'll error out if we try to enable aclint.
>
> Create a new virt_aclint_allowed() helper to gate the aclint code
> considering both TCG and 'qtest' accelerators. The error message is
> left untouched, mentioning TCG only, because we don't expect the
> regular user to be aware of 'qtest'.
>
> We want to add 'qtest' support for aclint only, leaving the TCG-specific
> bits out of it. This is done by changing the format we use today:
>
> if (tcg_enabled()) {
>if (s->have_aclint) { - aclint logic - }
>else { - non-aclint, TCG logic - }
> }
>
> into:
>
> if (virt_aclint_allowed() && s->have_aclint) {
>  - aclint logic -
> } else if (tcg_enabled()) {
>  - non-aclint, TCG logic -
> }
>
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/virt.c | 52 +
>  1 file changed, 27 insertions(+), 25 deletions(-)
>
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index 54ad809b44..a094af97c3 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -48,6 +48,7 @@
>  #include "sysemu/tcg.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/tpm.h"
> +#include "sysemu/qtest.h"
>  #include "hw/pci/pci.h"
>  #include "hw/pci-host/gpex.h"
>  #include "hw/display/ramfb.h"
> @@ -61,6 +62,11 @@ static bool virt_use_kvm_aia(RISCVVirtState *s)
>  return kvm_irqchip_in_kernel() && s->aia_type == 
> VIRT_AIA_TYPE_APLIC_IMSIC;
>  }
>
> +static bool virt_aclint_allowed(void)
> +{
> +return tcg_enabled() || qtest_enabled();
> +}
> +
>  static const MemMapEntry virt_memmap[] = {
>  [VIRT_DEBUG] ={0x0, 0x100 },
>  [VIRT_MROM] = { 0x1000,0xf000 },
> @@ -725,14 +731,12 @@ static void create_fdt_sockets(RISCVVirtState *s, const 
> MemMapEntry *memmap,
>
>  create_fdt_socket_memory(s, memmap, socket);
>
> -if (tcg_enabled()) {
> -if (s->have_aclint) {
> -create_fdt_socket_aclint(s, memmap, socket,
> -_phandles[phandle_pos]);
> -} else {
> -create_fdt_socket_clint(s, memmap, socket,
> -_phandles[phandle_pos]);
> -}
> +if (virt_aclint_allowed() && s->have_aclint) {
> +create_fdt_socket_aclint(s, memmap, socket,
> + _phandles[phandle_pos]);
> +} else if (tcg_enabled()) {
> +create_fdt_socket_clint(s, memmap, socket,
> +_phandles[phandle_pos]);
>  }
>  }
>
> @@ -1409,7 +1413,7 @@ static void virt_machine_init(MachineState *machine)
>  exit(1);
>  }
>
> -if (!tcg_enabled() && s->have_aclint) {
> +if (!virt_aclint_allowed() && s->have_aclint) {
>  error_report("'aclint' is only available with TCG acceleration");
>  exit(1);
>  }
> @@ -1446,23 +1450,22 @@ static void virt_machine_init(MachineState *machine)
>  hart_count, _abort);
>  sysbus_realize(SYS_BUS_DEVICE(>soc[i]), _fatal);
>
> -if (tcg_enabled()) {
> -if (s->have_aclint) {
> -if (s->aia_type == VIRT_AIA_TYPE_APLIC_IMSIC) {
> -/* Per-socket ACLINT MTIMER */
> -riscv_aclint_mtimer_create(memmap[VIRT_CLINT].base +
> +if (virt_aclint_allowed() && s->have_aclint) {
> +if (s->aia_type == VIRT_AIA_TYPE_APLIC_IMSIC) {
> +/* Per-socket ACLINT MTIMER */
> +riscv_aclint_mtimer_create(memmap[VIRT_CLINT].base +
>  i * RISCV_ACLINT_DEFAULT_MTIMER_SIZE,
>  RISCV_ACLINT_DEFAULT_MTIMER_SIZE,
>  base_hartid, hart_count,
>  RISCV_ACLINT_DEFAULT_MTIMECMP,
>  RISCV_ACLINT_DEFAULT_MTIME,
>  RISCV_ACLINT_DEFAULT_TIMEBASE_FREQ, true);
> -} else {
> -/* Per-socket ACLINT MSWI, MTIMER, and SSWI */
> -riscv_aclint_swi_create(memmap[VIRT_CLINT].base +
> +} else {
> +/* Per-socket ACLINT MSWI, MTIMER, and SSWI */
> +riscv_aclint_swi_create(memmap[VIRT_CLINT].base +
>  i * memmap[VIRT_CLINT].size,
>  base_hartid, hart_count, false);
> -riscv_aclint_mtimer_create(memmap[VIRT_CLINT].base +
> +riscv_aclint_mtimer_create(memmap[VIRT_CLINT].base +
>  i * memmap[VIRT_CLINT].size +
>  

Re: [PATCH 4/6] hw/riscv/virt.c: add virtio-iommu-pci hotplug support

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:18 AM Daniel Henrique Barboza
 wrote:
>
> We want to add a RISC-V 'virt' libqos machine to increase our test
> coverage. Some of the tests will try to plug a virtio-iommu-pci
> device into the board and exercise it.
>
> Enable virtio-iommu-pci in the 'virt' machine.
>
> Signed-off-by: Daniel Henrique Barboza 

Acked-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/virt.c | 36 +++-
>  1 file changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index b540f4d3da..54ad809b44 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -53,6 +53,7 @@
>  #include "hw/display/ramfb.h"
>  #include "hw/acpi/aml-build.h"
>  #include "qapi/qapi-visit-common.h"
> +#include "hw/virtio/virtio-iommu.h"
>
>  /* KVM AIA only supports APLIC MSI. APLIC Wired is always emulated by QEMU. 
> */
>  static bool virt_use_kvm_aia(RISCVVirtState *s)
> @@ -971,6 +972,34 @@ static void create_fdt_fw_cfg(RISCVVirtState *s, const 
> MemMapEntry *memmap)
>  qemu_fdt_setprop(ms->fdt, nodename, "dma-coherent", NULL, 0);
>  }
>
> +static void create_fdt_virtio_iommu(RISCVVirtState *s, uint16_t bdf)
> +{
> +const char compat[] = "virtio,pci-iommu\0pci1af4,1057";
> +void *fdt = MACHINE(s)->fdt;
> +uint32_t iommu_phandle;
> +g_autofree char *iommu_node = NULL;
> +g_autofree char *pci_node = NULL;
> +
> +pci_node = g_strdup_printf("/soc/pci@%lx",
> +   (long) virt_memmap[VIRT_PCIE_ECAM].base);
> +iommu_node = g_strdup_printf("%s/virtio_iommu@%x,%x", pci_node,
> + PCI_SLOT(bdf), PCI_FUNC(bdf));
> +iommu_phandle = qemu_fdt_alloc_phandle(fdt);
> +
> +qemu_fdt_add_subnode(fdt, iommu_node);
> +
> +qemu_fdt_setprop(fdt, iommu_node, "compatible", compat, sizeof(compat));
> +qemu_fdt_setprop_sized_cells(fdt, iommu_node, "reg",
> + 1, bdf << 8, 1, 0, 1, 0,
> + 1, 0, 1, 0);
> +qemu_fdt_setprop_cell(fdt, iommu_node, "#iommu-cells", 1);
> +qemu_fdt_setprop_cell(fdt, iommu_node, "phandle", iommu_phandle);
> +
> +qemu_fdt_setprop_cells(fdt, pci_node, "iommu-map",
> +   0, iommu_phandle, 0, bdf,
> +   bdf + 1, iommu_phandle, bdf + 1, 0xffff - bdf);
> +}
> +
>  static void finalize_fdt(RISCVVirtState *s)
>  {
>  uint32_t phandle = 1, irq_mmio_phandle = 1, msi_pcie_phandle = 1;
> @@ -1680,7 +1709,8 @@ static HotplugHandler 
> *virt_machine_get_hotplug_handler(MachineState *machine,
>  {
>  MachineClass *mc = MACHINE_GET_CLASS(machine);
>
> -if (device_is_dynamic_sysbus(mc, dev)) {
> +if (device_is_dynamic_sysbus(mc, dev) ||
> +object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
>  return HOTPLUG_HANDLER(machine);
>  }
>  return NULL;
> @@ -1699,6 +1729,10 @@ static void virt_machine_device_plug_cb(HotplugHandler 
> *hotplug_dev,
>   SYS_BUS_DEVICE(dev));
>  }
>  }
> +
> +if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
> +create_fdt_virtio_iommu(s, pci_get_bdf(PCI_DEVICE(dev)));
> +}
>  }
>
>  static void virt_machine_class_init(ObjectClass *oc, void *data)
> --
> 2.43.0
>
>



Re: [PATCH 3/6] hw/riscv/virt.c: create '/soc/pci@...' fdt node earlier

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:19 AM Daniel Henrique Barboza
 wrote:
>
> FDT nodes for hotplugged devices will attempt to write to this node,
> which, at this moment, is created only in create_fdt_pcie() during
> finalize_fdt().
>
> Create it earlier.
>
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/virt.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index fd35c74781..b540f4d3da 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -826,7 +826,6 @@ static void create_fdt_pcie(RISCVVirtState *s, const 
> MemMapEntry *memmap,
>
>  name = g_strdup_printf("/soc/pci@%lx",
>  (long) memmap[VIRT_PCIE_ECAM].base);
> -qemu_fdt_add_subnode(ms->fdt, name);
>  qemu_fdt_setprop_cell(ms->fdt, name, "#address-cells",
>  FDT_PCI_ADDR_CELLS);
>  qemu_fdt_setprop_cell(ms->fdt, name, "#interrupt-cells",
> @@ -996,6 +995,7 @@ static void create_fdt(RISCVVirtState *s, const 
> MemMapEntry *memmap)
>  {
>  MachineState *ms = MACHINE(s);
>  uint8_t rng_seed[32];
> +g_autofree char *name = NULL;
>
>  ms->fdt = create_device_tree(>fdt_size);
>  if (!ms->fdt) {
> @@ -1014,6 +1014,13 @@ static void create_fdt(RISCVVirtState *s, const 
> MemMapEntry *memmap)
>  qemu_fdt_setprop_cell(ms->fdt, "/soc", "#size-cells", 0x2);
>  qemu_fdt_setprop_cell(ms->fdt, "/soc", "#address-cells", 0x2);
>
> +/*
> + * The "/soc/pci@..." node is needed for PCIE hotplugs
> + * that might happen before finalize_fdt().
> + */
> +name = g_strdup_printf("/soc/pci@%lx", (long) 
> memmap[VIRT_PCIE_ECAM].base);
> +qemu_fdt_add_subnode(ms->fdt, name);
> +
>  qemu_fdt_add_subnode(ms->fdt, "/chosen");
>
>  /* Pass seed to RNG */
> --
> 2.43.0
>
>



Re: [PATCH 2/6] libqos/virtio.c: fix 'avail_event' offset in qvring_init()

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:18 AM Daniel Henrique Barboza
 wrote:
>
> In qvring_init() we're writing vq->used->avail_event at "vq->used + 2 +
> array_size".  The struct pointed to by vq->used is (from the Linux
> virtio_ring.h header):
>
>  *  // A ring of used descriptor heads with free-running index.
>  *  __virtio16 used_flags;
>  *  __virtio16 used_idx;
>  *  struct vring_used_elem used[num];
>  *  __virtio16 avail_event_idx;
>
> So 'flags' is the word right at vq->used. 'idx' is vq->used + 2. We need
> to skip 'used_idx' by adding 2 more bytes, and then add the size of the
> used array to reach avail_event_idx. An example of how to properly access
> this field can be found in qvirtqueue_kick():
>
> avail_event = qvirtio_readw(d, qts, vq->used + 4 +
> sizeof(struct vring_used_elem) * vq->size);
>
> This error was detected when enabling the RISC-V 'virt' libqos machine.
> The 'idx' test from vhost-user-blk-test.c errors out with a timeout in
> qvirtio_wait_used_elem(). The timeout happens because when processing
> the first element, 'avail_event' is read in qvirtqueue_kick() as non-zero
> because we didn't initialize it properly (and the memory at that point
> happened to be non-zero). 'idx' is 0.
>
> All of this makes this condition fail because "idx - avail_event" will
> overflow and be non-zero:
>
> /* < 1 because we add elements to avail queue one by one */
> if ((flags & VRING_USED_F_NO_NOTIFY) == 0 &&
> (!vq->event || (uint16_t)(idx-avail_event) < 1)) {
> d->bus->virtqueue_kick(d, vq);
> }
>
> As a result the virtqueue is never kicked and we'll timeout waiting for it.
>
> Fixes: 1053587c3f ("libqos: Added EVENT_IDX support")
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  tests/qtest/libqos/virtio.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tests/qtest/libqos/virtio.c b/tests/qtest/libqos/virtio.c
> index 4f39124eba..82a6e122bf 100644
> --- a/tests/qtest/libqos/virtio.c
> +++ b/tests/qtest/libqos/virtio.c
> @@ -265,7 +265,7 @@ void qvring_init(QTestState *qts, const QGuestAllocator 
> *alloc, QVirtQueue *vq,
>  /* vq->used->idx */
>  qvirtio_writew(vq->vdev, qts, vq->used + 2, 0);
>  /* vq->used->avail_event */
> -qvirtio_writew(vq->vdev, qts, vq->used + 2 +
> +qvirtio_writew(vq->vdev, qts, vq->used + 4 +
> sizeof(struct vring_used_elem) * vq->size, 0);
>  }
>
> --
> 2.43.0
>
>



Re: [PATCH 1/6] libqos/virtio.c: init all elems in qvring_indirect_desc_setup()

2024-02-14 Thread Alistair Francis
On Wed, Feb 14, 2024 at 5:18 AM Daniel Henrique Barboza
 wrote:
>
> The loop isn't setting the values for the last element. Every other
> element is being initialized with addr = 0, flags = VRING_DESC_F_NEXT
> and next = i + 1. The last elem is never touched.
>
> This became a problem when enabling a RISC-V 'virt' libqos machine in
> the 'indirect' test of virtio-blk-test.c. The 'flags' for the last
> element will end up being an odd number (since we didn't touch it).
> Being an odd number, it will be mistaken for having VRING_DESC_F_NEXT
> set, which happens to be 1.
>
> Deep into hw/virtio/virtio.c, in virtqueue_split_pop(), into
> virtqueue_split_read_next_desc(), a check for VRING_DESC_F_NEXT will be
> made to see if we're supposed to chain. The code will keep chaining past
> the last element because the uninitialized value happens to be odd.
> We'll error out right after that because desc->next (which is also
> uninitialized) will be >= max. A VIRTQUEUE_READ_DESC_ERROR will be
> returned, with an error message like this on stderr:
>
> qemu-system-riscv64: Desc next is 49391
>
> Since we never returned, we'll end up timing out at qvirtio_wait_used_elem():
>
> ERROR:../tests/qtest/libqos/virtio.c:236:qvirtio_wait_used_elem:
> assertion failed: (g_get_monotonic_time() - start_time <= timeout_us)
>
> The root cause is using uninitialized values from guest_alloc() in
> qvring_indirect_desc_setup(). There's no guarantee that the memory pages
> retrieved will be zeroed, so we can't make assumptions. In fact, commit
> 5b4f72f5e8 ("tests/qtest: properly initialise the vring used idx") fixed a
> similar problem stating "It is probably not wise to assume guest memory
> is zeroed anyway". I concur.
>
> Initialize all elems in qvring_indirect_desc_setup().
>
> Fixes: f294b029aa ("libqos: Added indirect descriptor support to virtio 
> implementation")
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  tests/qtest/libqos/virtio.c | 25 +++--
>  1 file changed, 19 insertions(+), 6 deletions(-)
>
> diff --git a/tests/qtest/libqos/virtio.c b/tests/qtest/libqos/virtio.c
> index 410513225f..4f39124eba 100644
> --- a/tests/qtest/libqos/virtio.c
> +++ b/tests/qtest/libqos/virtio.c
> @@ -280,14 +280,27 @@ QVRingIndirectDesc 
> *qvring_indirect_desc_setup(QTestState *qs, QVirtioDevice *d,
>  indirect->elem = elem;
>  indirect->desc = guest_alloc(alloc, sizeof(struct vring_desc) * elem);
>
> -for (i = 0; i < elem - 1; ++i) {
> +for (i = 0; i < elem; ++i) {
>  /* indirect->desc[i].addr */
>  qvirtio_writeq(d, qs, indirect->desc + (16 * i), 0);
> -/* indirect->desc[i].flags */
> -qvirtio_writew(d, qs, indirect->desc + (16 * i) + 12,
> -   VRING_DESC_F_NEXT);
> -/* indirect->desc[i].next */
> -qvirtio_writew(d, qs, indirect->desc + (16 * i) + 14, i + 1);
> +
> +/*
> + * If it's not the last element of the ring, set
> + * the chain (VRING_DESC_F_NEXT) flag and
> + * desc->next. Clear the last element - there's
> + * no guarantee that guest_alloc() will do it.
> + */
> +if (i != elem - 1) {
> +/* indirect->desc[i].flags */
> +qvirtio_writew(d, qs, indirect->desc + (16 * i) + 12,
> +   VRING_DESC_F_NEXT);
> +
> +/* indirect->desc[i].next */
> +qvirtio_writew(d, qs, indirect->desc + (16 * i) + 14, i + 1);
> +} else {
> +qvirtio_writew(d, qs, indirect->desc + (16 * i) + 12, 0);
> +qvirtio_writew(d, qs, indirect->desc + (16 * i) + 14, 0);
> +}
>  }
>
>  return indirect;
> --
> 2.43.0
>
>
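
As a side note on the magic numbers used in the hunk above (16 * i, +12,
+14): they follow directly from the split-ring descriptor layout in the
standard virtio headers, recapped here for illustration only:

    struct vring_desc {          /* 16 bytes per descriptor */
        __virtio64 addr;         /* +0:  buffer guest-physical address */
        __virtio32 len;          /* +8:  buffer length */
        __virtio16 flags;        /* +12: e.g. VRING_DESC_F_NEXT to chain */
        __virtio16 next;         /* +14: next descriptor when chaining */
    };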



Re: [PATCH v2] hw: riscv: Allow large kernels to boot by moving the initrd further away in RAM

2024-02-14 Thread Alistair Francis
On Wed, Feb 7, 2024 at 1:42 AM Alexandre Ghiti  wrote:
>
> Currently, the initrd is placed at 128MB, which overlaps with the kernel
> when it is large (for example, syzbot kernels are). From the kernel side,
> there is no reason we could not push the initrd further away in memory
> to accommodate large kernels, so move the initrd to 512MB when possible.
>
> The ideal solution would have been to place the initrd based on the
> kernel size, but we actually can't, since the bss size is not known when
> the image is loaded by load_image_targphys_as(), and the initrd would
> then overlap with this section.
>
> Signed-off-by: Alexandre Ghiti 

Reviewed-by: Alistair Francis 

Alistair

> ---
>
> Changes in v2:
> - Fix typos in commit log (Daniel) and title
> - Added to the commit log why using the kernel size does not work
>   (Daniel)
>
>  hw/riscv/boot.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> index 0ffca05189..9a367af2fa 100644
> --- a/hw/riscv/boot.c
> +++ b/hw/riscv/boot.c
> @@ -188,13 +188,13 @@ static void riscv_load_initrd(MachineState *machine, 
> uint64_t kernel_entry)
>   * kernel is uncompressed it will not clobber the initrd. However
>   * on boards without much RAM we must ensure that we still leave
>   * enough room for a decent sized initrd, and on boards with large
> - * amounts of RAM we must avoid the initrd being so far up in RAM
> - * that it is outside lowmem and inaccessible to the kernel.
> - * So for boards with less  than 256MB of RAM we put the initrd
> - * halfway into RAM, and for boards with 256MB of RAM or more we put
> - * the initrd at 128MB.
> + * amounts of RAM, we put the initrd at 512MB to allow large kernels
> + * to boot.
> + * So for boards with less than 1GB of RAM we put the initrd
> + * halfway into RAM, and for boards with 1GB of RAM or more we put
> + * the initrd at 512MB.
>   */
> -start = kernel_entry + MIN(mem_size / 2, 128 * MiB);
> +start = kernel_entry + MIN(mem_size / 2, 512 * MiB);
>
>  size = load_ramdisk(filename, start, mem_size - start);
>  if (size == -1) {
> --
> 2.39.2
>
>



Re: [PATCH v4 5/5] target/riscv: Implement privilege mode filtering for cycle/instret

2024-02-14 Thread Alistair Francis
On Wed, Jan 24, 2024 at 10:15 AM Atish Kumar Patra  wrote:
>
> On Sun, Jan 21, 2024 at 9:04 PM Alistair Francis  wrote:
> >
> > On Tue, Jan 9, 2024 at 10:29 AM Atish Patra  wrote:
> > >
> > > Privilege mode filtering can also be emulated for cycle/instret by
> > > tracking host_ticks/icount during each privilege mode switch. This
> > > patch implements that for both cycle/instret and mhpmcounters. The
> > > first one requires Smcntrpmf while the other one requires Sscofpmf
> > > to be enabled.
> > >
> > > The cycle/instret are still computed using host ticks when icount
> > > is not enabled. Otherwise, they are computed using raw icount which
> > > is more accurate in icount mode.
> > >
> > > Reviewed-by: Daniel Henrique Barboza 
> > > Signed-off-by: Atish Patra 
> > > ---
> > >  target/riscv/cpu.h| 11 +
> > >  target/riscv/cpu_helper.c |  9 +++-
> > >  target/riscv/csr.c| 95 ++-
> > >  target/riscv/pmu.c| 43 ++
> > >  target/riscv/pmu.h|  2 +
> > >  5 files changed, 136 insertions(+), 24 deletions(-)
> > >
> > > diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> > > index 34617c4c4bab..40d10726155b 100644
> > > --- a/target/riscv/cpu.h
> > > +++ b/target/riscv/cpu.h
> > > @@ -136,6 +136,15 @@ typedef struct PMUCTRState {
> > >  target_ulong irq_overflow_left;
> > >  } PMUCTRState;
> > >
> > > +typedef struct PMUFixedCtrState {
> > > +/* Track cycle and icount for each privilege mode */
> > > +uint64_t counter[4];
> > > +uint64_t counter_prev[4];
> >
> > Are these two used?
> >
>
> Yes. That's where it tracks the current/previous cycle/instret values.
> riscv_pmu_icount_update_priv/riscv_pmu_cycle_update_priv
>
> The priv mode based filtering is enabled in 
> riscv_pmu_ctr_get_fixed_counters_val
> using "counter" afterwards.

Ah! Yeah sorry was not reading this correctly

Alistair



Re: [PATCH v4 5/5] target/riscv: Implement privilege mode filtering for cycle/instret

2024-02-14 Thread Alistair Francis
On Tue, Jan 9, 2024 at 10:29 AM Atish Patra  wrote:
>
> Privilege mode filtering can also be emulated for cycle/instret by
> tracking host_ticks/icount during each privilege mode switch. This
> patch implements that for both cycle/instret and mhpmcounters. The
> first one requires Smcntrpmf while the other one requires Sscofpmf
> to be enabled.
>
> The cycle/instret are still computed using host ticks when icount
> is not enabled. Otherwise, they are computed using raw icount which
> is more accurate in icount mode.
>
> Reviewed-by: Daniel Henrique Barboza 
> Signed-off-by: Atish Patra 
> ---
>  target/riscv/cpu.h| 11 +
>  target/riscv/cpu_helper.c |  9 +++-
>  target/riscv/csr.c| 95 ++-
>  target/riscv/pmu.c| 43 ++
>  target/riscv/pmu.h|  2 +
>  5 files changed, 136 insertions(+), 24 deletions(-)
>
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index 34617c4c4bab..40d10726155b 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -136,6 +136,15 @@ typedef struct PMUCTRState {
>  target_ulong irq_overflow_left;
>  } PMUCTRState;
>
> +typedef struct PMUFixedCtrState {
> +/* Track cycle and icount for each privilege mode */
> +uint64_t counter[4];
> +uint64_t counter_prev[4];
> +/* Track cycle and icount for each privilege mode when V = 1*/
> +uint64_t counter_virt[2];
> +uint64_t counter_virt_prev[2];
> +} PMUFixedCtrState;
> +
>  struct CPUArchState {
>  target_ulong gpr[32];
>  target_ulong gprh[32]; /* 64 top bits of the 128-bit registers */
> @@ -334,6 +343,8 @@ struct CPUArchState {
>  /* PMU event selector configured values for RV32 */
>  target_ulong mhpmeventh_val[RV_MAX_MHPMEVENTS];
>
> +PMUFixedCtrState pmu_fixed_ctrs[2];
> +
>  target_ulong sscratch;
>  target_ulong mscratch;
>
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index e7e23b34f455..3dddb1b433e8 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -715,8 +715,13 @@ void riscv_cpu_set_mode(CPURISCVState *env, target_ulong 
> newpriv)
>  {
>  g_assert(newpriv <= PRV_M && newpriv != PRV_RESERVED);
>
> -if (icount_enabled() && newpriv != env->priv) {
> -riscv_itrigger_update_priv(env);
> +if (newpriv != env->priv) {
> +if (icount_enabled()) {
> +riscv_itrigger_update_priv(env);
> +riscv_pmu_icount_update_priv(env, newpriv);
> +} else {
> +riscv_pmu_cycle_update_priv(env, newpriv);
> +}
>  }
>  /* tlb_flush is unnecessary as mode is contained in mmu_idx */
>  env->priv = newpriv;
> diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> index 3bd4aa22374f..307d052021c5 100644
> --- a/target/riscv/csr.c
> +++ b/target/riscv/csr.c
> @@ -782,32 +782,16 @@ static int write_vcsr(CPURISCVState *env, int csrno, 
> target_ulong val)
>  return RISCV_EXCP_NONE;
>  }
>
> +#if defined(CONFIG_USER_ONLY)
>  /* User Timers and Counters */
>  static target_ulong get_ticks(bool shift)
>  {
> -int64_t val;
> -target_ulong result;
> -
> -#if !defined(CONFIG_USER_ONLY)
> -if (icount_enabled()) {
> -val = icount_get();
> -} else {
> -val = cpu_get_host_ticks();
> -}
> -#else
> -val = cpu_get_host_ticks();
> -#endif
> -
> -if (shift) {
> -result = val >> 32;
> -} else {
> -result = val;
> -}
> +int64_t val = cpu_get_host_ticks();
> +target_ulong result = shift ? val >> 32 : val;
>
>  return result;
>  }
>
> -#if defined(CONFIG_USER_ONLY)
>  static RISCVException read_time(CPURISCVState *env, int csrno,
>  target_ulong *val)
>  {
> @@ -932,6 +916,70 @@ static int write_mhpmeventh(CPURISCVState *env, int 
> csrno, target_ulong val)
>  return RISCV_EXCP_NONE;
>  }
>
> +static target_ulong riscv_pmu_ctr_get_fixed_counters_val(CPURISCVState *env,
> + int counter_idx,
> + bool upper_half)
> +{
> +uint64_t curr_val = 0;
> +target_ulong result = 0;
> +uint64_t *counter_arr = icount_enabled() ? 
> env->pmu_fixed_ctrs[1].counter :
> +env->pmu_fixed_ctrs[0].counter;

I don't follow why we access different arrays depending if
icount_enabled(). Can we at least comment this?

Alistair

> +uint64_t *counter_arr_virt = icount_enabled() ?
> + env->pmu_fixed_ctrs[1].counter_virt :
> + env->pmu_fixed_ctrs[0].counter_virt;
> +uint64_t cfg_val = 0;
> +
> +if (counter_idx == 0) {
> +cfg_val = upper_half ? ((uint64_t)env->mcyclecfgh << 32) :
> +  env->mcyclecfg;
> +} else if (counter_idx == 2) {
> +cfg_val = upper_half ? ((uint64_t)env->minstretcfgh << 32) :
> +  

Re: [PATCH v2 2/2] target/riscv: Support xtheadmaee for thead-c906

2024-02-14 Thread Alistair Francis
On Mon, Feb 5, 2024 at 6:37 PM Christoph Müllner
 wrote:
>
> On Mon, Feb 5, 2024 at 3:42 AM Alistair Francis  wrote:
> >
> > On Sun, Feb 4, 2024 at 3:44 PM LIU Zhiwei  
> > wrote:
> > >
> > > This patch set fixes the kernel regression pointed out by Björn Töpel in
> > > https://www.mail-archive.com/qemu-devel@nongnu.org/msg1018232.html.
> > >
> > > thead-c906 uses some flags in the pte [60-63] bits. There are historical
> > > reasons for that: SVPBMT did not yet exist when thead-c906 came into the
> > > world. We named this feature xtheadmaee[1]. This feature is controlled by
> > > a custom CSR named mxstatus, whose maee field encodes whether to enable
> > > the pte [60-63] bits.
> > >
> > > [1]:https://github.com/T-head-Semi/thead-extension-spec/blob/master/xtheadmaee.adoc
> > >
> > > Signed-off-by: LIU Zhiwei 
> > > ---
> > > v1->v2:
> > > 1) Remove mxstatus user mode access
> > > 2) Add reference documentation to the commit log
> > > ---
> > >  target/riscv/cpu.c |  6 
> > >  target/riscv/cpu.h |  9 ++
> > >  target/riscv/cpu_bits.h|  6 
> > >  target/riscv/cpu_cfg.h |  4 ++-
> > >  target/riscv/cpu_helper.c  | 25 ---
> > >  target/riscv/meson.build   |  1 +
> > >  target/riscv/tcg/tcg-cpu.c |  7 +++-
> > >  target/riscv/xthead_csr.c  | 65 ++
> > >  8 files changed, 110 insertions(+), 13 deletions(-)
> > >  create mode 100644 target/riscv/xthead_csr.c
> > >
> > > diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> > > index 2dcbc9ff32..bfdbb0539a 100644
> > > --- a/target/riscv/cpu.c
> > > +++ b/target/riscv/cpu.c
> > > @@ -171,6 +171,7 @@ const RISCVIsaExtData isa_edata_arr[] = {
> > >  ISA_EXT_DATA_ENTRY(xtheadmemidx, PRIV_VERSION_1_11_0, 
> > > ext_xtheadmemidx),
> > >  ISA_EXT_DATA_ENTRY(xtheadmempair, PRIV_VERSION_1_11_0, 
> > > ext_xtheadmempair),
> > >  ISA_EXT_DATA_ENTRY(xtheadsync, PRIV_VERSION_1_11_0, ext_xtheadsync),
> > > +ISA_EXT_DATA_ENTRY(xtheadmaee, PRIV_VERSION_1_11_0, ext_xtheadmaee),
> > >  ISA_EXT_DATA_ENTRY(xventanacondops, PRIV_VERSION_1_12_0, 
> > > ext_XVentanaCondOps),
> > >
> > >  DEFINE_PROP_END_OF_LIST(),
> > > @@ -506,6 +507,7 @@ static void rv64_thead_c906_cpu_init(Object *obj)
> > >
> > >  cpu->cfg.mvendorid = THEAD_VENDOR_ID;
> > >  #ifndef CONFIG_USER_ONLY
> > > +cpu->cfg.ext_xtheadmaee = true;
> > >  set_satp_mode_max_supported(cpu, VM_1_10_SV39);
> > >  #endif
> > >
> > > @@ -949,6 +951,9 @@ static void riscv_cpu_reset_hold(Object *obj)
> > >  }
> > >
> > >  pmp_unlock_entries(env);
> > > +if (riscv_cpu_cfg(env)->ext_xtheadmaee) {
> > > +env->th_mxstatus |= TH_MXSTATUS_MAEE;
> > > +}
> > >  #endif
> > >  env->xl = riscv_cpu_mxl(env);
> > >  riscv_cpu_update_mask(env);
> > > @@ -1439,6 +1444,7 @@ const RISCVCPUMultiExtConfig 
> > > riscv_cpu_vendor_exts[] = {
> > >  MULTI_EXT_CFG_BOOL("xtheadmemidx", ext_xtheadmemidx, false),
> > >  MULTI_EXT_CFG_BOOL("xtheadmempair", ext_xtheadmempair, false),
> > >  MULTI_EXT_CFG_BOOL("xtheadsync", ext_xtheadsync, false),
> > > +MULTI_EXT_CFG_BOOL("xtheadmaee", ext_xtheadmaee, false),
> > >  MULTI_EXT_CFG_BOOL("xventanacondops", ext_XVentanaCondOps, false),
> > >
> > >  DEFINE_PROP_END_OF_LIST(),
> > > diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> > > index 5f3955c38d..1bacf40355 100644
> > > --- a/target/riscv/cpu.h
> > > +++ b/target/riscv/cpu.h
> > > @@ -412,6 +412,14 @@ struct CPUArchState {
> > >  target_ulong cur_pmmask;
> > >  target_ulong cur_pmbase;
> > >
> > > +union {
> > > +/* Custom CSR for Xuantie CPU */
> > > +struct {
> > > +#ifndef CONFIG_USER_ONLY
> > > +target_ulong th_mxstatus;
> > > +#endif
> > > +};
> > > +};
> > >  /* Fields from here on are preserved across CPU reset. */
> > >  QEMUTimer *stimer; /* Internal timer for S-mode interrupt */
> > >  QEMUTimer *vstimer; /* Internal timer for VS-mode interrupt */
> > > @@ -799,6 +807,7 @@ void riscv_add_satp_mode_properties(Object *obj);
> > >  bool riscv_cpu_accelerator_compatible(RISCVCPU *cpu);
> > >
> > >  /* CSR function table */
> > > +extern riscv_csr_operations th_csr_ops[CSR_TABLE_SIZE];
> > >  extern riscv_csr_operations csr_ops[CSR_TABLE_SIZE];
> > >
> > >  extern const bool valid_vm_1_10_32[], valid_vm_1_10_64[];
> > > diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
> > > index e116f6c252..67ebb1cefe 100644
> > > --- a/target/riscv/cpu_bits.h
> > > +++ b/target/riscv/cpu_bits.h
> > > @@ -897,4 +897,10 @@ typedef enum RISCVException {
> > >  /* JVT CSR bits */
> > >  #define JVT_MODE   0x3F
> > >  #define JVT_BASE   (~0x3F)
> > > +
> > > +/* Xuantie custom CSRs */
> > > +#define CSR_TH_MXSTATUS 0x7c0
> > > +
> > > +#define TH_MXSTATUS_MAEE_SHIFT  21
> > > +#define TH_MXSTATUS_MAEE(0x1 << TH_MXSTATUS_MAEE_SHIFT)
> > >  #endif
> > > diff --git 

Re: [PATCH v3 3/6] target/riscv: add remaining named features

2024-02-14 Thread Alistair Francis
On Sat, Feb 3, 2024 at 1:24 AM Daniel Henrique Barboza
 wrote:
>
> The RVA22U64 and RVA22S64 profiles mandate certain extensions that,
> until now, we were implying were available.
>
> We can't do this anymore since named features also have a riscv,isa
> entry. Let's add them to riscv_cpu_named_features[].
>
> Instead of adding one bool for each named feature that we'll always
> implement, i.e. can't be turned off, add an 'ext_always_enabled' bool in
> cpu->cfg. This bool will be set to 'true' in TCG accel init, and all
> named features will point to it. This also means that KVM won't see
> these features as always enabled, which is our intention.
>
> If any accelerator adds support to disable one of these features, we'll
> have to promote them to regular extensions and allow users to disable
> them via the command line.
>
> After this patch, here's the riscv,isa from a buildroot using the
> 'rva22s64' CPU:
>
>  # cat /proc/device-tree/cpus/cpu@0/riscv,isa
> rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_
> zicntr_zicsr_zifencei_zihintpause_zihpm_za64rs_zfhmin_zca_zcd_zba_zbb_
> zbs_zkt_ssccptr_sscounterenw_sstvala_sstvecd_svade_svinval_svpbmt#
>
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c | 42 +++---
>  target/riscv/cpu_cfg.h |  6 ++
>  target/riscv/tcg/tcg-cpu.c |  2 ++
>  3 files changed, 43 insertions(+), 7 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 28d3cfa8ce..94843c4f6e 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -101,6 +101,10 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(zicbom, PRIV_VERSION_1_12_0, ext_zicbom),
>  ISA_EXT_DATA_ENTRY(zicbop, PRIV_VERSION_1_12_0, ext_zicbop),
>  ISA_EXT_DATA_ENTRY(zicboz, PRIV_VERSION_1_12_0, ext_zicboz),
> +ISA_EXT_DATA_ENTRY(ziccamoa, PRIV_VERSION_1_11_0, ext_always_enabled),
> +ISA_EXT_DATA_ENTRY(ziccif, PRIV_VERSION_1_11_0, ext_always_enabled),
> +ISA_EXT_DATA_ENTRY(zicclsm, PRIV_VERSION_1_11_0, ext_always_enabled),
> +ISA_EXT_DATA_ENTRY(ziccrse, PRIV_VERSION_1_11_0, ext_always_enabled),
>  ISA_EXT_DATA_ENTRY(zicond, PRIV_VERSION_1_12_0, ext_zicond),
>  ISA_EXT_DATA_ENTRY(zicntr, PRIV_VERSION_1_12_0, ext_zicntr),
>  ISA_EXT_DATA_ENTRY(zicsr, PRIV_VERSION_1_10_0, ext_zicsr),
> @@ -109,6 +113,7 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(zihintpause, PRIV_VERSION_1_10_0, ext_zihintpause),
>  ISA_EXT_DATA_ENTRY(zihpm, PRIV_VERSION_1_12_0, ext_zihpm),
>  ISA_EXT_DATA_ENTRY(zmmul, PRIV_VERSION_1_12_0, ext_zmmul),
> +ISA_EXT_DATA_ENTRY(za64rs, PRIV_VERSION_1_12_0, ext_always_enabled),
>  ISA_EXT_DATA_ENTRY(zacas, PRIV_VERSION_1_12_0, ext_zacas),
>  ISA_EXT_DATA_ENTRY(zawrs, PRIV_VERSION_1_12_0, ext_zawrs),
>  ISA_EXT_DATA_ENTRY(zfa, PRIV_VERSION_1_12_0, ext_zfa),
> @@ -170,8 +175,12 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(smepmp, PRIV_VERSION_1_12_0, ext_smepmp),
>  ISA_EXT_DATA_ENTRY(smstateen, PRIV_VERSION_1_12_0, ext_smstateen),
>  ISA_EXT_DATA_ENTRY(ssaia, PRIV_VERSION_1_12_0, ext_ssaia),
> +ISA_EXT_DATA_ENTRY(ssccptr, PRIV_VERSION_1_11_0, ext_always_enabled),
>  ISA_EXT_DATA_ENTRY(sscofpmf, PRIV_VERSION_1_12_0, ext_sscofpmf),
> +ISA_EXT_DATA_ENTRY(sscounterenw, PRIV_VERSION_1_12_0, 
> ext_always_enabled),
>  ISA_EXT_DATA_ENTRY(sstc, PRIV_VERSION_1_12_0, ext_sstc),
> +ISA_EXT_DATA_ENTRY(sstvala, PRIV_VERSION_1_12_0, ext_always_enabled),
> +ISA_EXT_DATA_ENTRY(sstvecd, PRIV_VERSION_1_12_0, ext_always_enabled),
>  ISA_EXT_DATA_ENTRY(svade, PRIV_VERSION_1_11_0, ext_svade),
>  ISA_EXT_DATA_ENTRY(svadu, PRIV_VERSION_1_12_0, ext_svadu),
>  ISA_EXT_DATA_ENTRY(svinval, PRIV_VERSION_1_12_0, ext_svinval),
> @@ -1512,6 +1521,11 @@ const RISCVCPUMultiExtConfig 
> riscv_cpu_experimental_exts[] = {
>  DEFINE_PROP_END_OF_LIST(),
>  };
>
> +#define ALWAYS_ENABLED_FEATURE(_name) \
> +{.name = _name, \
> + .offset = CPU_CFG_OFFSET(ext_always_enabled), \
> + .enabled = true}
> +
>  /*
>   * 'Named features' is the name we give to extensions that we
>   * don't want to expose to users. They are either immutable
> @@ -1523,6 +1537,23 @@ const RISCVCPUMultiExtConfig 
> riscv_cpu_named_features[] = {
>  MULTI_EXT_CFG_BOOL("svade", ext_svade, true),
>  MULTI_EXT_CFG_BOOL("zic64b", ext_zic64b, true),
>
> +/*
> + * cache-related extensions that are always enabled
> + * in TCG since QEMU RISC-V does not have a cache
> + * model.
> + */
> +ALWAYS_ENABLED_FEATURE("za64rs"),
> +ALWAYS_ENABLED_FEATURE("ziccif"),
> +ALWAYS_ENABLED_FEATURE("ziccrse"),
> +ALWAYS_ENABLED_FEATURE("ziccamoa"),
> +ALWAYS_ENABLED_FEATURE("zicclsm"),
> +ALWAYS_ENABLED_FEATURE("ssccptr"),
> +
> +/* Other named features that TCG always implements */
> +

Re: [PATCH v2 3/6] target/riscv: add remaining named features

2024-02-14 Thread Alistair Francis
On Fri, Jan 26, 2024 at 11:33 PM Andrew Jones  wrote:
>
> From: Daniel Henrique Barboza 
>
> The RVA22U64 and RVA22S64 profiles mandate certain extensions that,
> until now, we were implying were available.
>
> We can't do this anymore since named features also have a riscv,isa
> entry.  Let's add them to riscv_cpu_named_features[].
>
> They will also need to be explicitly enabled in both profile
> descriptions. TCG will enable the named features it already implements;
> other accelerators are free to handle them as they like.
>
> After this patch, here's the riscv,isa from a buildroot using the
> 'rva22s64' CPU:
>
>  # cat /proc/device-tree/cpus/cpu@0/riscv,isa
> rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_
> zicntr_zicsr_zifencei_zihintpause_zihpm_za64rs_zfhmin_zca_zcd_zba_zbb_
> zbs_zkt_sscounterenw_sstvala_sstvecd_svade_svinval_svpbmt#
>
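A small guest-side check, handy when experimenting with the profile CPUs, is to look for the named features in that same property (a standalone sketch; the substring match is a simplification):

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* named features expected in the rva22s64 riscv,isa string above */
    static const char *want[] = {
        "zic64b", "ziccamoa", "ziccif", "zicclsm", "ziccrse",
        "za64rs", "sscounterenw", "sstvala", "sstvecd",
    };
    char isa[1024] = "";
    FILE *f = fopen("/proc/device-tree/cpus/cpu@0/riscv,isa", "r");

    if (!f || !fgets(isa, sizeof(isa), f)) {
        perror("riscv,isa");
        return 1;
    }
    fclose(f);

    for (size_t i = 0; i < sizeof(want) / sizeof(want[0]); i++) {
        printf("%-13s %s\n", want[i],
               strstr(isa, want[i]) ? "present" : "MISSING");
    }
    return 0;
}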
> Signed-off-by: Daniel Henrique Barboza 
> Reviewed-by: Andrew Jones 
> ---
>  target/riscv/cpu.c | 41 +-
>  target/riscv/cpu_cfg.h |  9 +
>  target/riscv/tcg/tcg-cpu.c | 19 +-
>  3 files changed, 59 insertions(+), 10 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 28d3cfa8ce59..1ecd8a57ed02 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -101,6 +101,10 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(zicbom, PRIV_VERSION_1_12_0, ext_zicbom),
>  ISA_EXT_DATA_ENTRY(zicbop, PRIV_VERSION_1_12_0, ext_zicbop),
>  ISA_EXT_DATA_ENTRY(zicboz, PRIV_VERSION_1_12_0, ext_zicboz),
> +ISA_EXT_DATA_ENTRY(ziccamoa, PRIV_VERSION_1_11_0, ext_ziccamoa),
> +ISA_EXT_DATA_ENTRY(ziccif, PRIV_VERSION_1_11_0, ext_ziccif),
> +ISA_EXT_DATA_ENTRY(zicclsm, PRIV_VERSION_1_11_0, ext_zicclsm),
> +ISA_EXT_DATA_ENTRY(ziccrse, PRIV_VERSION_1_11_0, ext_ziccrse),
>  ISA_EXT_DATA_ENTRY(zicond, PRIV_VERSION_1_12_0, ext_zicond),
>  ISA_EXT_DATA_ENTRY(zicntr, PRIV_VERSION_1_12_0, ext_zicntr),
>  ISA_EXT_DATA_ENTRY(zicsr, PRIV_VERSION_1_10_0, ext_zicsr),
> @@ -109,6 +113,7 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(zihintpause, PRIV_VERSION_1_10_0, ext_zihintpause),
>  ISA_EXT_DATA_ENTRY(zihpm, PRIV_VERSION_1_12_0, ext_zihpm),
>  ISA_EXT_DATA_ENTRY(zmmul, PRIV_VERSION_1_12_0, ext_zmmul),
> +ISA_EXT_DATA_ENTRY(za64rs, PRIV_VERSION_1_12_0, ext_za64rs),
>  ISA_EXT_DATA_ENTRY(zacas, PRIV_VERSION_1_12_0, ext_zacas),
>  ISA_EXT_DATA_ENTRY(zawrs, PRIV_VERSION_1_12_0, ext_zawrs),
>  ISA_EXT_DATA_ENTRY(zfa, PRIV_VERSION_1_12_0, ext_zfa),
> @@ -170,8 +175,12 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(smepmp, PRIV_VERSION_1_12_0, ext_smepmp),
>  ISA_EXT_DATA_ENTRY(smstateen, PRIV_VERSION_1_12_0, ext_smstateen),
>  ISA_EXT_DATA_ENTRY(ssaia, PRIV_VERSION_1_12_0, ext_ssaia),
> +ISA_EXT_DATA_ENTRY(ssccptr, PRIV_VERSION_1_11_0, ext_ssccptr),
>  ISA_EXT_DATA_ENTRY(sscofpmf, PRIV_VERSION_1_12_0, ext_sscofpmf),
> +ISA_EXT_DATA_ENTRY(sscounterenw, PRIV_VERSION_1_12_0, ext_sscounterenw),
>  ISA_EXT_DATA_ENTRY(sstc, PRIV_VERSION_1_12_0, ext_sstc),
> +ISA_EXT_DATA_ENTRY(sstvala, PRIV_VERSION_1_12_0, ext_sstvala),
> +ISA_EXT_DATA_ENTRY(sstvecd, PRIV_VERSION_1_12_0, ext_sstvecd),
>  ISA_EXT_DATA_ENTRY(svade, PRIV_VERSION_1_11_0, ext_svade),
>  ISA_EXT_DATA_ENTRY(svadu, PRIV_VERSION_1_12_0, ext_svadu),
>  ISA_EXT_DATA_ENTRY(svinval, PRIV_VERSION_1_12_0, ext_svinval),
> @@ -1523,6 +1532,22 @@ const RISCVCPUMultiExtConfig 
> riscv_cpu_named_features[] = {
>  MULTI_EXT_CFG_BOOL("svade", ext_svade, true),
>  MULTI_EXT_CFG_BOOL("zic64b", ext_zic64b, true),
>
> +/*
> + * cache-related extensions that are always enabled
> + * since QEMU RISC-V does not have a cache model.
> + */
> +MULTI_EXT_CFG_BOOL("za64rs", ext_za64rs, true),
> +MULTI_EXT_CFG_BOOL("ziccif", ext_ziccif, true),
> +MULTI_EXT_CFG_BOOL("ziccrse", ext_ziccrse, true),
> +MULTI_EXT_CFG_BOOL("ziccamoa", ext_ziccamoa, true),
> +MULTI_EXT_CFG_BOOL("zicclsm", ext_zicclsm, true),
> +MULTI_EXT_CFG_BOOL("ssccptr", ext_ssccptr, true),
> +
> +/* Other named features that QEMU TCG always implements */
> +MULTI_EXT_CFG_BOOL("sstvecd", ext_sstvecd, true),
> +MULTI_EXT_CFG_BOOL("sstvala", ext_sstvala, true),
> +MULTI_EXT_CFG_BOOL("sscounterenw", ext_sscounterenw, true),
> +
>  DEFINE_PROP_END_OF_LIST(),
>  };
>
> @@ -2116,13 +2141,8 @@ static const PropertyInfo prop_marchid = {
>  };
>
>  /*
> - * RVA22U64 defines some 'named features' or 'synthetic extensions'
> - * that are cache related: Za64rs, Zic64b, Ziccif, Ziccrse, Ziccamoa
> - * and Zicclsm. We do not implement caching in QEMU so we'll consider
> - * all these named features as always enabled.
> - *
> - * There's no riscv,isa update for them (nor for zic64b, despite it
> - * having a cfg offset) at this moment.
> + * RVA22U64 defines 

Re: [PATCH v2 2/6] target/riscv: add riscv,isa to named features

2024-02-14 Thread Alistair Francis
Alistair

On Fri, Jan 26, 2024 at 11:32 PM Andrew Jones  wrote:
>
> From: Daniel Henrique Barboza 
>
> Further discussions after the introduction of rva22 support in QEMU
> revealed that what we've been calling 'named features' are actually
> regular extensions, with their respective riscv,isa DTs. This is
> clarified in [1]. [2] is a bug tracker asking for the profile spec to be
> less cryptic about it.
>
> As far as QEMU goes we understand extensions as something that the user
> can enable/disable in the command line. This isn't the case for named
> features, so we'll have to reach a middle ground.
>
> We'll keep our existing nomenclature 'named features' to refer to any
> extension that the user can't control in the command line. We'll also do
> the following:
>
> - 'svade' and 'zic64b' flags are renamed to 'ext_svade' and
>   'ext_zic64b'. 'ext_svade' and 'ext_zic64b' now have riscv,isa strings and
>   priv_spec versions;
>
> - skip name feature check in cpu_bump_multi_ext_priv_ver(). Now that
>   named features have a riscv,isa and an entry in isa_edata_arr[] we
>   don't need to gate the call to cpu_cfg_ext_get_min_version() anymore.
>
> [1] https://github.com/riscv/riscv-profiles/issues/121
> [2] https://github.com/riscv/riscv-profiles/issues/142
>
> Signed-off-by: Daniel Henrique Barboza 
> Reviewed-by: Andrew Jones 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c | 17 +
>  target/riscv/cpu_cfg.h |  6 --
>  target/riscv/tcg/tcg-cpu.c | 16 ++--
>  3 files changed, 23 insertions(+), 16 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 88e8cc868144..28d3cfa8ce59 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -97,6 +97,7 @@ bool riscv_cpu_option_set(const char *optname)
>   * instead.
>   */
>  const RISCVIsaExtData isa_edata_arr[] = {
> +ISA_EXT_DATA_ENTRY(zic64b, PRIV_VERSION_1_12_0, ext_zic64b),
>  ISA_EXT_DATA_ENTRY(zicbom, PRIV_VERSION_1_12_0, ext_zicbom),
>  ISA_EXT_DATA_ENTRY(zicbop, PRIV_VERSION_1_12_0, ext_zicbop),
>  ISA_EXT_DATA_ENTRY(zicboz, PRIV_VERSION_1_12_0, ext_zicboz),
> @@ -171,6 +172,7 @@ const RISCVIsaExtData isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(ssaia, PRIV_VERSION_1_12_0, ext_ssaia),
>  ISA_EXT_DATA_ENTRY(sscofpmf, PRIV_VERSION_1_12_0, ext_sscofpmf),
>  ISA_EXT_DATA_ENTRY(sstc, PRIV_VERSION_1_12_0, ext_sstc),
> +ISA_EXT_DATA_ENTRY(svade, PRIV_VERSION_1_11_0, ext_svade),
>  ISA_EXT_DATA_ENTRY(svadu, PRIV_VERSION_1_12_0, ext_svadu),
>  ISA_EXT_DATA_ENTRY(svinval, PRIV_VERSION_1_12_0, ext_svinval),
>  ISA_EXT_DATA_ENTRY(svnapot, PRIV_VERSION_1_12_0, ext_svnapot),
> @@ -1510,9 +1512,16 @@ const RISCVCPUMultiExtConfig 
> riscv_cpu_experimental_exts[] = {
>  DEFINE_PROP_END_OF_LIST(),
>  };
>
> +/*
> + * 'Named features' is the name we give to extensions that we
> + * don't want to expose to users. They are either immutable
> + * (always enabled/disable) or they'll vary depending on
> + * the resulting CPU state. They have riscv,isa strings
> + * and priv_ver like regular extensions.
> + */
>  const RISCVCPUMultiExtConfig riscv_cpu_named_features[] = {
> -MULTI_EXT_CFG_BOOL("svade", svade, true),
> -MULTI_EXT_CFG_BOOL("zic64b", zic64b, true),
> +MULTI_EXT_CFG_BOOL("svade", ext_svade, true),
> +MULTI_EXT_CFG_BOOL("zic64b", ext_zic64b, true),
>
>  DEFINE_PROP_END_OF_LIST(),
>  };
> @@ -2130,7 +2139,7 @@ static RISCVCPUProfile RVA22U64 = {
>  CPU_CFG_OFFSET(ext_zicbop), CPU_CFG_OFFSET(ext_zicboz),
>
>  /* mandatory named features for this profile */
> -CPU_CFG_OFFSET(zic64b),
> +CPU_CFG_OFFSET(ext_zic64b),
>
>  RISCV_PROFILE_EXT_LIST_END
>  }
> @@ -2161,7 +2170,7 @@ static RISCVCPUProfile RVA22S64 = {
>  CPU_CFG_OFFSET(ext_svinval),
>
>  /* rva22s64 named features */
> -CPU_CFG_OFFSET(svade),
> +CPU_CFG_OFFSET(ext_svade),
>
>  RISCV_PROFILE_EXT_LIST_END
>  }
> diff --git a/target/riscv/cpu_cfg.h b/target/riscv/cpu_cfg.h
> index e241922f89c4..698f926ab1be 100644
> --- a/target/riscv/cpu_cfg.h
> +++ b/target/riscv/cpu_cfg.h
> @@ -117,13 +117,15 @@ struct RISCVCPUConfig {
>  bool ext_smepmp;
>  bool rvv_ta_all_1s;
>  bool rvv_ma_all_1s;
> -bool svade;
> -bool zic64b;
>
>  uint32_t mvendorid;
>  uint64_t marchid;
>  uint64_t mimpid;
>
> +/* Named features  */
> +bool ext_svade;
> +bool ext_zic64b;
> +
>  /* Vendor-specific custom extensions */
>  bool ext_xtheadba;
>  bool ext_xtheadbb;
> diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
> index 88f92d1c7d2c..90861cc065e5 100644
> --- a/target/riscv/tcg/tcg-cpu.c
> +++ b/target/riscv/tcg/tcg-cpu.c
> @@ -197,12 +197,12 @@ static bool cpu_cfg_offset_is_named_feat(uint32_t 
> ext_offset)
>  static void riscv_cpu_enable_named_feat(RISCVCPU *cpu, uint32_t feat_offset)
>  {
>  switch (feat_offset) {
> 

[PATCH] pcie: Support PCIe Gen5/Gen6 link speeds

2024-02-14 Thread Lukas Stockner
This patch extends the PCIe link speed option so that slots can be
configured as supporting 32GT/s (Gen5) or 64GT/s (Gen6) speeds.
This is as simple as setting the appropriate bit in LnkCap2 and
the appropriate value in LnkCap and LnkCtl2.
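As a side note for readers less familiar with the register layout: the Supported Link Speeds Vector in LnkCap2 is cumulative, so a slot advertising 64GT/s must also advertise every lower speed, which is why the hunks below chain one test per speed. A minimal standalone sketch of that accumulation (illustrative names, not QEMU code):

#include <stdio.h>
#include <stdint.h>

/* Mirrors the PCIExpLinkSpeed encoding used below (2.5 GT/s == 1). */
enum { LNK_2_5GT = 1, LNK_5GT, LNK_8GT, LNK_16GT, LNK_32GT, LNK_64GT };

/*
 * LnkCap2 bit 1 means 2.5 GT/s supported, bit 2 means 5 GT/s, and so on
 * up to bit 6 for 64 GT/s; every speed up to the maximum must be set.
 */
static uint32_t lnkcap2_sls_vector(int max_speed)
{
    uint32_t sls = 0;

    for (int s = LNK_2_5GT; s <= max_speed; s++) {
        sls |= 1u << s;
    }
    return sls;
}

int main(void)
{
    printf("SLS vector for a 64 GT/s slot: 0x%x\n",
           lnkcap2_sls_vector(LNK_64GT));   /* prints 0x7e */
    return 0;
}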

Signed-off-by: Lukas Stockner 
---
 hw/core/qdev-properties-system.c | 16 ++--
 hw/pci/pcie.c|  8 
 include/hw/pci/pcie_regs.h   |  2 ++
 qapi/common.json |  6 +-
 4 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
index 1a396521d5..106a31c233 100644
--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -941,7 +941,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = {
 .set_default_value = qdev_propinfo_set_default_value_enum,
 };
 
-/* --- PCIELinkSpeed 2_5/5/8/16 -- */
+/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */
 
 static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
@@ -963,6 +963,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor 
*v, const char *name,
 case QEMU_PCI_EXP_LNK_16GT:
 speed = PCIE_LINK_SPEED_16;
 break;
+case QEMU_PCI_EXP_LNK_32GT:
+speed = PCIE_LINK_SPEED_32;
+break;
+case QEMU_PCI_EXP_LNK_64GT:
+speed = PCIE_LINK_SPEED_64;
+break;
 default:
 /* Unreachable */
 abort();
@@ -996,6 +1002,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor 
*v, const char *name,
 case PCIE_LINK_SPEED_16:
 *p = QEMU_PCI_EXP_LNK_16GT;
 break;
+case PCIE_LINK_SPEED_32:
+*p = QEMU_PCI_EXP_LNK_32GT;
+break;
+case PCIE_LINK_SPEED_64:
+*p = QEMU_PCI_EXP_LNK_64GT;
+break;
 default:
 /* Unreachable */
 abort();
@@ -1004,7 +1016,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor 
*v, const char *name,
 
 const PropertyInfo qdev_prop_pcie_link_speed = {
 .name = "PCIELinkSpeed",
-.description = "2_5/5/8/16",
+.description = "2_5/5/8/16/32/64",
.enum_table = &PCIELinkSpeed_lookup,
 .get = get_prop_pcielinkspeed,
 .set = set_prop_pcielinkspeed,
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 6db0cf69cd..0b4817e144 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -153,6 +153,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev)
 pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
PCI_EXP_LNKCAP2_SLS_16_0GB);
 }
+if (s->speed > QEMU_PCI_EXP_LNK_16GT) {
+pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+   PCI_EXP_LNKCAP2_SLS_32_0GB);
+}
+if (s->speed > QEMU_PCI_EXP_LNK_32GT) {
+pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+   PCI_EXP_LNKCAP2_SLS_64_0GB);
+}
 }
 }
 
diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
index 4972106c42..9d3b6868dc 100644
--- a/include/hw/pci/pcie_regs.h
+++ b/include/hw/pci/pcie_regs.h
@@ -39,6 +39,8 @@ typedef enum PCIExpLinkSpeed {
 QEMU_PCI_EXP_LNK_5GT,
 QEMU_PCI_EXP_LNK_8GT,
 QEMU_PCI_EXP_LNK_16GT,
+QEMU_PCI_EXP_LNK_32GT,
+QEMU_PCI_EXP_LNK_64GT,
 } PCIExpLinkSpeed;
 
 #define QEMU_PCI_EXP_LNKCAP_MLS(speed)  (speed)
diff --git a/qapi/common.json b/qapi/common.json
index f1bb841951..867a9ad9b0 100644
--- a/qapi/common.json
+++ b/qapi/common.json
@@ -107,10 +107,14 @@
 #
 # @16: 16.0GT/s
 #
+# @32: 32.0GT/s
+#
+# @64: 64.0GT/s
+#
 # Since: 4.0
 ##
 { 'enum': 'PCIELinkSpeed',
-  'data': [ '2_5', '5', '8', '16' ] }
+  'data': [ '2_5', '5', '8', '16', '32', '64' ] }
 
 ##
 # @PCIELinkWidth:
-- 
2.43.1




Re: [RFC PATCH 3/5] cxl/core: introduce cxl_mem_report_poison()

2024-02-14 Thread Tony Luck
On Fri, Feb 09, 2024 at 07:54:15PM +0800, Shiyang Ruan wrote:
> If poison is detected(reported from cxl memdev), OS should be notified to
> handle it.  Introduce this function:
>   1. translate DPA to HPA;
>   2. construct a MCE instance; (TODO: more details need to be filled)
>   3. log it into MCE event queue;
> 
> After that, MCE mechanism can walk over its notifier chain to execute
> specific handlers.

This looks like a useful proof of concept patch to pass errors to all
the existing logging systems (console, mcelog, rasdaemon, EDAC). But
it's a bare minimum (just passing the address and dropping any other
interesting information about the error). I think we need something
more advanced that covers more CXL error types.

> Signed-off-by: Shiyang Ruan 
> ---
>  arch/x86/kernel/cpu/mce/core.c |  1 +
>  drivers/cxl/core/mbox.c| 33 +
>  2 files changed, 34 insertions(+)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index bc39252bc54f..a64c0aceb7e0 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -131,6 +131,7 @@ void mce_setup(struct mce *m)
>   m->ppin = cpu_data(m->extcpu).ppin;
>   m->microcode = boot_cpu_data.microcode;
>  }
> +EXPORT_SYMBOL_GPL(mce_setup);
>  
>  DEFINE_PER_CPU(struct mce, injectm);
>  EXPORT_PER_CPU_SYMBOL_GPL(injectm);
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 27166a411705..f9b6f50fbe80 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -4,6 +4,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1290,6 +1291,38 @@ int cxl_set_timestamp(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, CXL);
>  
> +static void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
> +   struct cxl_poison_record *poison)
> +{
> + struct mce m;
> + u64 dpa = le64_to_cpu(poison->address) & CXL_POISON_START_MASK;
> + u64 len = le64_to_cpu(poison->length), i;
> + phys_addr_t phys_addr = cxl_memdev_dpa_to_hpa(cxlmd, dpa);
> +
> + if (phys_addr)
> + return;
> +
> + /*
> +  * Initialize struct mce.  Call preempt_disable() to avoid
> +  * "BUG: using smp_processor_id() in preemptible" for now, not sure
> +  * if this is a correct way.
> +  */
> + preempt_disable();
> + mce_setup(&m);
> + preempt_enable();
> +
> + m.bank = -1;
> + /* Fake a memory read error with unknown channel */
> + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV |
> +MCI_STATUS_MISCV | 0x9f;
> + m.misc = (MCI_MISC_ADDR_PHYS << 6);
> +
> + for (i = 0; i < len; i++) {
> + m.addr = phys_addr++;
> + mce_log(&m);

This loop looks wrong. What values do you expect for "len" (a.k.a.
poison->length)? Creating one log for each byte in the range will
be very noisy!

> + }
> +}
> +
>  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
>  struct cxl_region *cxlr)
>  {
> -- 
> 2.34.1

-Tony
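One way to keep the log volume sane, shown here only as a standalone user-space toy rather than kernel code, is to emit at most one record per page covered by the poison range. It assumes the record's length field counts 64-byte granules, which should be double-checked against the CXL spec:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/*
 * Toy illustration only: log one entry per page touched by a poison
 * record instead of one per byte.  The 64-byte granule assumption for
 * the length field should be verified against the spec before reuse.
 */
static void report_poison(uint64_t hpa, uint64_t len_granules)
{
    uint64_t bytes = len_granules * 64;

    for (uint64_t off = 0; off < bytes; off += PAGE_SIZE) {
        printf("would log one MCE for page 0x%llx\n",
               (unsigned long long)((hpa + off) & ~(PAGE_SIZE - 1)));
    }
}

int main(void)
{
    report_poison(0x40031000ULL, 256);   /* 16 KiB of poison -> 4 records */
    return 0;
}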



Re: [PATCH v2 1/6] target/riscv/tcg: set 'mmu' with 'satp' in cpu_set_profile()

2024-02-14 Thread Alistair Francis
On Sat, Jan 27, 2024 at 12:18 AM Andrew Jones  wrote:
>
> From: Daniel Henrique Barboza 
>
> Recent changes in options handling removed the 'mmu' default the bare
> CPUs had, meaning that we must enable 'mmu' by hand when using the
> rva22s64 profile CPU.
>
> Given that this profile is setting a satp mode, it already implies that
> we need a 'mmu'. Enable the 'mmu' in this case.
>
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/tcg/tcg-cpu.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
> index da437975b429..88f92d1c7d2c 100644
> --- a/target/riscv/tcg/tcg-cpu.c
> +++ b/target/riscv/tcg/tcg-cpu.c
> @@ -1107,6 +1107,7 @@ static void cpu_set_profile(Object *obj, Visitor *v, 
> const char *name,
>
>  #ifndef CONFIG_USER_ONLY
>  if (profile->satp_mode != RISCV_PROFILE_ATTR_UNUSED) {
> +object_property_set_bool(obj, "mmu", true, NULL);
>  const char *satp_prop = satp_mode_str(profile->satp_mode,
>riscv_cpu_is_32bit(cpu));
>  object_property_set_bool(obj, satp_prop, profile->enabled, NULL);
> --
> 2.43.0
>
>



Re: [PATCH v3 2/2] target/riscv: UPDATE xATP write CSR

2024-02-14 Thread Alistair Francis
On Wed, Jan 10, 2024 at 2:07 AM Irina Ryapolova
 wrote:
>
> Added xATP_MODE validation for vsatp/hgatp CSRs.
> The xATP register is an SXLEN-bit read/write WARL register, so
> the legal value must be returned (See riscv-privileged-20211203, 
> SATP/VSATP/HGATP CSRs).
>
> Signed-off-by: Irina Ryapolova 
> Reviewed-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/csr.c | 52 ++
>  1 file changed, 29 insertions(+), 23 deletions(-)
>
> diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> index 735fb27be7..6d7a3dd9aa 100644
> --- a/target/riscv/csr.c
> +++ b/target/riscv/csr.c
> @@ -1282,6 +1282,32 @@ static bool validate_vm(CPURISCVState *env, 
> target_ulong vm)
>  return get_field(mode_supported, (1 << vm));
>  }
>
> +static target_ulong legalize_xatp(CPURISCVState *env, target_ulong old_xatp,
> +  target_ulong val)
> +{
> +target_ulong mask;
> +bool vm;
> +if (riscv_cpu_mxl(env) == MXL_RV32) {
> +vm = validate_vm(env, get_field(val, SATP32_MODE));
> +mask = (val ^ old_xatp) & (SATP32_MODE | SATP32_ASID | SATP32_PPN);
> +} else {
> +vm = validate_vm(env, get_field(val, SATP64_MODE));
> +mask = (val ^ old_xatp) & (SATP64_MODE | SATP64_ASID | SATP64_PPN);
> +}
> +
> +if (vm && mask) {
> +/*
> + * The ISA defines SATP.MODE=Bare as "no translation", but we still
> + * pass these through QEMU's TLB emulation as it improves
> + * performance.  Flushing the TLB on SATP writes with paging
> + * enabled avoids leaking those invalid cached mappings.
> + */
> +tlb_flush(env_cpu(env));
> +return val;
> +}
> +return old_xatp;
> +}
> +
>  static target_ulong legalize_mpp(CPURISCVState *env, target_ulong old_mpp,
>   target_ulong val)
>  {
> @@ -2997,31 +3023,11 @@ static RISCVException read_satp(CPURISCVState *env, 
> int csrno,
>  static RISCVException write_satp(CPURISCVState *env, int csrno,
>   target_ulong val)
>  {
> -target_ulong mask;
> -bool vm;
> -
>  if (!riscv_cpu_cfg(env)->mmu) {
>  return RISCV_EXCP_NONE;
>  }
>
> -if (riscv_cpu_mxl(env) == MXL_RV32) {
> -vm = validate_vm(env, get_field(val, SATP32_MODE));
> -mask = (val ^ env->satp) & (SATP32_MODE | SATP32_ASID | SATP32_PPN);
> -} else {
> -vm = validate_vm(env, get_field(val, SATP64_MODE));
> -mask = (val ^ env->satp) & (SATP64_MODE | SATP64_ASID | SATP64_PPN);
> -}
> -
> -if (vm && mask) {
> -/*
> - * The ISA defines SATP.MODE=Bare as "no translation", but we still
> - * pass these through QEMU's TLB emulation as it improves
> - * performance.  Flushing the TLB on SATP writes with paging
> - * enabled avoids leaking those invalid cached mappings.
> - */
> -tlb_flush(env_cpu(env));
> -env->satp = val;
> -}
> +env->satp = legalize_xatp(env, env->satp, val);
>  return RISCV_EXCP_NONE;
>  }
>
> @@ -3506,7 +3512,7 @@ static RISCVException read_hgatp(CPURISCVState *env, 
> int csrno,
>  static RISCVException write_hgatp(CPURISCVState *env, int csrno,
>target_ulong val)
>  {
> -env->hgatp = val;
> +env->hgatp = legalize_xatp(env, env->hgatp, val);
>  return RISCV_EXCP_NONE;
>  }
>
> @@ -3772,7 +3778,7 @@ static RISCVException read_vsatp(CPURISCVState *env, 
> int csrno,
>  static RISCVException write_vsatp(CPURISCVState *env, int csrno,
>target_ulong val)
>  {
> -env->vsatp = val;
> +env->vsatp = legalize_xatp(env, env->vsatp, val);
>  return RISCV_EXCP_NONE;
>  }
>
> --
> 2.25.1
>
>



Re: [PATCH v3 1/2] target/riscv: FIX xATP_MODE validation

2024-02-14 Thread Alistair Francis
On Wed, Jan 10, 2024 at 1:00 AM Irina Ryapolova
 wrote:
>
> The SATP register is an SXLEN-bit read/write WARL register. It means that CSR 
> fields are only defined
> for a subset of bit encodings, but allow any value to be written while 
> guaranteeing to return a legal
> value whenever read (See riscv-privileged-20211203, SATP CSR).
>
> For example on rv64 we are trying to write to SATP CSR val = 
> 0x1000 (SATP_MODE = 1 - Reserved for standard use)
> and after that we are trying to read SATP_CSR. We read from the SATP CSR 
> value = 0x1000, which is not a correct
> operation (return illegal value).
>
> Signed-off-by: Irina Ryapolova 
> Reviewed-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
> Changes for v2:
>   -used satp_mode.map instead of satp_mode.supported
> Changes for v3:
>   -patch formatting corrected
> ---
>  target/riscv/csr.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> index fde7ce1a53..735fb27be7 100644
> --- a/target/riscv/csr.c
> +++ b/target/riscv/csr.c
> @@ -1278,8 +1278,8 @@ static RISCVException read_mstatus(CPURISCVState *env, 
> int csrno,
>
>  static bool validate_vm(CPURISCVState *env, target_ulong vm)
>  {
> -return (vm & 0xf) <=
> -   satp_mode_max_from_map(riscv_cpu_cfg(env)->satp_mode.map);
> +uint64_t mode_supported = riscv_cpu_cfg(env)->satp_mode.map;
> +return get_field(mode_supported, (1 << vm));
>  }
>
>  static target_ulong legalize_mpp(CPURISCVState *env, target_ulong old_mpp,
> --
> 2.25.1
>
>
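The difference between the two checks is easy to demonstrate outside QEMU. In the toy below, which assumes a machine supporting Bare, Sv39 and Sv48 only, the removed "<= max" test accepts the reserved encodings 1-7 while the bitmap test rejects them:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* bit N set => SATP mode encoding N is supported */
    uint64_t map = (1u << 0) | (1u << 8) | (1u << 9);   /* Bare, Sv39, Sv48 */
    int max_mode = 9;

    for (int vm = 0; vm <= 9; vm++) {
        int old_check = vm <= max_mode;          /* what the removed code did */
        int new_check = (map >> vm) & 1;         /* what get_field() now does */

        if (old_check != new_check) {
            printf("mode %d: '<= max' would accept it, bitmap rejects it\n", vm);
        }
    }
    return 0;
}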



Re: Assessment of the difficulty in porting CPU architecture for qemu

2024-02-14 Thread Alistair Francis
On Fri, Nov 17, 2023 at 5:35 PM 方 <1584389...@qq.com> wrote:
>
>  Hello everyone! I am working on implementing a tool to assess the complexity 
> of CPU architecture porting. It primarily focuses on RISC-V architecture 
> porting. In fact, the tool may also provide an average estimate of the 
> porting effort for various architectures. My focus is on the overall workload 
> and difficulty of a port, past and future, even if a project has already 
> been ported. As part of my dataset, I have collected the **qemu** 
> project. **I would like to gather community opinions to support my 
> assessment. I appreciate your help and response!** Based on scanning tools, 
> the porting complexity is determined to be high, with a significant amount of 
> code related to the CPU architecture in the project. Is this assessment 
> accurate? Do you have any opinions on personnel allocation and the time 
> required? I look forward to your help and response.

The people who did the original QEMU RISC-V port aren't involved any more.

You are correct that QEMU is significantly complex to port to a new
architecture compared to most other userspace software. I think it
would be similar to other JIT software in that regard.

Alistair



Re: [PATCH v2 2/2] hw/riscv/virt-acpi-build.c: Generate SPCR table

2024-02-14 Thread Alistair Francis
On Tue, Jan 16, 2024 at 11:11 AM Sia Jee Heng
 wrote:
>
> Generate Serial Port Console Redirection Table (SPCR) for RISC-V
> virtual machine.
>
> Signed-off-by: Sia Jee Heng 
> Reviewed-by: Daniel Henrique Barboza 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/virt-acpi-build.c | 39 ++
>  1 file changed, 39 insertions(+)
>
> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
> index 26c7e4482d..7fc5071c84 100644
> --- a/hw/riscv/virt-acpi-build.c
> +++ b/hw/riscv/virt-acpi-build.c
> @@ -174,6 +174,42 @@ acpi_dsdt_add_uart(Aml *scope, const MemMapEntry 
> *uart_memmap,
>  aml_append(scope, dev);
>  }
>
> +/*
> + * Serial Port Console Redirection Table (SPCR)
> + * Rev: 1.07
> + */
> +
> +static void
> +spcr_setup(GArray *table_data, BIOSLinker *linker, RISCVVirtState *s)
> +{
> +AcpiSpcrData serial = {
> +.interface_type = 0,   /* 16550 compatible */
> +.base_addr.id = AML_AS_SYSTEM_MEMORY,
> +.base_addr.width = 32,
> +.base_addr.offset = 0,
> +.base_addr.size = 1,
> +.base_addr.addr = s->memmap[VIRT_UART0].base,
> +.interrupt_type = (1 << 4),/* Bit[4] RISC-V PLIC/APLIC */
> +.pc_interrupt = 0,
> +.interrupt = UART0_IRQ,
> +.baud_rate = 7,/* 115200 */
> +.parity = 0,
> +.stop_bits = 1,
> +.flow_control = 0,
> +.terminal_type = 3,/* ANSI */
> +.language = 0, /* Language */
> +.pci_device_id = 0xffff,   /* not a PCI device*/
> +.pci_vendor_id = 0xffff,   /* not a PCI device*/
> +.pci_bus = 0,
> +.pci_device = 0,
> +.pci_function = 0,
> +.pci_flags = 0,
> +.pci_segment = 0,
> +};
> +
> +build_spcr(table_data, linker, &serial, 2, s->oem_id, s->oem_table_id);
> +}
> +
>  /* RHCT Node[N] starts at offset 56 */
>  #define RHCT_NODE_ARRAY_OFFSET 56
>
> @@ -555,6 +591,9 @@ static void virt_acpi_build(RISCVVirtState *s, 
> AcpiBuildTables *tables)
>  acpi_add_table(table_offsets, tables_blob);
>  build_rhct(tables_blob, tables->linker, s);
>
> +acpi_add_table(table_offsets, tables_blob);
> +spcr_setup(tables_blob, tables->linker, s);
> +
>  acpi_add_table(table_offsets, tables_blob);
>  {
>  AcpiMcfgInfo mcfg = {
> --
> 2.34.1
>
>
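A quick guest-side sanity check of the generated table is to decode a few fields from /sys/firmware/acpi/tables/SPCR in the same order spcr_setup() writes them. The sketch below assumes the standard 36-byte ACPI header followed by the field layout shown above:

#include <stdio.h>

int main(void)
{
    unsigned char t[80];
    FILE *f = fopen("/sys/firmware/acpi/tables/SPCR", "rb");

    if (!f || fread(t, 1, sizeof(t), f) < 59) {
        perror("SPCR");
        return 1;
    }
    fclose(f);

    printf("interface type: %u (0 = 16550 compatible)\n", t[36]);
    printf("interrupt type: 0x%x (bit 4 = RISC-V PLIC/APLIC)\n", t[52]);
    printf("baud rate code: %u (7 = 115200)\n", t[58]);
    return 0;
}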



Re: [PATCH v2 1/2] hw/arm/virt-acpi-build.c: Migrate SPCR creation to common location

2024-02-14 Thread Alistair Francis
On Tue, Jan 16, 2024 at 11:11 AM Sia Jee Heng
 wrote:
>
> RISC-V should also generate the SPCR in a manner similar to ARM.
> Therefore, instead of replicating the code, relocate this function
> to the common AML build.
>
> Signed-off-by: Sia Jee Heng 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/acpi/aml-build.c | 51 
>  hw/arm/virt-acpi-build.c| 68 +++--
>  include/hw/acpi/acpi-defs.h | 33 ++
>  include/hw/acpi/aml-build.h |  4 +++
>  4 files changed, 115 insertions(+), 41 deletions(-)
>
> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
> index af66bde0f5..f3904650e4 100644
> --- a/hw/acpi/aml-build.c
> +++ b/hw/acpi/aml-build.c
> @@ -1994,6 +1994,57 @@ static void build_processor_hierarchy_node(GArray 
> *tbl, uint32_t flags,
>  }
>  }
>
> +void build_spcr(GArray *table_data, BIOSLinker *linker,
> +const AcpiSpcrData *f, const uint8_t rev,
> +const char *oem_id, const char *oem_table_id)
> +{
> +AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id,
> +.oem_table_id = oem_table_id };
> +
> +acpi_table_begin(&table, table_data);
> +/* Interface type */
> +build_append_int_noprefix(table_data, f->interface_type, 1);
> +/* Reserved */
> +build_append_int_noprefix(table_data, 0, 3);
> +/* Base Address */
> +build_append_gas(table_data, f->base_addr.id, f->base_addr.width,
> + f->base_addr.offset, f->base_addr.size,
> + f->base_addr.addr);
> +/* Interrupt type */
> +build_append_int_noprefix(table_data, f->interrupt_type, 1);
> +/* IRQ */
> +build_append_int_noprefix(table_data, f->pc_interrupt, 1);
> +/* Global System Interrupt */
> +build_append_int_noprefix(table_data, f->interrupt, 4);
> +/* Baud Rate */
> +build_append_int_noprefix(table_data, f->baud_rate, 1);
> +/* Parity */
> +build_append_int_noprefix(table_data, f->parity, 1);
> +/* Stop Bits */
> +build_append_int_noprefix(table_data, f->stop_bits, 1);
> +/* Flow Control */
> +build_append_int_noprefix(table_data, f->flow_control, 1);
> +/* Terminal Type */
> +build_append_int_noprefix(table_data, f->terminal_type, 1);
> +/* PCI Device ID  */
> +build_append_int_noprefix(table_data, f->pci_device_id, 2);
> +/* PCI Vendor ID */
> +build_append_int_noprefix(table_data, f->pci_vendor_id, 2);
> +/* PCI Bus Number */
> +build_append_int_noprefix(table_data, f->pci_bus, 1);
> +/* PCI Device Number */
> +build_append_int_noprefix(table_data, f->pci_device, 1);
> +/* PCI Function Number */
> +build_append_int_noprefix(table_data, f->pci_function, 1);
> +/* PCI Flags */
> +build_append_int_noprefix(table_data, f->pci_flags, 4);
> +/* PCI Segment */
> +build_append_int_noprefix(table_data, f->pci_segment, 1);
> +/* Reserved */
> +build_append_int_noprefix(table_data, 0, 4);
> +
> +acpi_table_end(linker, &table);
> +}
>  /*
>   * ACPI spec, Revision 6.3
>   * 5.2.29 Processor Properties Topology Table (PPTT)
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index a22a2f43a5..195767c0f0 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -431,48 +431,34 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
> VirtMachineState *vms)
>   * Rev: 1.07
>   */
>  static void
> -build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> +spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>  {
> -AcpiTable table = { .sig = "SPCR", .rev = 2, .oem_id = vms->oem_id,
> -.oem_table_id = vms->oem_table_id };
> -
> -acpi_table_begin(&table, table_data);
> -
> -/* Interface Type */
> -build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */
> -build_append_int_noprefix(table_data, 0, 3); /* Reserved */
> -/* Base Address */
> -build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3,
> - vms->memmap[VIRT_UART].base);
> -/* Interrupt Type */
> -build_append_int_noprefix(table_data,
> -(1 << 3) /* Bit[3] ARMH GIC interrupt */, 1);
> -build_append_int_noprefix(table_data, 0, 1); /* IRQ */
> -/* Global System Interrupt */
> -build_append_int_noprefix(table_data,
> -  vms->irqmap[VIRT_UART] + ARM_SPI_BASE, 4);
> -build_append_int_noprefix(table_data, 3 /* 9600 */, 1); /* Baud Rate */
> -build_append_int_noprefix(table_data, 0 /* No Parity */, 1); /* Parity */
> -/* Stop Bits */
> -build_append_int_noprefix(table_data, 1 /* 1 Stop bit */, 1);
> -/* Flow Control */
> -build_append_int_noprefix(table_data,
> -(1 << 1) /* RTS/CTS hardware flow control */, 1);
> -/* Terminal Type */
> -build_append_int_noprefix(table_data, 0 /* VT100 */, 1);
> -

RE: [PATCH v2 2/2] aspeed: fix hardcode boot address 0

2024-02-14 Thread Jamin Lin
> -Original Message-
> From: Cédric Le Goater 
> Sent: Friday, February 9, 2024 4:32 PM
> To: Jamin Lin ; Peter Maydell
> ; Andrew Jeffery ;
> Joel Stanley ; open list:ASPEED BMCs
> ; open list:All patches CC here
> 
> Cc: Troy Lee 
> Subject: Re: [PATCH v2 2/2] aspeed: fix hardcode boot address 0
> 
> On 2/7/24 20:52, Jamin Lin wrote:
> > In the previous design of ASPEED SOCs QEMU model, it set the boot
> > address at "0" which was the hardcode setting for ast10x0, ast2600,
> > ast2500 and ast2400.
> >
> > According to the design of ast2700, it has bootmcu which is used for
> > executing SPL and initialize DRAM, then, CPUs(cortex-a35) execute
> > u-boot, kernel and rofs. QEMU will only support CPU(cortex-a35) parts
> > and the boot address is "0x4 " for ast2700.
> > Therefore, fixed hardcode boot address 0.
> >
> > Signed-off-by: Troy Lee 
> > Signed-off-by: Jamin Lin 
> 
> I agree with Philippe that the justification could be simpler. This change is 
> just
> a cleanup preparing ground for future models using a different mapping
> address.
> 
Will fix
Jamin
> Reviewed-by: Cédric Le Goater 
> 
> Thanks,
> 
> C.
> 
> 
> > ---
> >   hw/arm/aspeed.c | 4 +++-
> >   hw/arm/aspeed_ast2400.c | 4 ++--
> >   hw/arm/aspeed_ast2600.c | 2 +-
> >   include/hw/arm/aspeed_soc.h | 2 --
> >   4 files changed, 6 insertions(+), 6 deletions(-)
> >
> > diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index
> > 06d863958b..39758557be 100644
> > --- a/hw/arm/aspeed.c
> > +++ b/hw/arm/aspeed.c
> > @@ -289,12 +289,14 @@ static void
> aspeed_install_boot_rom(AspeedMachineState *bmc, BlockBackend *blk,
> >   uint64_t rom_size)
> >   {
> >   AspeedSoCState *soc = bmc->soc;
> > +AspeedSoCClass *sc = ASPEED_SOC_GET_CLASS(soc);
> >
> >   memory_region_init_rom(&bmc->boot_rom, NULL, "aspeed.boot_rom", rom_size,
> >  &error_abort);
> >   memory_region_add_subregion_overlap(&soc->spi_boot_container, 0,
> >   &bmc->boot_rom, 1);
> > -write_boot_rom(blk, ASPEED_SOC_SPI_BOOT_ADDR, rom_size, &error_abort);
> > +write_boot_rom(blk, sc->memmap[ASPEED_DEV_SPI_BOOT],
> > +   rom_size, &error_abort);
> >   }
> >
> >   void aspeed_board_init_flashes(AspeedSMCState *s, const char
> > *flashtype, diff --git a/hw/arm/aspeed_ast2400.c
> > b/hw/arm/aspeed_ast2400.c index 95da85fee0..d125886207 100644
> > --- a/hw/arm/aspeed_ast2400.c
> > +++ b/hw/arm/aspeed_ast2400.c
> > @@ -26,7 +26,7 @@
> >   #define ASPEED_SOC_IOMEM_SIZE   0x0020
> >
> >   static const hwaddr aspeed_soc_ast2400_memmap[] = {
> > -[ASPEED_DEV_SPI_BOOT]  =  ASPEED_SOC_SPI_BOOT_ADDR,
> > +[ASPEED_DEV_SPI_BOOT]  = 0x,
> >   [ASPEED_DEV_IOMEM]  = 0x1E60,
> >   [ASPEED_DEV_FMC]= 0x1E62,
> >   [ASPEED_DEV_SPI1]   = 0x1E63,
> > @@ -61,7 +61,7 @@ static const hwaddr aspeed_soc_ast2400_memmap[] =
> {
> >   };
> >
> >   static const hwaddr aspeed_soc_ast2500_memmap[] = {
> > -[ASPEED_DEV_SPI_BOOT]  = ASPEED_SOC_SPI_BOOT_ADDR,
> > +[ASPEED_DEV_SPI_BOOT]  = 0x,
> >   [ASPEED_DEV_IOMEM]  = 0x1E60,
> >   [ASPEED_DEV_FMC]= 0x1E62,
> >   [ASPEED_DEV_SPI1]   = 0x1E63,
> > diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c index
> > f74561ecdc..174be53770 100644
> > --- a/hw/arm/aspeed_ast2600.c
> > +++ b/hw/arm/aspeed_ast2600.c
> > @@ -22,7 +22,7 @@
> >   #define ASPEED_SOC_DPMCU_SIZE   0x0004
> >
> >   static const hwaddr aspeed_soc_ast2600_memmap[] = {
> > -[ASPEED_DEV_SPI_BOOT]  = ASPEED_SOC_SPI_BOOT_ADDR,
> > +[ASPEED_DEV_SPI_BOOT]  = 0x,
> >   [ASPEED_DEV_SRAM]  = 0x1000,
> >   [ASPEED_DEV_DPMCU] = 0x1800,
> >   /* 0x1600 0x17FF : AHB BUS do LPC Bus bridge */
> > diff --git a/include/hw/arm/aspeed_soc.h b/include/hw/arm/aspeed_soc.h
> > index 5ab0902da0..bf43ad8351 100644
> > --- a/include/hw/arm/aspeed_soc.h
> > +++ b/include/hw/arm/aspeed_soc.h
> > @@ -224,8 +224,6 @@ enum {
> >   ASPEED_DEV_FSI2,
> >   };
> >
> > -#define ASPEED_SOC_SPI_BOOT_ADDR 0x0
> > -
> >   qemu_irq aspeed_soc_get_irq(AspeedSoCState *s, int dev);
> >   bool aspeed_soc_uart_realize(AspeedSoCState *s, Error **errp);
> >   void aspeed_soc_uart_set_chr(AspeedSoCState *s, int dev, Chardev
> > *chr);



RE: [PATCH v2 1/2] aspeed: introduce a new UART0 device name

2024-02-14 Thread Jamin Lin
> -Original Message-
> From: Cédric Le Goater 
> Sent: Friday, February 9, 2024 4:27 PM
> To: Jamin Lin ; Cédric Le Goater ;
> Peter Maydell ; Andrew Jeffery
> ; Joel Stanley ; open
> list:ASPEED BMCs ; open list:All patches CC here
> 
> Cc: Troy Lee 
> Subject: Re: [PATCH v2 1/2] aspeed: introduce a new UART0 device name
> 
> Hello Jamin,
> 
Thanks for the review, and sorry for the late reply due to my Chinese New Year holiday.
> On 2/7/24 21:02, Jamin Lin via wrote:
> > The Aspeed datasheet refers to the UART controllers as UART1 - UART13
> > for the ast10x0, ast2600, ast2500 and ast2400 SoCs and the Aspeed
> > ast2700 introduces an UART0 and the UART controllers as UART0 -
> > UART12.
> >
> > To keep the naming in the QEMU models
> > in sync with the datasheet, let's introduce a new  UART0 device name
> > and do the required adjustements, etc ...
> 
> Please drop the etc...
Will fix

> 
> >
> > Signed-off-by: Troy Lee 
> > Signed-off-by: Jamin Lin 
> > ---
> >   hw/arm/aspeed.c | 13 -
> >   hw/arm/aspeed_ast10x0.c |  1 +
> >   hw/arm/aspeed_ast2400.c |  2 ++
> >   hw/arm/aspeed_ast2600.c |  1 +
> >   hw/arm/aspeed_soc_common.c  | 14 +-
> >   include/hw/arm/aspeed_soc.h |  2 ++
> >   6 files changed, 23 insertions(+), 10 deletions(-)
> >
> > diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index
> > 09b1e823ba..06d863958b 100644
> > --- a/hw/arm/aspeed.c
> > +++ b/hw/arm/aspeed.c
> > @@ -342,7 +342,7 @@ static void
> connect_serial_hds_to_uarts(AspeedMachineState *bmc)
> >   int uart_chosen = bmc->uart_chosen ? bmc->uart_chosen :
> > amc->uart_default;
> >
> >   aspeed_soc_uart_set_chr(s, uart_chosen, serial_hd(0));
> > -for (int i = 1, uart = ASPEED_DEV_UART1; i < sc->uarts_num; i++,
> uart++) {
> > +for (int i = 0, uart = sc->uarts_base; i < sc->uarts_num; i++,
> > + uart++) {
> >   if (uart == uart_chosen) {
> >   continue;
> >   }
> > @@ -1094,7 +1094,7 @@ static char *aspeed_get_bmc_console(Object *obj,
> Error **errp)
> >   AspeedMachineClass *amc = ASPEED_MACHINE_GET_CLASS(bmc);
> >   int uart_chosen = bmc->uart_chosen ? bmc->uart_chosen :
> > amc->uart_default;
> >
> > -return g_strdup_printf("uart%d", uart_chosen - ASPEED_DEV_UART1 +
> 1);
> > +return g_strdup_printf("uart%d", uart_chosen - ASPEED_DEV_UART0);
> >   }
> >
> >   static void aspeed_set_bmc_console(Object *obj, const char *value,
> > Error **errp) @@ -1103,6 +1103,8 @@ static void
> aspeed_set_bmc_console(Object *obj, const char *value, Error **errp)
> >   AspeedMachineClass *amc = ASPEED_MACHINE_GET_CLASS(bmc);
> >   AspeedSoCClass *sc =
> ASPEED_SOC_CLASS(object_class_by_name(amc->soc_name));
> >   int val;
> > +int start = sc->uarts_base - ASPEED_DEV_UART0;
> > +int end = start + sc->uarts_num;
> 
> 
> To help the reader, I would introduce these helpers at the end of 
> aspeed_soc.h :
> 
>  static inline int aspeed_uart_index(int uart_dev)
>  {
>  return uart_dev - ASPEED_DEV_UART0;
>  }
> 
>  static inline int aspeed_uart_first(AspeedSoCClass *sc)
>  {
>  return aspeed_uart_index(sc->uarts_base);
>  }
> 
>  static inline int aspeed_uart_last(AspeedSoCClass *sc)
>  {
>  return aspeed_uart_first(sc) + sc->uarts_num - 1;
>  }
> 
Will add
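With those helpers in place, the range check quoted below might end up looking something like this (an illustrative fragment only, not the final v3 code):

    if (val < aspeed_uart_first(sc) || val > aspeed_uart_last(sc)) {
        error_setg(errp, "\"uart\" should be in range [%d - %d]",
                   aspeed_uart_first(sc), aspeed_uart_last(sc));
        return;
    }
    bmc->uart_chosen = ASPEED_DEV_UART0 + val;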
> 
>   if (sscanf(value, "uart%u", &val) != 1) {
> >   error_setg(errp, "Bad value for \"uart\" property"); @@
> > -1110,11 +1112,12 @@ static void aspeed_set_bmc_console(Object *obj,
> const char *value, Error **errp)
> >   }
> >
> >   /* The number of UART depends on the SoC */
> > -if (val < 1 || val > sc->uarts_num) {
> > -error_setg(errp, "\"uart\" should be in range [1 - %d]",
> sc->uarts_num);
> > +if (val < start || val >= end) {
> > +error_setg(errp, "\"uart\" should be in range [%d - %d]",
> > +   start, end - 1);
> >   return;
> >   }
> > -bmc->uart_chosen = ASPEED_DEV_UART1 + val - 1;
> > +bmc->uart_chosen = val + ASPEED_DEV_UART0;
> >   }
> >
> >   static void aspeed_machine_class_props_init(ObjectClass *oc) diff
> > --git a/hw/arm/aspeed_ast10x0.c b/hw/arm/aspeed_ast10x0.c index
> > c3b5116a6a..2634e0f654 100644
> > --- a/hw/arm/aspeed_ast10x0.c
> > +++ b/hw/arm/aspeed_ast10x0.c
> > @@ -436,6 +436,7 @@ static void
> aspeed_soc_ast1030_class_init(ObjectClass *klass, void *data)
> >   sc->wdts_num = 4;
> >   sc->macs_num = 1;
> >   sc->uarts_num = 13;
> > +sc->uarts_base = ASPEED_DEV_UART1;
> >   sc->irqmap = aspeed_soc_ast1030_irqmap;
> >   sc->memmap = aspeed_soc_ast1030_memmap;
> >   sc->num_cpus = 1;
> > diff --git a/hw/arm/aspeed_ast2400.c b/hw/arm/aspeed_ast2400.c index
> > 8829561bb6..95da85fee0 100644
> > --- a/hw/arm/aspeed_ast2400.c
> > +++ b/hw/arm/aspeed_ast2400.c
> > @@ -523,6 +523,7 @@ static void
> aspeed_soc_ast2400_class_init(ObjectClass *oc, void *data)
> >   sc->wdts_num = 

RE: [PATCH v2 2/2] aspeed: fix hardcode boot address 0

2024-02-14 Thread Jamin Lin
> -Original Message-
> From: Philippe Mathieu-Daudé 
> Sent: Thursday, February 8, 2024 4:29 AM
> To: Jamin Lin ; Cédric Le Goater ;
> Peter Maydell ; Andrew Jeffery
> ; Joel Stanley ; open
> list:ASPEED BMCs ; open list:All patches CC here
> 
> Cc: Troy Lee 
> Subject: Re: [PATCH v2 2/2] aspeed: fix hardcode boot address 0
> 
> Hi Jamin,
> 
> On 7/2/24 20:52, Jamin Lin via wrote:
> > In the previous design of ASPEED SOCs QEMU model, it set the boot
> > address at "0" which was the hardcode setting for ast10x0, ast2600,
> > ast2500 and ast2400.
> >
> > According to the design of ast2700, it has bootmcu which is used for
> > executing SPL and initialize DRAM, then, CPUs(cortex-a35) execute
> > u-boot, kernel and rofs. QEMU will only support CPU(cortex-a35) parts
> > and the boot address is "0x4 " for ast2700.
> 
> This justification from here ...
> 
> > Therefore, fixed hardcode boot address 0.
> 
> ... to here is still unclear. You provided an explanation in previous patch, 
> maybe
> worth including it in this description?
> 
> Otherwise for the code changes:
Thanks for the review, and sorry for the late reply due to my Chinese New Year holiday.
Will add.
Jamin
> Reviewed-by: Philippe Mathieu-Daudé 
> 
> > Signed-off-by: Troy Lee 
> > Signed-off-by: Jamin Lin 
> > ---
> >   hw/arm/aspeed.c | 4 +++-
> >   hw/arm/aspeed_ast2400.c | 4 ++--
> >   hw/arm/aspeed_ast2600.c | 2 +-
> >   include/hw/arm/aspeed_soc.h | 2 --
> >   4 files changed, 6 insertions(+), 6 deletions(-)



Re: possible deprecation and removal of some old QEMU Arm machine types (pxa2xx, omap, sa1110)

2024-02-14 Thread Arnd Bergmann
On Wed, Feb 14, 2024, at 13:26, Dmitry Baryshkov wrote:
> On Tue, 13 Feb 2024 at 23:22, Linus Walleij  wrote:
>> On Tue, Feb 13, 2024 at 9:12 PM Arnd Bergmann  wrote:
>> > On Tue, Feb 13, 2024, at 16:36, Guenter Roeck wrote:
>> > > On Tue, Feb 13, 2024 at 03:14:21PM +, Peter Maydell wrote:
>>
>> Andrea Adami and Dmitry Eremin-Solenikov did the work in 2017 to
>> modernize it a bit, and Russell helped out. I was under the impression
>> that they only used real hardware though!
>
> I used both Qemu and actual hardware (having collie, poodle, tosa and
> c860 that was easy).
>
> The biggest issue with Zaurus PDAs was that supporting interesting
> parts of the platform (PCMCIA, companion chips) required almost
> rebootstrapping of the corresponding drivers.
> E.g. I had a separate driver for the LoCoMo chip which worked properly
> with the DT systems.
> PCMCIA was a huuuge trouble and it didn't play well at all. The driver
> must be rewritten to use the component framework.

If we want to actually go there, I think the best option for PCMCIA
support is likely to replace the entire "soc_common" pcmcia driver
with a simple drivers/pata/ storage driver and no support for
other cards. There was a driver until commit 38943cbd25a2
("ata: remove palmld pata driver") that could serve as an
template.

  Arnd



Re: possible deprecation and removal of some old QEMU Arm machine types (pxa2xx, omap, sa1110)

2024-02-14 Thread Andrea Adami
On Wed, Feb 14, 2024 at 2:39 PM Marcin Juszkiewicz
 wrote:
>
> Most of OpenZaurus/Ångström developers abandoned Zaurus devices in 2008.
> Usually in favour of Nokia 770/n800/n810 tablets.
>
> Both OpenZaurus and Ångström used own hosting in handhelds.org era.

I joined OpenEmbedded in those years, and it was the only reliable repository.
The most recent kernel work was committed there, in the
meta-handheld layer.
Due to my limited knowledge I could only contribute to finishing MTD
support for Zaurus.

I don't have the devices anymore; looking ahead, I can only imagine
QEMU support, if any.
It was fun, thanks to all for your support!

Cheers
Andrea Adami



[PATCH 0/3] hw/nvme: FDP and SR-IOV enhancements

2024-02-14 Thread Minwoo Im
Hello,

This patchset includes patches for adding Identify data for the
recently added Endurance Group (endgrpid=1) used in FDP, and patches
for increasing the maximum number of SR-IOV VF Resources to support
more resources to enable testing as recent SSDs.

Thanks,

Minwoo Im (3):
  hw/nvme: add Identify Endurance Group List
  hw/nvme: Support SR-IOV VFs more than 127
  hw/nvme: Expand VI/VQ resource to uint32

 hw/nvme/ctrl.c   | 36 +---
 hw/nvme/nvme.h   |  6 +++---
 include/block/nvme.h |  1 +
 3 files changed, 33 insertions(+), 10 deletions(-)

-- 
2.34.1




[PATCH 1/3] hw/nvme: add Identify Endurance Group List

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

Commit 73064edfb864 ("hw/nvme: flexible data placement emulation")
introduced the NVMe FDP feature to nvme-subsys and nvme-ctrl with a
single endurance group #1 supported.  This means the controller should
return proper identify data to the host for Identify Endurance Group List
(CNS 19h), if only for endurance group #1.  This patch allows host
applications to ask which endurance group is available and utilize FDP
through that endurance group.
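A quick way to exercise this new code path from a Linux guest is an Identify admin passthrough with CNS 19h. The sketch below uses the generic passthrough ioctl and assumes a controller node at /dev/nvme0 (run as root):

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

/*
 * Issue Identify with CNS 19h (Endurance Group List) and print the
 * returned identifiers: bytes 0-1 hold the count, bytes 2-3 the first ID.
 */
int main(void)
{
    unsigned char data[4096] = { 0 };
    struct nvme_admin_cmd cmd = {
        .opcode   = 0x06,                              /* Identify */
        .addr     = (unsigned long long)(uintptr_t)data,
        .data_len = sizeof(data),
        .cdw10    = 0x19,                              /* CNS 19h */
        .cdw11    = 0,            /* start from Endurance Group ID 0 */
    };
    int fd = open("/dev/nvme0", O_RDONLY);

    if (fd < 0 || ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
        perror("identify endurance group list");
        return 1;
    }

    int nr = data[0] | (data[1] << 8);
    printf("%d endurance group(s), first id: %d\n",
           nr, nr ? (data[2] | (data[3] << 8)) : -1);
    return 0;
}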

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 22 ++
 include/block/nvme.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f026245d1e9e..cfe53a358871 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5629,6 +5629,26 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, list, data_len, req);
 }
 
+static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
+{
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *nr_ids = &list[0];
+uint16_t *ids = &list[1];
+uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
+
+/*
+ * The current nvme-subsys only supports Endurance Group #1.
+ */
+if (!endgid) {
+*nr_ids = 1;
+ids[0] = 1;
+} else {
+*nr_ids = 0;
+}
+
+return nvme_c2h(n, list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
@@ -5732,6 +5752,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_nslist(n, req, false);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
 return nvme_identify_nslist_csi(n, req, true);
+case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
+return nvme_endurance_group_list(n, req);
 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req, false);
 case NVME_ID_CNS_NS_DESCR_LIST:
diff --git a/include/block/nvme.h b/include/block/nvme.h
index bb231d0b9ad0..7c77d38174a7 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1074,6 +1074,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CTRL_LIST = 0x13,
 NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_SECONDARY_CTRL_LIST   = 0x15,
+NVME_ID_CNS_ENDURANCE_GROUP_LIST  = 0x19,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.34.1




[PATCH 2/3] hw/nvme: Support SR-IOV VFs more than 127

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

The number of virtual functions (VFs) supported in SR-IOV is 64k as per
the spec.  To test a large number of MSI-X vectors mapping to the CPU matrix
in the QEMU system, we need many more than 127 VFs.  This patch adds
support for 256 VFs per physical function (PF).

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 4 ++--
 hw/nvme/nvme.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index cfe53a358871..8198fd2d8e46 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -219,7 +219,7 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
-#define NVME_MAX_VFS 127
+#define NVME_MAX_VFS 256
 #define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
@@ -8425,7 +8425,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
  params.auto_transition_zones, true),
-DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
+DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 5f2ae7b28b9c..db2cda098ebd 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -517,7 +517,7 @@ typedef struct NvmeParams {
 bool auto_transition_zones;
 bool legacy_cmb;
 bool ioeventfd;
-uint8_t  sriov_max_vfs;
+uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
 uint8_t  sriov_max_vq_per_vf;
-- 
2.34.1




[PATCH 3/3] hw/nvme: Expand VI/VQ resource to uint32

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

VI and VQ resources cover the queue resources of each VF in SR-IOV.
The current maximum I/O queue pair size is 0xffff, so we can expand these
fields to cover the full number of I/O queue pairs.

This patch also fixes an Identify Secondary Controller List overflow caused
by the expanded number of secondary controllers.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 10 +-
 hw/nvme/nvme.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8198fd2d8e46..6f3fd96f7572 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5486,7 +5486,7 @@ static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, 
NvmeRequest *req)
 
 for (i = 0; i < num_sec_ctrl; i++) {
 if (n->sec_ctrl_list.sec[i].scid >= min_id) {
-list.numcntl = num_sec_ctrl - i;
+list.numcntl = (num_sec_ctrl - i > 127) ? 127 : num_sec_ctrl - i;
memcpy(&list.sec, n->sec_ctrl_list.sec + i,
list.numcntl * sizeof(NvmeSecCtrlEntry));
 break;
@@ -8430,10 +8430,10 @@ static Property nvme_props[] = {
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
params.sriov_vi_flexible, 0),
-DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
-  params.sriov_max_vi_per_vf, 0),
-DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
-  params.sriov_max_vq_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
+   params.sriov_max_vi_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
+   params.sriov_max_vq_per_vf, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index db2cda098ebd..d0f4c6c9b7af 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -520,8 +520,8 @@ typedef struct NvmeParams {
 uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
-uint8_t  sriov_max_vq_per_vf;
-uint8_t  sriov_max_vi_per_vf;
+uint32_t  sriov_max_vq_per_vf;
+uint32_t  sriov_max_vi_per_vf;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
-- 
2.34.1




Fwd: How do I make my emulated device's DMA go through viommu ?

2024-02-14 Thread Xu Liu


Begin forwarded message:

From: Xu Liu 
Subject: How do I make my emulated device's DMA go through viommu ?
Date: February 14, 2024 at 11:46:04 AM EST
To: "qemu-disc...@nongnu.org" 
Cc: 刘旭 , Xu Liu 

Hello, All

I am working on a project which adds a custom emulated PCI device to QEMU.


Inside the emulated device, I call pci_dma_read/write to access an IOVA
address, for example 0X4003.

After starting QEMU, I bind the emulated device to the vfio-pci driver.

It always complains:

Invalid access at addr 0x4003, size 4, region '(null)', reason: rejected


From my application log, I can see the IOVA is mapped:

{ iova_range: RangeInclusive { start: 70368744374272, end: 70368744378367 }, 
memfd: Memfd { file: File { fd: 167, path: "/memfd:dma_memory_0x4003 
(deleted)", read: true, write: true } }, mmap: MmapRaw { ptr: 0x7feb9bc7a000, 
len: 4096 } }


From the kernel trace inside the QEMU guest, I can see the mapping is there:

workload-697 [000] . 31.476109: map: IOMMU: iova=0x4003 - 
0x40031000 paddr=0x000109eca000 size=4096



From the QEMU trace events, I can see the IOMMU is enabled. 02:00.0 is a PCI
bridge, and 03:00.0 is my emulated device:

pci_update_mappings_add d=0x56461fcf1a40 03:00.0 0,0xfe80+0x10
pci_update_mappings_add d=0x56461fcf1a40 03:00.0 1,0xfe90+0x10
pci_update_mappings_add d=0x56461fcf1a40 03:00.0 2,0x80+0x100
pci_update_mappings_add d=0x56461fcf1a40 03:00.0 4,0x40+0x40
vtd_switch_address_space Device 02:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 02:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 03:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 03:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 02:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 02:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 03:00.0 switching address space (iommu 
enabled=1)
vtd_switch_address_space Device 03:00.0 switching address space (iommu 
enabled=1)


From GDB, it looks like the IOVA resolves to io_mem_unassigned:

$20 = {size = 18446744073709551616, mr = 0x5613a6519840 <io_mem_unassigned>, fv 
= 0x5613a67ab3e0, offset_within_region = 0, offset_within_address_space = 0, 
readonly = false, nonvolatile = false}



Any ideas about this ?

Thanks.

Xu
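One detail worth noting while debugging this: pci_dma_read()/pci_dma_write() already go through the device's own DMA address space (pci_get_address_space(dev)), which is what the vIOMMU remaps, whereas dma_memory_*() on &address_space_memory bypasses translation entirely. A "region '(null)' ... rejected" access usually means there is no valid mapping (or bus mastering is still disabled) at the moment the device issues the access. A hedged fragment contrasting the two paths, with the MyDev-style naming purely illustrative:

#include "qemu/osdep.h"
#include "hw/pci/pci_device.h"
#include "sysemu/dma.h"

/*
 * Illustrative fragment (the function and call sites are assumptions):
 * the first call is translated by the vIOMMU and requires the guest to
 * have enabled bus mastering and programmed a mapping for this device;
 * the second bypasses translation and treats the address as guest-physical.
 */
static void mydev_dma_example(PCIDevice *dev, dma_addr_t iova,
                              void *buf, dma_addr_t len)
{
    pci_dma_read(dev, iova, buf, len);                      /* via vIOMMU */

    dma_memory_read(&address_space_memory, iova, buf, len,  /* no vIOMMU  */
                    MEMTXATTRS_UNSPECIFIED);
}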




Re: [PATCH 04/12] vdpa: factor out vhost_vdpa_net_get_nc_vdpa

2024-02-14 Thread Si-Wei Liu




On 2/14/2024 10:54 AM, Eugenio Perez Martin wrote:

On Wed, Feb 14, 2024 at 1:39 PM Si-Wei Liu  wrote:

Introduce new API. No functional change on existing API.

Acked-by: Jason Wang 
Signed-off-by: Si-Wei Liu 

I'm ok with the new function, but doesn't the compiler complain
because adding a static function is not used?
Hmmm, which one? vhost_vdpa_net_get_nc_vdpa is used by 
vhost_vdpa_net_first_nc_vdpa internally, and 
vhost_vdpa_net_first_nc_vdpa is used by vhost_vdpa_net_cvq_start (Patch 
01). I think we should be fine?


-Siwei



---
  net/vhost-vdpa.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 06c83b4..4168cad 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -281,13 +281,18 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, 
const uint8_t *buf,
  }


-/** From any vdpa net client, get the netclient of the first queue pair */
-static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
+/** From any vdpa net client, get the netclient of the i-th queue pair */
+static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
  {
  NICState *nic = qemu_get_nic(s->nc.peer);
-NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
+NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
+
+return DO_UPCAST(VhostVDPAState, nc, nc_i);
+}

-return DO_UPCAST(VhostVDPAState, nc, nc0);
+static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
+{
+return vhost_vdpa_net_get_nc_vdpa(s, 0);
  }

  static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
--
1.8.3.1






Re: [RFC PATCH 14/14] migration: Fix return-path thread exit

2024-02-14 Thread Fabiano Rosas
Cédric Le Goater  writes:

> Hello Fabiano
>
> On 2/8/24 14:29, Fabiano Rosas wrote:
>> Cédric Le Goater  writes:
>> 
>>> In case of error, close_return_path_on_source() can perform a shutdown
>>> to exit the return-path thread.  However, in migrate_fd_cleanup(),
>>> 'to_dst_file' is closed before calling close_return_path_on_source()
>>> and the shutdown fails, leaving the source and destination waiting for
>>> an event to occur.
>> 
>> Hi, Cédric
>> 
>> Are you sure this is not caused by patch 13? 
>
> It happens with upstream QEMU without any patch.

I might have taken that "shutdown fails" in the commit message too
literally. Anyway, I have a proposed solution:

-->8--
From 729aa7b5b7f130f756d41649fdd0862bd2e90430 Mon Sep 17 00:00:00 2001
From: Fabiano Rosas 
Date: Wed, 14 Feb 2024 16:45:43 -0300
Subject: [PATCH] migration: Join the return path thread before releasing
 to_dst_file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The return path thread might hang at a blocking system call. Before
joining the thread we might need to issue a shutdown() on the socket
file descriptor to release it. To determine whether the shutdown() is
necessary we look at the QEMUFile error.

Make sure we only clean up the QEMUFile after the return path has been
waited for.

This fixes a hang when qemu_savevm_state_setup() produced an error
that was detected by migration_detect_error(). That skips
migration_completion() so close_return_path_on_source() would get
stuck waiting for the RP thread to terminate.

At migrate_fd_cleanup() I'm keeping the relative order of joining the
migration thread and the return path just in case.

Reported-by: Cédric Le Goater 
Signed-off-by: Fabiano Rosas 
---
 migration/migration.c | 36 
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index ab21de2cad..f0b70e8a9d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1326,17 +1326,19 @@ static void migrate_fd_cleanup(MigrationState *s)
 
 qemu_savevm_state_cleanup();
 
+bql_unlock();
+if (s->migration_thread_running) {
+qemu_thread_join(&s->thread);
+s->migration_thread_running = false;
+}
+bql_lock();
+
+close_return_path_on_source(s);
+
 if (s->to_dst_file) {
 QEMUFile *tmp;
 
 trace_migrate_fd_cleanup();
-bql_unlock();
-if (s->migration_thread_running) {
-qemu_thread_join(&s->thread);
-s->migration_thread_running = false;
-}
-bql_lock();
-
 multifd_send_shutdown();
qemu_mutex_lock(&s->qemu_file_lock);
 tmp = s->to_dst_file;
@@ -1350,12 +1352,6 @@ static void migrate_fd_cleanup(MigrationState *s)
 qemu_fclose(tmp);
 }
 
-/*
- * We already cleaned up to_dst_file, so errors from the return
- * path might be due to that, ignore them.
- */
-close_return_path_on_source(s);
-
 assert(!migration_is_active(s));
 
 if (s->state == MIGRATION_STATUS_CANCELLING) {
@@ -2874,6 +2870,13 @@ static MigThrError postcopy_pause(MigrationState *s)
 while (true) {
 QEMUFile *file;
 
+/*
+ * We're already pausing, so ignore any errors on the return
+ * path and just wait for the thread to finish. It will be
+ * re-created when we resume.
+ */
+close_return_path_on_source(s);
+
 /*
  * Current channel is possibly broken. Release it.  Note that this is
  * guaranteed even without lock because to_dst_file should only be
@@ -2893,13 +2896,6 @@ static MigThrError postcopy_pause(MigrationState *s)
 qemu_file_shutdown(file);
 qemu_fclose(file);
 
-/*
- * We're already pausing, so ignore any errors on the return
- * path and just wait for the thread to finish. It will be
- * re-created when we resume.
- */
-close_return_path_on_source(s);
-
migrate_set_state(&s->state, s->state,
   MIGRATION_STATUS_POSTCOPY_PAUSED);
 
-- 
2.35.3
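
A minimal sketch of the ordering the commit message argues for, assuming the
current MigrationState layout (illustrative only, not part of the patch; the
rp_state field names are assumptions taken from existing migration code):

/*
 * Illustrative sketch only: release a return path thread that may be
 * blocked in a system call, then join it.  The shutdown() is only
 * needed when the outgoing QEMUFile has already hit an error.
 */
static void return_path_release_and_join(MigrationState *s)
{
    if (s->rp_state.from_dst_file && qemu_file_get_error(s->to_dst_file)) {
        /* unblock any recv the return path thread is stuck in */
        qemu_file_shutdown(s->rp_state.from_dst_file);
    }
    qemu_thread_join(&s->rp_state.rp_thread);
    s->rp_state.rp_thread_created = false;
}

The point being that this runs before to_dst_file is released, never after.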



Re: [PATCH v2 6/7] vdpa: move iova_tree allocation to net_vhost_vdpa_init

2024-02-14 Thread Eugenio Perez Martin
On Wed, Feb 14, 2024 at 7:29 PM Si-Wei Liu  wrote:
>
> Hi Michael,
>
> On 2/13/2024 2:22 AM, Michael S. Tsirkin wrote:
> > On Mon, Feb 05, 2024 at 05:10:36PM -0800, Si-Wei Liu wrote:
> >> Hi Eugenio,
> >>
> >> I thought this new code looks good to me and the original issue I saw with
> >> x-svq=on should be gone. However, after rebase my tree on top of this,
> >> there's a new failure I found around setting up guest mappings at early
> >> boot, please see attached the specific QEMU config and corresponding event
> >> traces. Haven't checked into the detail yet, thinking you would need to be
> >> aware of ahead.
> >>
> >> Regards,
> >> -Siwei
> > Eugenio were you able to reproduce? Siwei did you have time to
> > look into this?
> Didn't get a chance to look into the details yet in the past week, but
> thought it may have something to do with the (internals of) iova tree
> range allocation and the lookup routine. It started to fall apart at the
> first vhost_vdpa_dma_unmap call showing up in the trace events, where it
> should've gotten IOVA=0x201000, but an incorrect IOVA address
> 0x1000 ended up being returned from the iova tree lookup routine.
>
> HVA                              GPA                     IOVA
> --------------------------------------------------------------------------------
> Map
> [0x7f7903e0, 0x7f7983e0)         [0x0, 0x8000)           [0x1000, 0x8000)
> [0x7f7983e0, 0x7f9903e0)         [0x1, 0x208000)         [0x80001000, 0x201000)
> [0x7f7903ea, 0x7f7903ec)         [0xfeda, 0xfedc)        [0x201000, 0x221000)
>
> Unmap
> [0x7f7903ea, 0x7f7903ec)         [0xfeda, 0xfedc)        [0x1000, 0x2) ???
>                                                          shouldn't it be [0x201000, 0x221000) ???
>

Yes, I'm still not able to reproduce. In particular, I don't know how
the memory listener adds a region and then releases a region with a
different size. I'm talking about these log entries:

1706854838.154394:vhost_vdpa_listener_region_add vdpa: 0x556d45c75140 iova 0x0 llend 0x8000 vaddr: 0x7f7903e0 read-only: 0
452:vhost_vdpa_listener_region_del vdpa: 0x556d45c75140 iova 0x0 llend 0x7fff

Is it possible for you to also trace the skipped regions? We should
add a debug trace there too...

Thanks!
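
For concreteness, a rough sketch of what such a debug trace could look like
on the skip path of the vdpa memory listener. The trace event name is an
assumption (a matching entry in hw/virtio/trace-events would still be needed),
and iova_min/iova_max/page_mask stand for whatever the existing skip helper
already receives:

/* Hypothetical addition at the top of vhost_vdpa_listener_region_add()
 * and _region_del(): report sections the listener decides to skip. */
if (vhost_vdpa_listener_skipped_section(section, iova_min, iova_max,
                                        page_mask)) {
    trace_vhost_vdpa_listener_region_skip(
        section->offset_within_address_space,
        section->offset_within_address_space +
        int128_get64(section->size) - 1);
    return;
}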

> PS, I will be taking off from today and for the next two weeks. Will try
> to help out looking more closely after I get back.
>
> -Siwei
> >   Can't merge patches which are known to break things ...
>




Re: [PATCH v2] target/ppc: Move add and subf type fixed-point arithmetic instructions to decodetree

2024-02-14 Thread Richard Henderson

On 2/13/24 23:40, Chinmay Rath wrote:

This patch moves the below instructions to the decodetree specification:

 {add, subf}[c,e,me,ze][o][.]   : XO-form
 addic[.], subfic   : D-form
 addex  : Z23-form

This patch introduces XO-form instructions into the decodetree specification;
all four variations ([o][.]) are handled with a single pattern.
The changes were verified by checking that the TCG ops generated by those
instructions remain the same, captured with the '-d in_asm,op' flag.

Signed-off-by: Chinmay Rath
---
Changes v1 -> v2 :
Reused X format for ADDEX instead of creating a new Z23_tab_cy
format. (Richard)
Added necessary instruction flag checks for ADDEX. (self-review)
---
  target/ppc/insn32.decode   |  26 
  target/ppc/translate.c | 136 -
  target/ppc/translate/fixedpoint-impl.c.inc |  70 +++
  3 files changed, 96 insertions(+), 136 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 04/12] vdpa: factor out vhost_vdpa_net_get_nc_vdpa

2024-02-14 Thread Eugenio Perez Martin
On Wed, Feb 14, 2024 at 1:39 PM Si-Wei Liu  wrote:
>
> Introduce new API. No functional change on existing API.
>
> Acked-by: Jason Wang 
> Signed-off-by: Si-Wei Liu 

I'm ok with the new function, but doesn't the compiler complain
because the newly added static function is not used yet?

> ---
>  net/vhost-vdpa.c | 13 +
>  1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 06c83b4..4168cad 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -281,13 +281,18 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, 
> const uint8_t *buf,
>  }
>
>
> -/** From any vdpa net client, get the netclient of the first queue pair */
> -static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> +/** From any vdpa net client, get the netclient of the i-th queue pair */
> +static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
>  {
>  NICState *nic = qemu_get_nic(s->nc.peer);
> -NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
> +NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
> +
> +return DO_UPCAST(VhostVDPAState, nc, nc_i);
> +}
>
> -return DO_UPCAST(VhostVDPAState, nc, nc0);
> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> +{
> +return vhost_vdpa_net_get_nc_vdpa(s, 0);
>  }
>
>  static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
> --
> 1.8.3.1
>
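
Not part of the patch, but a minimal usage sketch of the new helper, assuming
callers want to reach the backing state of an arbitrary data queue pair (the
function name and data_queue_pairs parameter are illustrative):

/* Illustrative only: walk every data queue pair and touch its vdpa state.
 * The ->vhost_vdpa member mirrors the existing VhostVDPAState layout. */
static void example_walk_queue_pairs(VhostVDPAState *s, int data_queue_pairs)
{
    for (int i = 0; i < data_queue_pairs; i++) {
        VhostVDPAState *s_i = vhost_vdpa_net_get_nc_vdpa(s, i);

        /* ... operate on s_i->vhost_vdpa for the i-th queue pair ... */
        (void)s_i;
    }
}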




Re: [PATCH] ci: Fix again build-previous-qemu

2024-02-14 Thread Fabiano Rosas
Paolo Bonzini  writes:

> The build-previous-qemu job is now trying to fetch from the upstream
> repository, but the tag is only fetched into FETCH_HEAD:
>
> $ git remote add upstream https://gitlab.com/qemu-project/qemu 00:00
> $ git fetch upstream $QEMU_PREV_VERSION 00:02
> warning: redirecting to https://gitlab.com/qemu-project/qemu.git/
> From https://gitlab.com/qemu-project/qemu
>  * tag v8.2.0 -> FETCH_HEAD
> $ git checkout $QEMU_PREV_VERSION 00:02
> error: pathspec v8.2.0 did not match any file(s) known to git
>
> Fix by fetching the tag into the checkout itself.
>
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fabiano Rosas 



Re: [PATCH v2 1/2] vhost: dirty log should be per backend type

2024-02-14 Thread Si-Wei Liu

Hi Michael,

I'm taking off for 2+ weeks, but please feel free to provide comments and
feedback while I'm off. I'll still be checking emails, and will address any
open items as soon as I am back.


Thanks,
-Siwei

On 2/14/2024 3:50 AM, Si-Wei Liu wrote:

There could be a mix of both vhost-user and vhost-kernel clients
in the same QEMU process, where separate vhost loggers for the
specific vhost type have to be used. Make the vhost logger per
backend type, and have them properly reference counted.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Si-Wei Liu 
---
  hw/virtio/vhost.c | 49 +
  1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79..ef6d9b5 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -43,8 +43,8 @@
  do { } while (0)
  #endif
  
-static struct vhost_log *vhost_log;

-static struct vhost_log *vhost_log_shm;
+static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
+static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
  
  /* Memslots used by backends that support private memslots (without an fd). */

  static unsigned int used_memslots;
@@ -287,6 +287,8 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
  r = -1;
  }
  
+assert(dev->vhost_ops->backend_type == backend_type || r < 0);

+
  return r;
  }
  
@@ -319,16 +321,23 @@ static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)

  return log;
  }
  
-static struct vhost_log *vhost_log_get(uint64_t size, bool share)

+static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
+   uint64_t size, bool share)
  {
-struct vhost_log *log = share ? vhost_log_shm : vhost_log;
+struct vhost_log *log;
+
+if (backend_type == VHOST_BACKEND_TYPE_NONE ||
+backend_type >= VHOST_BACKEND_TYPE_MAX)
+return NULL;
+
+log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
  
  if (!log || log->size != size) {

  log = vhost_log_alloc(size, share);
  if (share) {
-vhost_log_shm = log;
+vhost_log_shm[backend_type] = log;
  } else {
-vhost_log = log;
+vhost_log[backend_type] = log;
  }
  } else {
  ++log->refcnt;
@@ -340,11 +349,20 @@ static struct vhost_log *vhost_log_get(uint64_t size, 
bool share)
  static void vhost_log_put(struct vhost_dev *dev, bool sync)
  {
  struct vhost_log *log = dev->log;
+VhostBackendType backend_type;
  
  if (!log) {

  return;
  }
  
+assert(dev->vhost_ops);

+backend_type = dev->vhost_ops->backend_type;
+
+if (backend_type == VHOST_BACKEND_TYPE_NONE ||
+backend_type >= VHOST_BACKEND_TYPE_MAX) {
+return;
+}
+
  --log->refcnt;
  if (log->refcnt == 0) {
  /* Sync only the range covered by the old log */
@@ -352,13 +370,13 @@ static void vhost_log_put(struct vhost_dev *dev, bool 
sync)
  vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
  }
  
-if (vhost_log == log) {

+if (vhost_log[backend_type] == log) {
  g_free(log->log);
-vhost_log = NULL;
-} else if (vhost_log_shm == log) {
+vhost_log[backend_type] = NULL;
+} else if (vhost_log_shm[backend_type] == log) {
  qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
  log->fd);
-vhost_log_shm = NULL;
+vhost_log_shm[backend_type] = NULL;
  }
  
  g_free(log);

@@ -376,7 +394,8 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
  
  static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

  {
-struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
+struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
+  size, vhost_dev_log_is_shared(dev));
  uint64_t log_base = (uintptr_t)log->log;
  int r;
  
@@ -2037,8 +2056,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)

  uint64_t log_base;
  
  hdev->log_size = vhost_get_log_size(hdev);

-hdev->log = vhost_log_get(hdev->log_size,
+hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
+  hdev->log_size,
vhost_dev_log_is_shared(hdev));
+if (!hdev->log) {
+VHOST_OPS_DEBUG(r, "vhost_log_get failed");
+goto fail_vq;
+}
+
  log_base = (uintptr_t)hdev->log->log;
  r = hdev->vhost_ops->vhost_set_log_base(hdev,
  hdev->log_size ? log_base : 0,
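
For reference, a hedged sketch of how the per-backend logger is meant to be
used by a device around start/stop; this only restates the hunks above in
caller form and is not additional code from the patch:

/* Illustrative only: acquire and release the per-backend dirty log. */
hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                          vhost_get_log_size(hdev),
                          vhost_dev_log_is_shared(hdev));
if (!hdev->log) {
    /* invalid or unset backend type */
    return -EINVAL;
}

/* ... run with dirty logging enabled ... */

vhost_log_put(hdev, true);   /* drops the per-backend refcount; freed at zero */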





Re: [PATCH v2 6/7] vdpa: move iova_tree allocation to net_vhost_vdpa_init

2024-02-14 Thread Si-Wei Liu

Hi Eugenio,

Just to answer the question you had in the sync meeting: I've just tried,
and the issue is also reproducible even with the VGA device and VNC display
removed, and also reproducible with 8G mem size. You already knew that I can
only repro with x-svq=on.


Regards,
-Siwei

On 2/13/2024 8:26 AM, Eugenio Perez Martin wrote:

On Tue, Feb 13, 2024 at 11:22 AM Michael S. Tsirkin  wrote:

On Mon, Feb 05, 2024 at 05:10:36PM -0800, Si-Wei Liu wrote:

Hi Eugenio,

I thought this new code looks good to me and the original issue I saw with
x-svq=on should be gone. However, after rebase my tree on top of this,
there's a new failure I found around setting up guest mappings at early
boot, please see attached the specific QEMU config and corresponding event
traces. Haven't checked into the detail yet, thinking you would need to be
aware of ahead.

Regards,
-Siwei

Eugenio were you able to reproduce? Siwei did you have time to
look into this? Can't merge patches which are known to break things ...


Sorry for the lack of news, I'll try to reproduce this week. Meanwhile
this patch should not be merged, as you mention.

Thanks!






Re: [PATCH v2 6/7] vdpa: move iova_tree allocation to net_vhost_vdpa_init

2024-02-14 Thread Si-Wei Liu

Hi Michael,

On 2/13/2024 2:22 AM, Michael S. Tsirkin wrote:

On Mon, Feb 05, 2024 at 05:10:36PM -0800, Si-Wei Liu wrote:

Hi Eugenio,

I thought this new code looks good to me and the original issue I saw with
x-svq=on should be gone. However, after rebase my tree on top of this,
there's a new failure I found around setting up guest mappings at early
boot, please see attached the specific QEMU config and corresponding event
traces. Haven't checked into the detail yet, thinking you would need to be
aware of ahead.

Regards,
-Siwei

Eugenio were you able to reproduce? Siwei did you have time to
look into this?
Didn't get a chance to look into the details yet in the past week, but
thought it may have something to do with the (internals of) iova tree
range allocation and the lookup routine. It started to fall apart at the
first vhost_vdpa_dma_unmap call showing up in the trace events, where it
should've gotten IOVA=0x201000, but an incorrect IOVA address
0x1000 ended up being returned from the iova tree lookup routine.


HVA                              GPA                     IOVA
--------------------------------------------------------------------------------
Map
[0x7f7903e0, 0x7f7983e0)         [0x0, 0x8000)           [0x1000, 0x8000)
[0x7f7983e0, 0x7f9903e0)         [0x1, 0x208000)         [0x80001000, 0x201000)
[0x7f7903ea, 0x7f7903ec)         [0xfeda, 0xfedc)        [0x201000, 0x221000)

Unmap
[0x7f7903ea, 0x7f7903ec)         [0xfeda, 0xfedc)        [0x1000, 0x2) ???
                                                         shouldn't it be [0x201000, 0x221000) ???
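
A hedged sketch of the reverse (HVA -> IOVA) lookup in question, using QEMU's
util/iova-tree API; the helper name is illustrative and the addresses are
copied as-is from the trace above. The expectation is that the third Map
entry's IOVA, not the first one's, is returned for the Unmap:

#include "qemu/iova-tree.h"

/* Illustrative only: look up the IOVA that backs a given HVA range. */
static hwaddr find_iova_for_hva(IOVATree *tree, hwaddr translated, hwaddr len)
{
    const DMAMap needle = {
        .translated_addr = translated,
        .size = len - 1,            /* DMAMap sizes are inclusive */
    };
    const DMAMap *found = iova_tree_find_iova(tree, &needle);

    /* for the [0xfeda.., 0xfedc..) chunk this should be 0x201000... */
    return found ? found->iova : HWADDR_MAX;
}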


PS, I will be taking off from today and for the next two weeks. Will try 
to help out looking more closely after I get back.


-Siwei

  Can't merge patches which are known to break things ...




Re: [PULL 00/60] virtio,pc,pci: features, cleanups, fixes

2024-02-14 Thread Peter Maydell
On Wed, 14 Feb 2024 at 11:19, Michael S. Tsirkin  wrote:
>
> On Wed, Feb 14, 2024 at 06:13:16AM -0500, Michael S. Tsirkin wrote:
> > The following changes since commit df50424b4dcfde823047d3717abd6a61224ea205:
> >
> >   Merge tag 'pull-riscv-to-apply-20240209' of 
> > https://github.com/alistair23/qemu into staging (2024-02-09 16:15:01 +)
> >
> > are available in the Git repository at:
> >
> >   https://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git tags/for_upstream
> >
> > for you to fetch changes up to 3afdb6d18e9ccd6470be30f151a562cf4537d13f:
>
>
> 1dd6954c3f5c5c610cf94b6f740118e565957293 now - dropped a duplicate
> SOB from commit log.
>
> >   MAINTAINERS: Switch to my Enfabrica email (2024-02-14 06:09:33 -0500)
> >
> > 
> > virtio,pc,pci: features, cleanups, fixes
> >
> > vhost-user-snd support
> > x2APIC mode with TCG support
> > CXL update to r3.1
> >
> > fixes, cleanups all over the place.
> >
> > Signed-off-by: Michael S. Tsirkin 
> >
> > 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0
for any user-visible changes.

-- PMM



Re: [External] [PATCH v2 05/23] migration/multifd: Drop MultiFDSendParams.normal[] array

2024-02-14 Thread Fabiano Rosas
Hao Xiang  writes:

> On Fri, Feb 9, 2024 at 4:20 AM Fabiano Rosas  wrote:
>>
>> Hao Xiang  writes:
>>
>> > On Fri, Feb 2, 2024 at 2:30 AM  wrote:
>> >>
>> >> From: Peter Xu 
>> >>
>> >> This array is redundant when p->pages exists.  Now we extended the life of
>> >> p->pages to the whole period where pending_job is set, it should be safe 
>> >> to
>> >> always use p->pages->offset[] rather than p->normal[].  Drop the array.
>> >>
>> >> Alongside, the normal_num is also redundant, which is the same to
>> >> p->pages->num.
>> >
>> > Can we not drop p->normal and p->normal_num? They are redundant now but I
>> > think they will be needed for multifd zero page checking. In multifd
>> > zero page, we find out all zero pages and we sort the normal pages and
>> > zero pages in two separate arrays. p->offset is the original array of
>> > pages, p->normal will contain the array of normal pages and p->zero
>> > will contain the array of zero pages.
>>
>> We're moving send_fill_packet into send_prepare(), so you should be able
>> to do whatever data transformation at send_prepare() and add any fields
>> you need into p->pages.
>>
>> If we keep p->normal we will not be able to switch into an opaque
>> payload later on. There should be no mention of pages outside of
>> hooks. This is long-term work, but let's avoid blocking it if possible.
>>
>
> Got it. I will make the proper changes.
>
> Aside from that, I would like to get opinions from you guys regarding
> zero page detection interface.
> Here are the options I am thinking:
> 1) Do zero page detection in send_prepare().
> This means no dedicated hook for zero_page_detection() otherwise we
> will be calling a hook from inside a hook. But we will need a new
> function multifd_zero_page_check_send() similar to how we use
> multifd_send_fill_packet() now. multifd_zero_page_check_send() will
> need to be called by all send_prepare() implementations.
> 2) Do zero page detection in a new hook zero_page_detection().
> zero_page_detection will be called before send_prepare(). Seems like
> extra complexity but I can go with that routine if you guys think it's
> a cleaner way.
>
> I am leaning towards 1) right now.

That's fine. Zero page detection is only needed for ram migration. Once
we start using multifd to transfer generic device state, then there will
be no zero page detection. So send_prepare() seems like a good place to
put it.
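
To make option 1 concrete, here is a rough sketch of what a
multifd_zero_page_check_send() called from send_prepare() could look like.
The normal[]/zero[] arrays and their counters are hypothetical additions to
MultiFDPages_t, not existing fields, and the rest follows the structures used
in this thread:

/* Illustrative only: partition p->pages into normal and zero pages
 * before the packet is filled. */
static void multifd_zero_page_check_send(MultiFDSendParams *p)
{
    MultiFDPages_t *pages = p->pages;

    pages->normal_num = 0;
    pages->zero_num = 0;

    for (uint32_t i = 0; i < pages->num; i++) {
        void *page = pages->block->host + pages->offset[i];

        if (buffer_is_zero(page, p->page_size)) {
            pages->zero[pages->zero_num++] = pages->offset[i];
        } else {
            pages->normal[pages->normal_num++] = pages->offset[i];
        }
    }
}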

>> >>
>> >> This doesn't apply to recv side, because there's no extra buffering on 
>> >> recv
>> >> side, so p->normal[] array is still needed.
>> >>
>> >> Reviewed-by: Fabiano Rosas 
>> >> Signed-off-by: Peter Xu 
>> >> ---
>> >>  migration/multifd.h  |  4 
>> >>  migration/multifd-zlib.c |  7 ---
>> >>  migration/multifd-zstd.c |  7 ---
>> >>  migration/multifd.c  | 33 +
>> >>  4 files changed, 21 insertions(+), 30 deletions(-)
>> >>
>> >> diff --git a/migration/multifd.h b/migration/multifd.h
>> >> index 7c040cb85a..3920bdbcf1 100644
>> >> --- a/migration/multifd.h
>> >> +++ b/migration/multifd.h
>> >> @@ -122,10 +122,6 @@ typedef struct {
>> >>  struct iovec *iov;
>> >>  /* number of iovs used */
>> >>  uint32_t iovs_num;
>> >> -/* Pages that are not zero */
>> >> -ram_addr_t *normal;
>> >> -/* num of non zero pages */
>> >> -uint32_t normal_num;
>> >>  /* used for compression methods */
>> >>  void *data;
>> >>  }  MultiFDSendParams;
>> >> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> >> index 37ce48621e..100809abc1 100644
>> >> --- a/migration/multifd-zlib.c
>> >> +++ b/migration/multifd-zlib.c
>> >> @@ -116,17 +116,18 @@ static void zlib_send_cleanup(MultiFDSendParams *p, 
>> >> Error **errp)
>> >>   */
>> >>  static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>> >>  {
>> >> +MultiFDPages_t *pages = p->pages;
>> >>  struct zlib_data *z = p->data;
>> >>  z_stream *zs = &z->zs;
>> >>  uint32_t out_size = 0;
>> >>  int ret;
>> >>  uint32_t i;
>> >>
>> >> -for (i = 0; i < p->normal_num; i++) {
>> >> +for (i = 0; i < pages->num; i++) {
>> >>  uint32_t available = z->zbuff_len - out_size;
>> >>  int flush = Z_NO_FLUSH;
>> >>
>> >> -if (i == p->normal_num - 1) {
>> >> +if (i == pages->num - 1) {
>> >>  flush = Z_SYNC_FLUSH;
>> >>  }
>> >>
>> >> @@ -135,7 +136,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, 
>> >> Error **errp)
>> >>   * with compression. zlib does not guarantee that this is safe,
>> >>   * therefore copy the page before calling deflate().
>> >>   */
>> >> -memcpy(z->buf, p->pages->block->host + p->normal[i], 
>> >> p->page_size);
>> >> +memcpy(z->buf, p->pages->block->host + pages->offset[i], 
>> >> p->page_size);
>> >>  zs->avail_in = p->page_size;
>> >>  zs->next_in = z->buf;
>> >>
>> >> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
>> >> index 

Re: [PATCH v4 9/9] hw/nvme: Refer to dev->exp.sriov_pf.num_vfs

2024-02-14 Thread Akihiko Odaki

On 2024/02/15 1:34, Michael S. Tsirkin wrote:

On Thu, Feb 15, 2024 at 01:07:29AM +0900, Akihiko Odaki wrote:

On 2024/02/15 0:46, Michael S. Tsirkin wrote:

On Wed, Feb 14, 2024 at 11:09:50PM +0900, Akihiko Odaki wrote:

On 2024/02/14 16:07, Michael S. Tsirkin wrote:

On Wed, Feb 14, 2024 at 02:13:47PM +0900, Akihiko Odaki wrote:

NumVFs may not be equal to the current effective number of VFs because VF
Enable is cleared, NumVFs is set after VF Enable is set, or NumVFs is
greater than TotalVFs.

Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization Management 
command")
Signed-off-by: Akihiko Odaki 


I don't get what this is saying about VF enable.
This code will not trigger on numVFs write when VF enable is set.
Generally this commit makes no sense on its own, squash it with
the pci core change pls.


This code is meant to run when it is clearing VF Enable, and its
functionality is to change the state of VFs currently enabled so that we can
disable them.

However, NumVFs does not necessarily represent the VFs currently enabled,
and may have a different value in the cases described above.


Ah so in this case, if numvfs is changed and then VFs are disabled,
we will not call nvme_virt_set_state? OK, it should say this in commit log.
And then, what happens?


We will call nvme_virt_set_state() but only for VFs already enabled.


And? What does it cause? memory leak? some buffer is overrun?
the guest behaviour is illegal so it does not really
matter what we do as long as nothing too bad happens.


nvme_sriov_pre_write_ctrl() is intended to free resources allocated to
VFs. Previously, it used NumVFs to iterate over the VFs with resources
allocated. If NumVFs is changed and then VFs are disabled, this iteration
resulted in a buffer overrun.


With this patch, the changed value of NumVFs will be ignored, and 
nvme_sriov_pre_write_ctrl() only frees resources allocated to VFs 
actually enabled, thus no buffer overrun happens.







Such cases exist
even before the earlier patches and this fix is independently meaningful.


yes but the previous patch causes a regression without this one.
squash it.


I'll move this patch before the previous patch.







---
hw/nvme/ctrl.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f8df622fe590..daedda5d326f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8481,7 +8481,7 @@ static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, 
uint32_t address,
NvmeSecCtrlEntry *sctrl;
uint16_t sriov_cap = dev->exp.sriov_cap;
uint32_t off = address - sriov_cap;
-int i, num_vfs;
+int i;
if (!sriov_cap) {
return;
@@ -8489,8 +8489,7 @@ static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, 
uint32_t address,
if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
if (!(val & PCI_SRIOV_CTRL_VFE)) {
-num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
-for (i = 0; i < num_vfs; i++) {
+for (i = 0; i < dev->exp.sriov_pf.num_vfs; i++) {


If the assumption you now make is that num_vfs only changes
when VFs are disabled, we should add a comment documenting this.
In fact, I think there's a nicer way to do this:

static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
uint32_t val, int len)
{
  int old_num_vfs = dev->exp.sriov_pf.num_vfs;

  pci_default_write_config(dev, address, val, len);
  pcie_cap_flr_write_config(dev, address, val, len);
  nvme_sriov_pre_write_ctrl(dev, address, val, len, old_num_vfs);
}

and now, nvme_sriov_pre_write_ctrl can compare:

if (old_num_vfs && !dev->exp.sriov_pf.num_vfs)
disable everything


this, without bothering with detail of SRIOV capability.
No?


It looks better. I'll do so in the next version.
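
For the record, a hedged sketch of how the "disable everything" placeholder
could be filled in using the old/new num_vfs comparison. This is not the
actual next version of the patch; the secondary-controller walk assumes the
same sctrl/scid handling as the existing pre-write hook:

/* Illustrative only: offline the controllers of VFs that were enabled
 * before this config write and are no longer enabled after it. */
static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
{
    NvmeCtrl *n = NVME(dev);
    int i;

    for (i = dev->exp.sriov_pf.num_vfs; i < old_num_vfs; i++) {
        NvmeSecCtrlEntry *sctrl = &n->sec_ctrl_list.sec[i];

        nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
    }
}

static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
                                  uint32_t val, int len)
{
    uint16_t old_num_vfs = dev->exp.sriov_pf.num_vfs;

    pci_default_write_config(dev, address, val, len);
    pcie_cap_flr_write_config(dev, address, val, len);
    nvme_sriov_post_write_config(dev, old_num_vfs);
}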






Re: [PATCH v4 9/9] hw/nvme: Refer to dev->exp.sriov_pf.num_vfs

2024-02-14 Thread Michael S. Tsirkin
On Thu, Feb 15, 2024 at 01:07:29AM +0900, Akihiko Odaki wrote:
> On 2024/02/15 0:46, Michael S. Tsirkin wrote:
> > On Wed, Feb 14, 2024 at 11:09:50PM +0900, Akihiko Odaki wrote:
> > > On 2024/02/14 16:07, Michael S. Tsirkin wrote:
> > > > On Wed, Feb 14, 2024 at 02:13:47PM +0900, Akihiko Odaki wrote:
> > > > > NumVFs may not be equal to the current effective number of VFs because VF
> > > > > Enable is cleared, NumVFs is set after VF Enable is set, or NumVFs is
> > > > > greater than TotalVFs.
> > > > > 
> > > > > Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization 
> > > > > Management command")
> > > > > Signed-off-by: Akihiko Odaki 
> > > > 
> > > > I don't get what this is saying about VF enable.
> > > > This code will not trigger on numVFs write when VF enable is set.
> > > > Generally this commit makes no sense on its own, squash it with
> > > > the pci core change pls.
> > > 
> > > This code is meant to run when it is clearing VF Enable, and its
> > > functionality is to change the state of VFs currently enabled so that we 
> > > can
> > > disable them.
> > > 
> > > However, NumVFs does not necessarily represent VFs currently being 
> > > enabled,
> > > and have a different value in the case described above.
> > 
> > Ah so in this case, if numvfs is changed and then VFs are disabled,
> > we will not call nvme_virt_set_state? OK, it should say this in commit log.
> > And then, what happens?
> 
> We will call nvme_virt_set_state() but only for VFs already enabled.

And? What does it cause? memory leak? some buffer is overrun?
the guest behaviour is illegal so it does not really
matter what we do as long as nothing too bad happens.

> > 
> > > Such cases exist
> > > even before the earlier patches and this fix is independently meaningful.
> > 
> > yes but the previous patch causes a regression without this one.
> > squash it.
> 
> I'll move this patch before the previous patch.
> 
> > 
> > 
> > > > 
> > > > > ---
> > > > >hw/nvme/ctrl.c | 5 ++---
> > > > >1 file changed, 2 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > > > > index f8df622fe590..daedda5d326f 100644
> > > > > --- a/hw/nvme/ctrl.c
> > > > > +++ b/hw/nvme/ctrl.c
> > > > > @@ -8481,7 +8481,7 @@ static void nvme_sriov_pre_write_ctrl(PCIDevice 
> > > > > *dev, uint32_t address,
> > > > >NvmeSecCtrlEntry *sctrl;
> > > > >uint16_t sriov_cap = dev->exp.sriov_cap;
> > > > >uint32_t off = address - sriov_cap;
> > > > > -int i, num_vfs;
> > > > > +int i;
> > > > >if (!sriov_cap) {
> > > > >return;
> > > > > @@ -8489,8 +8489,7 @@ static void nvme_sriov_pre_write_ctrl(PCIDevice 
> > > > > *dev, uint32_t address,
> > > > >if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
> > > > >if (!(val & PCI_SRIOV_CTRL_VFE)) {
> > > > > -num_vfs = pci_get_word(dev->config + sriov_cap + 
> > > > > PCI_SRIOV_NUM_VF);
> > > > > -for (i = 0; i < num_vfs; i++) {
> > > > > +for (i = 0; i < dev->exp.sriov_pf.num_vfs; i++) {
> > 
> > If the assumption you now make is that num_vfs only changes
> > when VFs are disabled, we should add a comment documenting this.
> > In fact, I think there's a nicer way to do this:
> > 
> > static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
> >uint32_t val, int len)
> > {
> >  int old_num_vfs = dev->exp.sriov_pf.num_vfs;
> > 
> >  pci_default_write_config(dev, address, val, len);
> >  pcie_cap_flr_write_config(dev, address, val, len);
> >  nvme_sriov_pre_write_ctrl(dev, address, val, len, old_num_vfs);
> > }
> > 
> > and now, nvme_sriov_pre_write_ctrl can compare:
> > 
> > if (old_num_vfs && !dev->exp.sriov_pf.num_vfs)
> > disable everything
> > 
> > 
> > this, without bothering with detail of SRIOV capability.
> > No?
> 
> It looks better. I'll do so in the next version.



