date:20220629

[PATCH v3 14/14] hw/arm/aspeed: Add oby35-cl machine

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

The fby35 machine includes 4 server boards, each of which has a "bridge
interconnect" (BIC). This chip abstracts the pinout for the server board
into a single endpoint that the baseboard management controller (BMC)
can talk to using IPMB.

This commit adds a machine for testing the BIC on the server board. It
runs OpenBIC (https://github.com/facebook/openbic) and the server board
is called CraterLake, so the code name is oby35-cl. There's also a
variant of the baseboard that replaces the BMC with a BIC, but that
machine is not included here.

A test image can be built from https://github.com/facebook/openbic using
the instructions in the README.md to build the meta-facebook/yv35-cl
recipe, or retrieved from my Github:

wget https://github.com/peterdelevoryas/OpenBIC/releases/download/oby35-cl-2022.17.01/Y35BCL.elf

And you can run this machine with the following command:

qemu-system-arm -machine oby35-cl -nographic -kernel Y35BCL.elf

It should produce output like the following:

[00:00:00.005,000]  usb_dc_aspeed: select ep[0x81] as IN endpoint
[00:00:00.006,000]  usb_dc_aspeed: select ep[0x82] as IN endpoint
[00:00:00.006,000]  usb_dc_aspeed: pre-selected ep[0x1] as IN endpoint
[00:00:00.006,000]  usb_dc_aspeed: pre-selected ep[0x2] as IN endpoint
[00:00:00.006,000]  usb_dc_aspeed: select ep[0x3] as OUT endpoint
*** Booting Zephyr OS build v00.01.05  ***
Hello, welcome to yv35 craterlake 2022.25.1
BIC class type(class-1), 1ou present status(0), 2ou present status(0), 
board revision(0x1)
check_vr_type: i2c4 0x62 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x62 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x62 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x62 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x76 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 0 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
check_vr_type: i2c4 0x60 page 1 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff 
ff ff]
[init_drive_type] sensor 0x14 post sensor read failed!

[init_drive_type] sensor 0x30 post sensor read failed!
[init_drive_type] sensor 0x39 post sensor read failed!
ipmi_init
[set_DC_status] gpio number(15) status(0)
[set_post_status] gpio number(1) status(1)
uart:~$ [00:00:01.010,000]  kcs_aspeed: KCS3: addr=0xca2, idr=0x2c, 
odr=0x38, str=0x44

[00:00:01.016,000]  spi_nor_multi_dev: [1216][spi1_cs0]SFDP magic 
 invalid
[00:00:01.016,000]  spi_nor_multi_dev: [1456]SFDP read failed: -22
[00:00:01.010,000]  kcs_aspeed: KCS3: addr=0xca2, idr=0x2c, odr=0x38, 
str=0x44

[00:00:01.016,000]  spi_nor_multi_dev: [1216][spi1_cs0]SFDP magic 
 invalid
[00:00:01.016,000]  spi_nor_multi_dev: [1456]SFDP read failed: -22
uart:~$ BIC Ready

Signed-off-by: Peter Delevoryas 
---
 hw/arm/aspeed.c | 48 
 1 file changed, 48 insertions(+)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index a06f7c1b62..75971ef2ca 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -1429,6 +1429,50 @@ static void 
aspeed_minibmc_machine_ast1030_evb_class_init(ObjectClass *oc,
 amc->macs_mask = 0;
 }
 
+static void oby35_cl_i2c_init(AspeedMachineState *bmc)
+{
+AspeedSoCState *soc = >soc;
+I2CBus *i2c[14];
+I2CBus *ssd[8];
+int i;
+
+for (i = 0; i < 14; i++) {
+i2c[i] = aspeed_i2c_get_bus(>i2c, i);
+}
+get_pca9548_channels(i2c[1], 0x71, ssd);
+
+i2c_slave_create_simple(i2c[0], "fby35-sb-cpld", 0x21);
+i2c_slave_create_simple(i2c[1], "tmp105", 0x48);
+i2c_slave_create_simple(i2c[1], "tmp105", 0x49);
+

[PATCH v3 13/14] hw/misc/aspeed: Add intel-me

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

The Intel Management Engine is an IPMI endpoint that responds to various
IPMI commands. In this commit, I've added some very basic functionality that
will respond with a response code of zero (success), while also setting
an appropriate response NetFN (request NetFN + 1), a matching command ID and
sequence number, and the 2 standard checksums. Other data is not provided,
but the model here could be extended to respond to more kinds of requests.

Signed-off-by: Peter Delevoryas 
---
 MAINTAINERS  |   1 +
 hw/misc/intel_me.c   | 162 +++
 hw/misc/meson.build  |   3 +-
 hw/misc/trace-events |   8 +++
 4 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 hw/misc/intel_me.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 3ffd473db1..3220644bb5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1068,6 +1068,7 @@ F: include/hw/net/ftgmac100.h
 F: docs/system/arm/aspeed.rst
 F: tests/qtest/*aspeed*
 F: hw/misc/fby35_sb_cpld.c
+F: hw/misc/intel_me.c
 
 NRF51
 M: Joel Stanley 
diff --git a/hw/misc/intel_me.c b/hw/misc/intel_me.c
new file mode 100644
index 00..933ae45101
--- /dev/null
+++ b/hw/misc/intel_me.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates. (http://www.meta.com)
+ *
+ * This code is licensed under the GPL version 2 or later. See the COPYING
+ * file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "hw/i2c/i2c.h"
+#include "trace.h"
+
+#define TYPE_INTEL_ME "intel-me"
+OBJECT_DECLARE_SIMPLE_TYPE(IntelMEState, INTEL_ME);
+
+struct IntelMEState {
+I2CSlave parent_obj;
+
+I2CBus *bus;
+QEMUBH *bh;
+int rx_len;
+int tx_len;
+int tx_pos;
+uint8_t rx_buf[512];
+uint8_t tx_buf[512];
+};
+
+static void intel_me_bh(void *opaque)
+{
+IntelMEState *s = opaque;
+I2CSlave *i2c = I2C_SLAVE(s);
+uint8_t target_addr;
+
+assert(s->bus->bh == s->bh);
+
+switch (s->tx_pos) {
+case 0:
+target_addr = s->tx_buf[s->tx_pos++];
+trace_intel_me_tx_start(i2c->address, target_addr);
+if (i2c_start_send_async(s->bus, target_addr) != 0) {
+break;
+}
+return;
+default:
+if (s->tx_pos >= s->tx_len) {
+break;
+}
+trace_intel_me_tx_data(i2c->address, s->tx_buf[s->tx_pos]);
+if (i2c_send_async(s->bus, s->tx_buf[s->tx_pos++]) != 0) {
+break;
+}
+return;
+}
+
+trace_intel_me_tx_end(i2c->address);
+i2c_end_transfer(s->bus);
+i2c_bus_release(s->bus);
+s->tx_len = 0;
+s->tx_pos = 0;
+memset(s->tx_buf, 0, sizeof(s->tx_buf));
+}
+
+static void intel_me_realize(DeviceState *dev, Error **errp)
+{
+IntelMEState *s = INTEL_ME(dev);
+
+s->bus = I2C_BUS(qdev_get_parent_bus(dev));
+s->bh = qemu_bh_new(intel_me_bh, s);
+s->rx_len = 0;
+s->tx_len = 0;
+s->tx_pos = 0;
+memset(s->rx_buf, 0, sizeof(s->rx_buf));
+memset(s->tx_buf, 0, sizeof(s->tx_buf));
+}
+
+static uint8_t checksum(const uint8_t *ptr, int len)
+{
+int sum = 0;
+
+for (int i = 0; i < len; i++) {
+sum += ptr[i];
+}
+
+return 256 - sum;
+}
+
+static int intel_me_i2c_event(I2CSlave *i2c, enum i2c_event event)
+{
+IntelMEState *s = INTEL_ME(i2c);
+
+switch (event) {
+case I2C_START_RECV:
+break;
+case I2C_START_SEND:
+trace_intel_me_rx_start(i2c->address);
+s->rx_len = 0;
+memset(s->rx_buf, 0, sizeof(s->rx_buf));
+break;
+case I2C_START_SEND_ASYNC:
+break;
+case I2C_FINISH:
+trace_intel_me_rx_end(i2c->address);
+s->tx_len = 10;
+s->tx_pos = 0;
+s->tx_buf[0] = s->rx_buf[2];
+s->tx_buf[1] = ((s->rx_buf[0] >> 2) + 1) << 2;
+s->tx_buf[2] = checksum(s->tx_buf, 2);
+s->tx_buf[3] = i2c->address;
+s->tx_buf[4] = (s->rx_buf[3] >> 2) << 2;
+s->tx_buf[5] = s->rx_buf[4];
+s->tx_buf[6] = 0x00;
+s->tx_buf[7] = 0x55;
+s->tx_buf[8] = 0x00;
+s->tx_buf[9] = checksum(s->tx_buf, s->tx_len - 1);
+s->tx_buf[0] >>= 1;
+i2c_bus_master(s->bus, s->bh);
+break;
+case I2C_NACK:
+break;
+}
+
+return 0;
+}
+
+static uint8_t intel_me_i2c_recv(I2CSlave *i2c)
+{
+return 0xff;
+}
+
+static int intel_me_i2c_send(I2CSlave *i2c, uint8_t data)
+{
+IntelMEState *s = INTEL_ME(i2c);
+
+trace_intel_me_rx_data(i2c->address, data);
+
+assert(s->rx_len < sizeof(s->rx_buf));
+s->rx_buf[s->rx_len++] = data;
+
+return 0;
+}
+
+static void intel_me_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+I2CSlaveClass *i2c = I2C_SLAVE_CLASS(oc);
+
+dc->realize = intel_me_realize;
+i2c->event = intel_me_i2c_event;
+i2c->recv = intel_me_i2c_recv;
+i2c->send = intel_me_i2c_send;
+}
+
+static const TypeInfo types[] = {
+

[PATCH v3 08/14] hw/i2c/pmbus: Add idle state to return 0xff's

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

Signed-off-by: Peter Delevoryas 
---
 hw/i2c/pmbus_device.c | 9 +
 include/hw/i2c/pmbus_device.h | 7 +++
 2 files changed, 16 insertions(+)

diff --git a/hw/i2c/pmbus_device.c b/hw/i2c/pmbus_device.c
index 62885fa6a1..f89fea65f3 100644
--- a/hw/i2c/pmbus_device.c
+++ b/hw/i2c/pmbus_device.c
@@ -261,6 +261,11 @@ void pmbus_check_limits(PMBusDevice *pmdev)
 }
 }
 
+void pmbus_idle(PMBusDevice *pmdev)
+{
+pmdev->code = PMBUS_IDLE_STATE;
+}
+
 /* assert the status_cml error upon receipt of malformed command */
 static void pmbus_cml_error(PMBusDevice *pmdev)
 {
@@ -984,6 +989,10 @@ static uint8_t pmbus_receive_byte(SMBusDevice *smd)
 }
 break;
 
+case PMBUS_IDLE_STATE:
+pmbus_send8(pmdev, PMBUS_ERR_BYTE);
+break;
+
 case PMBUS_CLEAR_FAULTS:  /* Send Byte */
 case PMBUS_PAGE_PLUS_WRITE:   /* Block Write-only */
 case PMBUS_STORE_DEFAULT_ALL: /* Send Byte */
diff --git a/include/hw/i2c/pmbus_device.h b/include/hw/i2c/pmbus_device.h
index 0f4d6b3fad..93f5d57c9d 100644
--- a/include/hw/i2c/pmbus_device.h
+++ b/include/hw/i2c/pmbus_device.h
@@ -155,6 +155,7 @@ enum pmbus_registers {
 PMBUS_MFR_MAX_TEMP_1= 0xC0, /* R/W word */
 PMBUS_MFR_MAX_TEMP_2= 0xC1, /* R/W word */
 PMBUS_MFR_MAX_TEMP_3= 0xC2, /* R/W word */
+PMBUS_IDLE_STATE= 0xFF,
 };
 
 /* STATUS_WORD */
@@ -527,6 +528,12 @@ int pmbus_page_config(PMBusDevice *pmdev, uint8_t 
page_index, uint64_t flags);
  */
 void pmbus_check_limits(PMBusDevice *pmdev);
 
+/**
+ * Enter an idle state where only the PMBUS_ERR_BYTE will be returned
+ * indefinitely until a new command is issued.
+ */
+void pmbus_idle(PMBusDevice *pmdev);
+
 extern const VMStateDescription vmstate_pmbus_device;
 
 #define VMSTATE_PMBUS_DEVICE(_field, _state) {   \
-- 
2.37.0

[PATCH v3 12/14] hw/misc/aspeed: Add fby35-sb-cpld

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

fby35 machines have 1 BMC on a baseboard and 2-4 server boards with BIC's.
There are also CPLD's on each of the boards, one type of CPLD on the
baseboard and another type on each of the server boards. This commit adds an
implementation of some of the logic performed by the server board CPLD,
which is connected to the server board BIC.

fby35 machines have 1 baseboard with a BMC (AST2600) and 4 server boards
with bridge interconnects (BIC's, AST1030's). Each server board has a CPLD
on it which provides FRU information and some synchronization functionality
with the BMC. The baseboard also has one CPLD, but it does other stuff.

This commit just adds some of the FRU functionality to allow the BIC to
start up without any errors.

Signed-off-by: Peter Delevoryas 
---
 MAINTAINERS |   1 +
 hw/misc/fby35_sb_cpld.c | 128 
 hw/misc/meson.build |   3 +-
 3 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 hw/misc/fby35_sb_cpld.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 05cf84b58c..3ffd473db1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1067,6 +1067,7 @@ F: hw/net/ftgmac100.c
 F: include/hw/net/ftgmac100.h
 F: docs/system/arm/aspeed.rst
 F: tests/qtest/*aspeed*
+F: hw/misc/fby35_sb_cpld.c
 
 NRF51
 M: Joel Stanley 
diff --git a/hw/misc/fby35_sb_cpld.c b/hw/misc/fby35_sb_cpld.c
new file mode 100644
index 00..f170a6c781
--- /dev/null
+++ b/hw/misc/fby35_sb_cpld.c
@@ -0,0 +1,128 @@
+/*
+ * fby35 Server Board CPLD
+ *
+ * Copyright (c) Meta Platforms, Inc. and affiliates. (http://www.meta.com)
+ *
+ * This code is licensed under the GPL version 2 or later. See the COPYING
+ * file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "hw/i2c/i2c.h"
+#include "hw/registerfields.h"
+
+#define BOARD_ID_CLASS1 0b
+#define BOARD_ID_CLASS2 0b0001
+
+#define TYPE_FBY35_SB_CPLD "fby35-sb-cpld"
+OBJECT_DECLARE_SIMPLE_TYPE(Fby35SbCpldState, FBY35_SB_CPLD);
+
+REG8(CLASS_TYPE, 0x5);
+FIELD(CLASS_TYPE, RESERVED, 0, 2);
+FIELD(CLASS_TYPE, 1OU_EXPANSION_NOT_PRESENT, 2, 1);
+FIELD(CLASS_TYPE, 2OU_EXPANSION_NOT_PRESENT, 3, 1);
+FIELD(CLASS_TYPE, BOARD_ID, 4, 4);
+REG8(BOARD_REVISION, 0x8);
+FIELD(BOARD_REVISION, VALUE, 0, 4);
+FIELD(BOARD_REVISION, RESERVED, 4, 4);
+
+struct Fby35SbCpldState {
+I2CSlave parent_obj;
+
+uint8_t target_reg;
+uint32_t regs[10];
+};
+
+static void fby35_sb_cpld_realize(DeviceState *dev, Error **errp)
+{
+Fby35SbCpldState *s = FBY35_SB_CPLD(dev);
+
+memset(s->regs, 0, sizeof(s->regs));
+s->target_reg = 0;
+
+ARRAY_FIELD_DP32(s->regs, CLASS_TYPE, BOARD_ID, 0b);
+ARRAY_FIELD_DP32(s->regs, CLASS_TYPE, 1OU_EXPANSION_NOT_PRESENT, 1);
+ARRAY_FIELD_DP32(s->regs, CLASS_TYPE, 2OU_EXPANSION_NOT_PRESENT, 1);
+ARRAY_FIELD_DP32(s->regs, BOARD_REVISION, VALUE, 0x1);
+}
+
+static int fby35_sb_cpld_i2c_event(I2CSlave *i2c, enum i2c_event event)
+{
+Fby35SbCpldState *s = FBY35_SB_CPLD(i2c);
+
+switch (event) {
+case I2C_START_RECV:
+break;
+case I2C_START_SEND:
+s->target_reg = 0;
+break;
+case I2C_START_SEND_ASYNC:
+case I2C_FINISH:
+case I2C_NACK:
+break;
+}
+
+return 0;
+}
+
+static uint8_t fby35_sb_cpld_i2c_recv(I2CSlave *i2c)
+{
+Fby35SbCpldState *s = FBY35_SB_CPLD(i2c);
+
+switch (s->target_reg) {
+case R_CLASS_TYPE:
+case R_BOARD_REVISION:
+return s->regs[s->target_reg];
+default:
+qemu_log_mask(LOG_UNIMP, "%s: Register read unimplemented: 0x%02x\n",
+  __func__, s->target_reg);
+return 0xff;
+}
+}
+
+static int fby35_sb_cpld_i2c_send(I2CSlave *i2c, uint8_t data)
+{
+Fby35SbCpldState *s = FBY35_SB_CPLD(i2c);
+
+if (s->target_reg == 0) {
+s->target_reg = data;
+return 0;
+}
+
+switch (s->target_reg) {
+case R_CLASS_TYPE:
+case R_BOARD_REVISION:
+s->regs[s->target_reg] = data;
+break;
+default:
+qemu_log_mask(LOG_UNIMP,
+  "%s: Register write unimplemented: 0x%02x 0x%02x\n",
+  __func__, s->target_reg, data);
+break;
+}
+
+return 0;
+}
+
+static void fby35_sb_cpld_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+I2CSlaveClass *i2c = I2C_SLAVE_CLASS(oc);
+
+dc->realize = fby35_sb_cpld_realize;
+i2c->event = fby35_sb_cpld_i2c_event;
+i2c->recv = fby35_sb_cpld_i2c_recv;
+i2c->send = fby35_sb_cpld_i2c_send;
+}
+
+static const TypeInfo types[] = {
+{
+.name = TYPE_FBY35_SB_CPLD,
+.parent = TYPE_I2C_SLAVE,
+.instance_size = sizeof(Fby35SbCpldState),
+.class_init = fby35_sb_cpld_class_init,
+},
+};
+
+DEFINE_TYPES(types);
diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index 95268eddc0..948e25c440 100644
--- a/hw/misc/meson.build
+++

[PATCH v3 04/14] hw/i2c: support multiple masters

2022-06-29 Thread Peter Delevoryas

From: Klaus Jensen 

Allow slaves to master the bus by registering a bottom half. If the bus
is busy, the bottom half is queued up. When a slave has successfully
mastered the bus, the bottom half is scheduled.

Signed-off-by: Klaus Jensen 
[ clg : - fixed typos in commit log ]
Message-Id: <20220601210831.67259-4-...@irrelevant.dk>
Signed-off-by: Cédric Le Goater 
---
 hw/i2c/core.c| 34 +-
 include/hw/i2c/i2c.h | 14 ++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/hw/i2c/core.c b/hw/i2c/core.c
index d0cb2d32fa..145dce6078 100644
--- a/hw/i2c/core.c
+++ b/hw/i2c/core.c
@@ -13,6 +13,7 @@
 #include "migration/vmstate.h"
 #include "qapi/error.h"
 #include "qemu/module.h"
+#include "qemu/main-loop.h"
 #include "trace.h"
 
 #define I2C_BROADCAST 0x00
@@ -62,6 +63,7 @@ I2CBus *i2c_init_bus(DeviceState *parent, const char *name)
 
 bus = I2C_BUS(qbus_new(TYPE_I2C_BUS, parent, name));
 QLIST_INIT(>current_devs);
+QSIMPLEQ_INIT(>pending_masters);
 vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, _i2c_bus, bus);
 return bus;
 }
@@ -74,7 +76,7 @@ void i2c_slave_set_address(I2CSlave *dev, uint8_t address)
 /* Return nonzero if bus is busy.  */
 int i2c_bus_busy(I2CBus *bus)
 {
-return !QLIST_EMPTY(>current_devs);
+return !QLIST_EMPTY(>current_devs) || bus->bh;
 }
 
 bool i2c_scan_bus(I2CBus *bus, uint8_t address, bool broadcast,
@@ -180,6 +182,26 @@ int i2c_start_transfer(I2CBus *bus, uint8_t address, bool 
is_recv)
: I2C_START_SEND);
 }
 
+void i2c_bus_master(I2CBus *bus, QEMUBH *bh)
+{
+if (i2c_bus_busy(bus)) {
+I2CPendingMaster *node = g_new(struct I2CPendingMaster, 1);
+node->bh = bh;
+
+QSIMPLEQ_INSERT_TAIL(>pending_masters, node, entry);
+
+return;
+}
+
+bus->bh = bh;
+qemu_bh_schedule(bus->bh);
+}
+
+void i2c_bus_release(I2CBus *bus)
+{
+bus->bh = NULL;
+}
+
 int i2c_start_recv(I2CBus *bus, uint8_t address)
 {
 return i2c_do_start_transfer(bus, address, I2C_START_RECV);
@@ -206,6 +228,16 @@ void i2c_end_transfer(I2CBus *bus)
 g_free(node);
 }
 bus->broadcast = false;
+
+if (!QSIMPLEQ_EMPTY(>pending_masters)) {
+I2CPendingMaster *node = QSIMPLEQ_FIRST(>pending_masters);
+bus->bh = node->bh;
+
+QSIMPLEQ_REMOVE_HEAD(>pending_masters, entry);
+g_free(node);
+
+qemu_bh_schedule(bus->bh);
+}
 }
 
 int i2c_send(I2CBus *bus, uint8_t data)
diff --git a/include/hw/i2c/i2c.h b/include/hw/i2c/i2c.h
index 5ca3b708c0..be8bb8b78a 100644
--- a/include/hw/i2c/i2c.h
+++ b/include/hw/i2c/i2c.h
@@ -69,13 +69,25 @@ struct I2CNode {
 QLIST_ENTRY(I2CNode) next;
 };
 
+typedef struct I2CPendingMaster I2CPendingMaster;
+
+struct I2CPendingMaster {
+QEMUBH *bh;
+QSIMPLEQ_ENTRY(I2CPendingMaster) entry;
+};
+
 typedef QLIST_HEAD(I2CNodeList, I2CNode) I2CNodeList;
+typedef QSIMPLEQ_HEAD(I2CPendingMasters, I2CPendingMaster) I2CPendingMasters;
 
 struct I2CBus {
 BusState qbus;
 I2CNodeList current_devs;
+I2CPendingMasters pending_masters;
 uint8_t saved_address;
 bool broadcast;
+
+/* Set from slave currently mastering the bus. */
+QEMUBH *bh;
 };
 
 I2CBus *i2c_init_bus(DeviceState *parent, const char *name);
@@ -117,6 +129,8 @@ int i2c_start_send(I2CBus *bus, uint8_t address);
 
 void i2c_end_transfer(I2CBus *bus);
 void i2c_nack(I2CBus *bus);
+void i2c_bus_master(I2CBus *bus, QEMUBH *bh);
+void i2c_bus_release(I2CBus *bus);
 int i2c_send(I2CBus *bus, uint8_t data);
 uint8_t i2c_recv(I2CBus *bus);
 bool i2c_scan_bus(I2CBus *bus, uint8_t address, bool broadcast,
-- 
2.37.0

[PATCH v3 11/14] hw/misc/aspeed: Add PECI controller

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

This introduces a really basic PECI controller that responds to
commands by always setting the response code to success and then raising
an interrupt to indicate the command is done. This helps avoid getting
hit with constant errors if the driver continuously attempts to send a
command and keeps timing out.

The AST2400 and AST2500 only included registers up to 0x5C, not 0xFC.
They supported PECI 1.1, 2.0, and 3.0. The AST2600 and AST1030 support
PECI 4.0, which includes more read/write buffer registers from 0x80 to
0xFC to support 64-byte mode.

This patch doesn't attempt to handle that, or to create a different
version of the controller for the different generations, since it's only
implementing functionality that is common to all generations.

The basic sequence of events is that the firmware will read and write to
various registers and then trigger a command by setting the FIRE bit in
the command register (similar to the I2C controller).

Then the firmware waits for an interrupt from the PECI controller,
expecting the interrupt status register to be filled in with info on
what happened. If the command was transmitted and received successfully,
then response codes from the host CPU will be found in the data buffer
registers.

Signed-off-by: Peter Delevoryas 
---
 hw/arm/aspeed_ast10x0.c   |  12 +++
 hw/arm/aspeed_ast2600.c   |  12 +++
 hw/arm/aspeed_soc.c   |  13 +++
 hw/misc/aspeed_peci.c | 152 ++
 hw/misc/meson.build   |   3 +-
 hw/misc/trace-events  |   5 ++
 include/hw/arm/aspeed_soc.h   |   3 +
 include/hw/misc/aspeed_peci.h |  29 +++
 8 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 hw/misc/aspeed_peci.c
 create mode 100644 include/hw/misc/aspeed_peci.h

diff --git a/hw/arm/aspeed_ast10x0.c b/hw/arm/aspeed_ast10x0.c
index 5df480a21f..56e8de3d89 100644
--- a/hw/arm/aspeed_ast10x0.c
+++ b/hw/arm/aspeed_ast10x0.c
@@ -47,6 +47,7 @@ static const hwaddr aspeed_soc_ast1030_memmap[] = {
 [ASPEED_DEV_UART13]= 0x7E790700,
 [ASPEED_DEV_WDT]   = 0x7E785000,
 [ASPEED_DEV_LPC]   = 0x7E789000,
+[ASPEED_DEV_PECI]  = 0x7E78B000,
 [ASPEED_DEV_I2C]   = 0x7E7B,
 };
 
@@ -75,6 +76,7 @@ static const int aspeed_soc_ast1030_irqmap[] = {
 [ASPEED_DEV_TIMER8]= 23,
 [ASPEED_DEV_WDT]   = 24,
 [ASPEED_DEV_LPC]   = 35,
+[ASPEED_DEV_PECI]  = 38,
 [ASPEED_DEV_FMC]   = 39,
 [ASPEED_DEV_PWM]   = 44,
 [ASPEED_DEV_ADC]   = 46,
@@ -133,6 +135,8 @@ static void aspeed_soc_ast1030_init(Object *obj)
 
 object_initialize_child(obj, "lpc", >lpc, TYPE_ASPEED_LPC);
 
+object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
+
 object_initialize_child(obj, "sbc", >sbc, TYPE_ASPEED_SBC);
 
 for (i = 0; i < sc->wdts_num; i++) {
@@ -206,6 +210,14 @@ static void aspeed_soc_ast1030_realize(DeviceState 
*dev_soc, Error **errp)
 sysbus_connect_irq(SYS_BUS_DEVICE(>i2c.busses[i]), 0, irq);
 }
 
+/* PECI */
+if (!sysbus_realize(SYS_BUS_DEVICE(>peci), errp)) {
+return;
+}
+sysbus_mmio_map(SYS_BUS_DEVICE(>peci), 0, sc->memmap[ASPEED_DEV_PECI]);
+sysbus_connect_irq(SYS_BUS_DEVICE(>peci), 0,
+   aspeed_soc_get_irq(s, ASPEED_DEV_PECI));
+
 /* LPC */
 if (!sysbus_realize(SYS_BUS_DEVICE(>lpc), errp)) {
 return;
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index b0a4199b69..85178fabea 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -59,6 +59,7 @@ static const hwaddr aspeed_soc_ast2600_memmap[] = {
 [ASPEED_DEV_LPC]   = 0x1E789000,
 [ASPEED_DEV_IBT]   = 0x1E789140,
 [ASPEED_DEV_I2C]   = 0x1E78A000,
+[ASPEED_DEV_PECI]  = 0x1E78B000,
 [ASPEED_DEV_UART1] = 0x1E783000,
 [ASPEED_DEV_UART2] = 0x1E78D000,
 [ASPEED_DEV_UART3] = 0x1E78E000,
@@ -122,6 +123,7 @@ static const int aspeed_soc_ast2600_irqmap[] = {
 [ASPEED_DEV_LPC]   = 35,
 [ASPEED_DEV_IBT]   = 143,
 [ASPEED_DEV_I2C]   = 110,   /* 110 -> 125 */
+[ASPEED_DEV_PECI]  = 38,
 [ASPEED_DEV_ETH1]  = 2,
 [ASPEED_DEV_ETH2]  = 3,
 [ASPEED_DEV_HACE]  = 4,
@@ -180,6 +182,8 @@ static void aspeed_soc_ast2600_init(Object *obj)
 snprintf(typename, sizeof(typename), "aspeed.i2c-%s", socname);
 object_initialize_child(obj, "i2c", >i2c, typename);
 
+object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
+
 snprintf(typename, sizeof(typename), "aspeed.fmc-%s", socname);
 object_initialize_child(obj, "fmc", >fmc, typename);
 
@@ -388,6 +392,14 @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, 
Error **errp)
 sysbus_connect_irq(SYS_BUS_DEVICE(>i2c.busses[i]), 0, irq);
 }
 
+/* PECI */
+if (!sysbus_realize(SYS_BUS_DEVICE(>peci), errp)) {
+return;
+}
+sysbus_mmio_map(SYS_BUS_DEVICE(>peci), 0,

[PATCH v3 10/14] hw/sensor: Add Renesas ISL69259 device model

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

This adds the ISL69259, using all the same functionality as the existing
ISL69260 but overriding the IC_DEVICE_ID.

Signed-off-by: Peter Delevoryas 
---
 hw/sensor/isl_pmbus_vr.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/hw/sensor/isl_pmbus_vr.c b/hw/sensor/isl_pmbus_vr.c
index 799ea9d89e..853d70536f 100644
--- a/hw/sensor/isl_pmbus_vr.c
+++ b/hw/sensor/isl_pmbus_vr.c
@@ -119,6 +119,18 @@ static void raa228000_exit_reset(Object *obj)
 pmdev->pages[0].read_temperature_3 = 0;
 }
 
+static void isl69259_exit_reset(Object *obj)
+{
+ISLState *s = ISL69260(obj);
+static const uint8_t ic_device_id[] = {0x04, 0x00, 0x81, 0xD2, 0x49, 0x3c};
+g_assert_cmphex(sizeof(ic_device_id), <=, sizeof(s->ic_device_id));
+
+isl_pmbus_vr_exit_reset(obj);
+
+s->ic_device_id_len = sizeof(ic_device_id);
+memcpy(s->ic_device_id, ic_device_id, sizeof(ic_device_id));
+}
+
 static void isl_pmbus_vr_add_props(Object *obj, uint64_t *flags, uint8_t pages)
 {
 PMBusDevice *pmdev = PMBUS_DEVICE(obj);
@@ -257,6 +269,21 @@ static void raa229004_class_init(ObjectClass *klass, void 
*data)
 isl_pmbus_vr_class_init(klass, data, 2);
 }
 
+static void isl69259_class_init(ObjectClass *klass, void *data)
+{
+ResettableClass *rc = RESETTABLE_CLASS(klass);
+DeviceClass *dc = DEVICE_CLASS(klass);
+dc->desc = "Renesas ISL69259 Digital Multiphase Voltage Regulator";
+rc->phases.exit = isl69259_exit_reset;
+isl_pmbus_vr_class_init(klass, data, 2);
+}
+
+static const TypeInfo isl69259_info = {
+.name = TYPE_ISL69259,
+.parent = TYPE_ISL69260,
+.class_init = isl69259_class_init,
+};
+
 static const TypeInfo isl69260_info = {
 .name = TYPE_ISL69260,
 .parent = TYPE_PMBUS_DEVICE,
@@ -283,6 +310,7 @@ static const TypeInfo raa228000_info = {
 
 static void isl_pmbus_vr_register_types(void)
 {
+type_register_static(_info);
 type_register_static(_info);
 type_register_static(_info);
 type_register_static(_info);
-- 
2.37.0

[PATCH v3 07/14] hw/i2c/aspeed: Add new-registers DMA slave mode RX support

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

This commit adds support for DMA RX in slave mode while using the new
register set in the AST2600 and AST1030. This patch also pretty much
assumes packet mode is enabled; I'm not sure if this will work in DMA
step mode.

This is particularly useful for testing IPMB exchanges between Zephyr
and external devices, which requires multi-master I2C support and DMA in
the new register mode, because the Zephyr drivers from Aspeed use DMA in
the new mode by default. The Zephyr drivers are also using packet mode.

The typical sequence of events for receiving data in DMA slave + packet
mode is that the Zephyr firmware will configure the slave address
register with an address to receive on and configure the bus's function
control register to enable master mode and slave mode simultaneously at
startup, before any transfers are initiated.

RX DMA is enabled in the slave mode command register, and the slave RX
DMA buffer address and slave RX DMA buffer length are set. TX DMA is not
covered in this patch.

When the Aspeed I2C controller receives data from some other I2C master,
it will reset the I2CS_DMA_LEN RX_LEN value to zero, then buffer
incoming data in the RX DMA buffer while incrementing the I2CC_DMA_ADDR
address counter and decrementing the I2CC_DMA_LEN counter. It will also
update the I2CS_DMA_LEN RX_LEN value along the way.

Once all the data has been received, the bus controller will raise an
interrupt indicating a packet command was completed, the slave address
matched, a normal stop condition was seen, and the transfer was an RX
operation.

If the master sent a NACK instead of a normal stop condition, or the
transfer timed out, then a slightly different set of interrupt status
values would be set. Those conditions are not handled in this commit.

The Zephyr firmware then collects data from the RX DMA buffer and clears
the status register by writing the PKT_MODE_EN bit to the status
register. In packet mode, clearing the packet mode interrupt enable bit
also clears most of the other interrupt bits automatically (except for a
few bits above it).

Note: if the master transmit or receive functions were in use
simultaneously with the slave mode receive functionality, then the
master mode functions may have raised the interrupt line for the bus
before the DMA slave transfer is complete. It's important to have the
slave's interrupt status register clear throughout the receive
operation, and if the slave attempts to raise the interrupt before the
master interrupt status is cleared, then it needs to re-raise the
interrupt once the master interrupt status is cleared. (And vice-versa).
That's why in this commit, when the master interrupt status is cleared
and the interrupt line is lowered, we call the slave interrupt _raise_
function, to see if the interrupt was pending. (And again, vice-versa).

Signed-off-by: Peter Delevoryas 
---
 hw/i2c/aspeed_i2c.c | 133 
 include/hw/i2c/aspeed_i2c.h |   3 +
 2 files changed, 124 insertions(+), 12 deletions(-)

diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c
index 8a8514586f..fc8b6b62cf 100644
--- a/hw/i2c/aspeed_i2c.c
+++ b/hw/i2c/aspeed_i2c.c
@@ -78,6 +78,18 @@ static inline void 
aspeed_i2c_bus_raise_interrupt(AspeedI2CBus *bus)
 }
 }
 
+static inline void aspeed_i2c_bus_raise_slave_interrupt(AspeedI2CBus *bus)
+{
+AspeedI2CClass *aic = ASPEED_I2C_GET_CLASS(bus->controller);
+
+if (!bus->regs[R_I2CS_INTR_STS]) {
+return;
+}
+
+bus->controller->intr_status |= 1 << bus->id;
+qemu_irq_raise(aic->bus_get_irq(bus));
+}
+
 static uint64_t aspeed_i2c_bus_old_read(AspeedI2CBus *bus, hwaddr offset,
 unsigned size)
 {
@@ -140,8 +152,17 @@ static uint64_t aspeed_i2c_bus_new_read(AspeedI2CBus *bus, 
hwaddr offset,
 case A_I2CM_DMA_LEN_STS:
 case A_I2CC_DMA_ADDR:
 case A_I2CC_DMA_LEN:
+
+case A_I2CS_DEV_ADDR:
+case A_I2CS_DMA_RX_ADDR:
+case A_I2CS_DMA_LEN:
+case A_I2CS_CMD:
+case A_I2CS_INTR_CTRL:
+case A_I2CS_DMA_LEN_STS:
 /* Value is already set, don't do anything. */
 break;
+case A_I2CS_INTR_STS:
+break;
 case A_I2CM_CMD:
 value = SHARED_FIELD_DP32(value, BUS_BUSY_STS, i2c_bus_busy(bus->bus));
 break;
@@ -547,12 +568,7 @@ static void aspeed_i2c_bus_new_write(AspeedI2CBus *bus, 
hwaddr offset,
 
 switch (offset) {
 case A_I2CC_FUN_CTRL:
-if (SHARED_FIELD_EX32(value, SLAVE_EN)) {
-qemu_log_mask(LOG_UNIMP, "%s: slave mode not implemented\n",
-  __func__);
-break;
-}
-bus->regs[R_I2CC_FUN_CTRL] = value & 0x007dc3ff;
+bus->regs[R_I2CC_FUN_CTRL] = value;
 break;
 case A_I2CC_AC_TIMING:
 bus->regs[R_I2CC_AC_TIMING] = value & 0x10ff;
@@ -580,6 +596,7 @@ static void aspeed_i2c_bus_new_write(AspeedI2CBus *bus, 
hwaddr offset,

[PATCH v3 02/14] hw/i2c/aspeed: Fix DMA len write-enable bit handling

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

I noticed i2c rx transfers were getting shortened to "1" on Zephyr. It
seems to be because the Zephyr i2c driver sets the RX DMA len with the
RX field write-enable bit set (bit 31) to avoid a read-modify-write. [1]

/* 0x1C : I2CM Master DMA Transfer Length Register   */

I think we should be checking the write-enable bits on the incoming
value, not checking the register array. I'm not sure we're even writing
the write-enable bits to the register array, actually.

[1] 
https://github.com/AspeedTech-BMC/zephyr/blob/db3dbcc9c52e67a47180890ac938ed380b33f91c/drivers/i2c/i2c_aspeed.c#L145-L148

Fixes: ba2cccd64e90f34 ("aspeed: i2c: Add new mode support")
Signed-off-by: Peter Delevoryas 
---
 hw/i2c/aspeed_i2c.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c
index ff33571954..cbaa7c96fc 100644
--- a/hw/i2c/aspeed_i2c.c
+++ b/hw/i2c/aspeed_i2c.c
@@ -644,18 +644,18 @@ static void aspeed_i2c_bus_new_write(AspeedI2CBus *bus, 
hwaddr offset,
  RX_BUF_LEN) + 1;
 break;
 case A_I2CM_DMA_LEN:
-w1t = ARRAY_FIELD_EX32(bus->regs, I2CM_DMA_LEN, RX_BUF_LEN_W1T) ||
-   ARRAY_FIELD_EX32(bus->regs, I2CM_DMA_LEN, TX_BUF_LEN_W1T);
+w1t = FIELD_EX32(value, I2CM_DMA_LEN, RX_BUF_LEN_W1T) ||
+  FIELD_EX32(value, I2CM_DMA_LEN, TX_BUF_LEN_W1T);
 /* If none of the w1t bits are set, just write to the reg as normal. */
 if (!w1t) {
 bus->regs[R_I2CM_DMA_LEN] = value;
 break;
 }
-if (ARRAY_FIELD_EX32(bus->regs, I2CM_DMA_LEN, RX_BUF_LEN_W1T)) {
+if (FIELD_EX32(value, I2CM_DMA_LEN, RX_BUF_LEN_W1T)) {
 ARRAY_FIELD_DP32(bus->regs, I2CM_DMA_LEN, RX_BUF_LEN,
  FIELD_EX32(value, I2CM_DMA_LEN, RX_BUF_LEN));
 }
-if (ARRAY_FIELD_EX32(bus->regs, I2CM_DMA_LEN, TX_BUF_LEN_W1T)) {
+if (FIELD_EX32(value, I2CM_DMA_LEN, TX_BUF_LEN_W1T)) {
 ARRAY_FIELD_DP32(bus->regs, I2CM_DMA_LEN, TX_BUF_LEN,
  FIELD_EX32(value, I2CM_DMA_LEN, TX_BUF_LEN));
 }
-- 
2.37.0

[PATCH v3 06/14] hw/i2c/aspeed: add slave device in old register mode

2022-06-29 Thread Peter Delevoryas

From: Klaus Jensen 

Add slave mode functionality for the Aspeed I2C controller in old
register mode. This is implemented by realizing an I2C slave device
owned by the I2C controller and attached to its own bus.

The I2C slave device only implements asynchronous sends on the bus, so
slaves not supporting that will not be able to communicate with it.

Signed-off-by: Klaus Jensen 
[ clg: checkpatch fixes ]
Message-Id: <20220601210831.67259-6-...@irrelevant.dk>
Signed-off-by: Cédric Le Goater 
---
 hw/i2c/aspeed_i2c.c | 89 +
 include/hw/i2c/aspeed_i2c.h |  8 
 2 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c
index c153a1a942..8a8514586f 100644
--- a/hw/i2c/aspeed_i2c.c
+++ b/hw/i2c/aspeed_i2c.c
@@ -696,9 +696,7 @@ static void aspeed_i2c_bus_old_write(AspeedI2CBus *bus, 
hwaddr offset,
 switch (offset) {
 case A_I2CD_FUN_CTRL:
 if (SHARED_FIELD_EX32(value, SLAVE_EN)) {
-qemu_log_mask(LOG_UNIMP, "%s: slave mode not implemented\n",
-  __func__);
-break;
+i2c_slave_set_address(bus->slave, bus->regs[R_I2CD_DEV_ADDR]);
 }
 bus->regs[R_I2CD_FUN_CTRL] = value & 0x0071C3FF;
 break;
@@ -719,12 +717,15 @@ static void aspeed_i2c_bus_old_write(AspeedI2CBus *bus, 
hwaddr offset,
 bus->controller->intr_status &= ~(1 << bus->id);
 qemu_irq_lower(aic->bus_get_irq(bus));
 }
-if (handle_rx && (SHARED_ARRAY_FIELD_EX32(bus->regs, R_I2CD_CMD,
-  M_RX_CMD) ||
-  SHARED_ARRAY_FIELD_EX32(bus->regs, R_I2CD_CMD,
-  M_S_RX_CMD_LAST))) {
-aspeed_i2c_handle_rx_cmd(bus);
-aspeed_i2c_bus_raise_interrupt(bus);
+if (handle_rx) {
+if (SHARED_ARRAY_FIELD_EX32(bus->regs, R_I2CD_CMD, M_RX_CMD) ||
+SHARED_ARRAY_FIELD_EX32(bus->regs, R_I2CD_CMD,
+M_S_RX_CMD_LAST)) {
+aspeed_i2c_handle_rx_cmd(bus);
+aspeed_i2c_bus_raise_interrupt(bus);
+} else if (aspeed_i2c_get_state(bus) == I2CD_STXD) {
+i2c_ack(bus->bus);
+}
 }
 break;
 case A_I2CD_DEV_ADDR:
@@ -1036,6 +1037,73 @@ static const TypeInfo aspeed_i2c_info = {
 .abstract   = true,
 };
 
+static int aspeed_i2c_bus_slave_event(I2CSlave *slave, enum i2c_event event)
+{
+BusState *qbus = qdev_get_parent_bus(DEVICE(slave));
+AspeedI2CBus *bus = ASPEED_I2C_BUS(qbus->parent);
+uint32_t reg_intr_sts = aspeed_i2c_bus_intr_sts_offset(bus);
+uint32_t reg_byte_buf = aspeed_i2c_bus_byte_buf_offset(bus);
+uint32_t value;
+
+switch (event) {
+case I2C_START_SEND_ASYNC:
+value = SHARED_ARRAY_FIELD_EX32(bus->regs, reg_byte_buf, TX_BUF);
+SHARED_ARRAY_FIELD_DP32(bus->regs, reg_byte_buf, RX_BUF, value << 1);
+
+ARRAY_FIELD_DP32(bus->regs, I2CD_INTR_STS, SLAVE_ADDR_RX_MATCH, 1);
+SHARED_ARRAY_FIELD_DP32(bus->regs, reg_intr_sts, RX_DONE, 1);
+
+aspeed_i2c_set_state(bus, I2CD_STXD);
+
+break;
+
+case I2C_FINISH:
+SHARED_ARRAY_FIELD_DP32(bus->regs, reg_intr_sts, NORMAL_STOP, 1);
+
+aspeed_i2c_set_state(bus, I2CD_IDLE);
+
+break;
+
+default:
+return -1;
+}
+
+aspeed_i2c_bus_raise_interrupt(bus);
+
+return 0;
+}
+
+static void aspeed_i2c_bus_slave_send_async(I2CSlave *slave, uint8_t data)
+{
+BusState *qbus = qdev_get_parent_bus(DEVICE(slave));
+AspeedI2CBus *bus = ASPEED_I2C_BUS(qbus->parent);
+uint32_t reg_intr_sts = aspeed_i2c_bus_intr_sts_offset(bus);
+uint32_t reg_byte_buf = aspeed_i2c_bus_byte_buf_offset(bus);
+
+SHARED_ARRAY_FIELD_DP32(bus->regs, reg_byte_buf, RX_BUF, data);
+SHARED_ARRAY_FIELD_DP32(bus->regs, reg_intr_sts, RX_DONE, 1);
+
+aspeed_i2c_bus_raise_interrupt(bus);
+}
+
+static void aspeed_i2c_bus_slave_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+I2CSlaveClass *sc = I2C_SLAVE_CLASS(klass);
+
+dc->desc = "Aspeed I2C Bus Slave";
+
+sc->event = aspeed_i2c_bus_slave_event;
+sc->send_async = aspeed_i2c_bus_slave_send_async;
+}
+
+static const TypeInfo aspeed_i2c_bus_slave_info = {
+.name   = TYPE_ASPEED_I2C_BUS_SLAVE,
+.parent = TYPE_I2C_SLAVE,
+.instance_size  = sizeof(AspeedI2CBusSlave),
+.class_init = aspeed_i2c_bus_slave_class_init,
+};
+
 static void aspeed_i2c_bus_reset(DeviceState *dev)
 {
 AspeedI2CBus *s = ASPEED_I2C_BUS(dev);
@@ -1060,6 +1128,8 @@ static void aspeed_i2c_bus_realize(DeviceState *dev, 
Error **errp)
 sysbus_init_irq(SYS_BUS_DEVICE(dev), &s->irq);
 
 s->bus = i2c_init_bus(dev, name);
+s->slave = i2c_slave_create_simple(s->bus, TYPE_ASPEED_I2C_BUS_SLAVE,
+

[PATCH v3 09/14] hw/sensor: Add IC_DEVICE_ID to ISL voltage regulators

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

This commit adds a passthrough for PMBUS_IC_DEVICE_ID to allow Renesas
voltage regulators to return the integrated circuit device ID if they
would like to.

The behavior is very device specific, so it hasn't been added to the
general PMBUS model. Additionally, if the device ID hasn't been set,
then the voltage regulator will respond with the error byte value.  The
guest error message will change slightly for IC_DEVICE_ID with this
commit.

Signed-off-by: Peter Delevoryas 
---
 hw/sensor/isl_pmbus_vr.c | 12 
 include/hw/sensor/isl_pmbus_vr.h |  5 +
 2 files changed, 17 insertions(+)

diff --git a/hw/sensor/isl_pmbus_vr.c b/hw/sensor/isl_pmbus_vr.c
index e11e028884..799ea9d89e 100644
--- a/hw/sensor/isl_pmbus_vr.c
+++ b/hw/sensor/isl_pmbus_vr.c
@@ -15,6 +15,18 @@
 
 static uint8_t isl_pmbus_vr_read_byte(PMBusDevice *pmdev)
 {
+ISLState *s = ISL69260(pmdev);
+
+switch (pmdev->code) {
+case PMBUS_IC_DEVICE_ID:
+if (!s->ic_device_id_len) {
+break;
+}
+pmbus_send(pmdev, s->ic_device_id, s->ic_device_id_len);
+pmbus_idle(pmdev);
+return 0;
+}
+
 qemu_log_mask(LOG_GUEST_ERROR,
   "%s: reading from unsupported register: 0x%02x\n",
   __func__, pmdev->code);
diff --git a/include/hw/sensor/isl_pmbus_vr.h b/include/hw/sensor/isl_pmbus_vr.h
index 3e47ff7e48..aa2c2767df 100644
--- a/include/hw/sensor/isl_pmbus_vr.h
+++ b/include/hw/sensor/isl_pmbus_vr.h
@@ -12,12 +12,17 @@
 #include "hw/i2c/pmbus_device.h"
 #include "qom/object.h"
 
+#define TYPE_ISL69259   "isl69259"
 #define TYPE_ISL69260   "isl69260"
 #define TYPE_RAA228000  "raa228000"
 #define TYPE_RAA229004  "raa229004"
+#define ISL_MAX_IC_DEVICE_ID_LEN 16
 
 struct ISLState {
 PMBusDevice parent;
+
+uint8_t ic_device_id[ISL_MAX_IC_DEVICE_ID_LEN];
+uint8_t ic_device_id_len;
 };
 
 OBJECT_DECLARE_SIMPLE_TYPE(ISLState, ISL69260)
-- 
2.37.0

[PATCH v3 00/14] hw/i2c/aspeed: I2C slave mode DMA RX w/ new regs

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

v3:
- hw/i2c/pmbus_device:
  - Removed commit that resets the out buf.
  - Removed IC_DEVICE_ID
  - Added commit to allow devices to move to an idle state that
avoids enqueuing excess data into the out buf.
- hw/sensor/isl_pmbus_vr:
  - Added IC_DEVICE_ID commit just for voltage regulators.
  - Added ISL69259 with an IC_DEVICE_ID.
- hw/misc/aspeed_peci:
  - Moved registers from .h to .c
  - Replaced guest_error on interrupt disable case with trace
for all interrupts (not just when they're disabled).
  - Removed leftover qemu_irq_raise

Thanks,
Peter

Klaus Jensen (3):
  hw/i2c: support multiple masters
  hw/i2c: add asynchronous send
  hw/i2c/aspeed: add slave device in old register mode

Peter Delevoryas (11):
  hw/i2c/aspeed: Fix R_I2CD_FUN_CTRL reference
  hw/i2c/aspeed: Fix DMA len write-enable bit handling
  hw/i2c/aspeed: Fix MASTER_EN missing error message
  hw/i2c/aspeed: Add new-registers DMA slave mode RX support
  hw/i2c/pmbus: Add idle state to return 0xff's
  hw/sensor: Add IC_DEVICE_ID to ISL voltage regulators
  hw/sensor: Add Renesas ISL69259 device model
  hw/misc/aspeed: Add PECI controller
  hw/misc/aspeed: Add fby35-sb-cpld
  hw/misc/aspeed: Add intel-me
  hw/arm/aspeed: Add oby35-cl machine

 MAINTAINERS  |   2 +
 hw/arm/aspeed.c  |  48 +++
 hw/arm/aspeed_ast10x0.c  |  12 ++
 hw/arm/aspeed_ast2600.c  |  12 ++
 hw/arm/aspeed_soc.c  |  13 ++
 hw/arm/pxa2xx.c  |   2 +
 hw/display/sii9022.c |   2 +
 hw/display/ssd0303.c |   2 +
 hw/i2c/aspeed_i2c.c  | 234 +++
 hw/i2c/core.c|  70 -
 hw/i2c/pmbus_device.c|   9 ++
 hw/i2c/smbus_slave.c |   4 +
 hw/i2c/trace-events  |   2 +
 hw/misc/aspeed_peci.c| 152 
 hw/misc/fby35_sb_cpld.c  | 128 +
 hw/misc/intel_me.c   | 162 +
 hw/misc/meson.build  |   5 +-
 hw/misc/trace-events |  13 ++
 hw/nvram/eeprom_at24c.c  |   2 +
 hw/sensor/isl_pmbus_vr.c |  40 ++
 hw/sensor/lsm303dlhc_mag.c   |   2 +
 include/hw/arm/aspeed_soc.h  |   3 +
 include/hw/i2c/aspeed_i2c.h  |  11 ++
 include/hw/i2c/i2c.h |  30 
 include/hw/i2c/pmbus_device.h|   7 +
 include/hw/misc/aspeed_peci.h|  29 
 include/hw/sensor/isl_pmbus_vr.h |   5 +
 27 files changed, 971 insertions(+), 30 deletions(-)
 create mode 100644 hw/misc/aspeed_peci.c
 create mode 100644 hw/misc/fby35_sb_cpld.c
 create mode 100644 hw/misc/intel_me.c
 create mode 100644 include/hw/misc/aspeed_peci.h

-- 
2.37.0

[PATCH v3 01/14] hw/i2c/aspeed: Fix R_I2CD_FUN_CTRL reference

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

Very minor, doesn't affect functionality, but this is supposed to be
R_I2CC_FUN_CTRL (new-mode, not old-mode).

Fixes: ba2cccd64e9 ("aspeed: i2c: Add new mode support")
Signed-off-by: Peter Delevoryas 
---
 hw/i2c/aspeed_i2c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c
index 37ae1f2e04..ff33571954 100644
--- a/hw/i2c/aspeed_i2c.c
+++ b/hw/i2c/aspeed_i2c.c
@@ -552,7 +552,7 @@ static void aspeed_i2c_bus_new_write(AspeedI2CBus *bus, 
hwaddr offset,
   __func__);
 break;
 }
-bus->regs[R_I2CD_FUN_CTRL] = value & 0x007dc3ff;
+bus->regs[R_I2CC_FUN_CTRL] = value & 0x007dc3ff;
 break;
 case A_I2CC_AC_TIMING:
 bus->regs[R_I2CC_AC_TIMING] = value & 0x10ff;
-- 
2.37.0

[PATCH v3 03/14] hw/i2c/aspeed: Fix MASTER_EN missing error message

2022-06-29 Thread Peter Delevoryas

From: Peter Delevoryas 

aspeed_i2c_bus_is_master is checking if master mode is enabled in the I2C
bus controller's function-control register, not that slave mode is enabled
or something.  The error here is that the guest is trying to trigger an I2C
master mode command while master mode is not enabled.

Fixes: ba2cccd64e90f342 ("aspeed: i2c: Add new mode support")
Signed-off-by: Peter Delevoryas 
---
 hw/i2c/aspeed_i2c.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c
index cbaa7c96fc..c153a1a942 100644
--- a/hw/i2c/aspeed_i2c.c
+++ b/hw/i2c/aspeed_i2c.c
@@ -601,7 +601,7 @@ static void aspeed_i2c_bus_new_write(AspeedI2CBus *bus, 
hwaddr offset,
 }
 
 if (!aspeed_i2c_bus_is_master(bus)) {
-qemu_log_mask(LOG_UNIMP, "%s: slave mode not implemented\n",
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Master mode is not enabled\n",
   __func__);
 break;
 }
@@ -744,7 +744,7 @@ static void aspeed_i2c_bus_old_write(AspeedI2CBus *bus, 
hwaddr offset,
 }
 
 if (!aspeed_i2c_bus_is_master(bus)) {
-qemu_log_mask(LOG_UNIMP, "%s: slave mode not implemented\n",
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Master mode is not enabled\n",
   __func__);
 break;
 }
-- 
2.37.0

[PATCH v3 05/14] hw/i2c: add asynchronous send

2022-06-29 Thread Peter Delevoryas

From: Klaus Jensen 

Add an asynchronous version of i2c_send() that requires the slave to
explicitly acknowledge on the bus with i2c_ack().

The current master must use the new i2c_start_send_async() to indicate
that it wants to do an asynchronous transfer. This allows the i2c core
to check if the target slave supports this or not. This approach relies
on adding a new enum i2c_event member, which is why a bunch of other
devices need changes in their event handling switches.

Signed-off-by: Klaus Jensen 
Message-Id: <20220601210831.67259-5-...@irrelevant.dk>
Signed-off-by: Cédric Le Goater 
---
 hw/arm/pxa2xx.c|  2 ++
 hw/display/sii9022.c   |  2 ++
 hw/display/ssd0303.c   |  2 ++
 hw/i2c/core.c  | 36 +++-
 hw/i2c/smbus_slave.c   |  4 
 hw/i2c/trace-events|  2 ++
 hw/nvram/eeprom_at24c.c|  2 ++
 hw/sensor/lsm303dlhc_mag.c |  2 ++
 include/hw/i2c/i2c.h   | 16 
 9 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index f4f687df68..93dda83d7a 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -1305,6 +1305,8 @@ static int pxa2xx_i2c_event(I2CSlave *i2c, enum i2c_event 
event)
 case I2C_NACK:
 s->status |= 1 << 1;   /* set ACKNAK */
 break;
+default:
+return -1;
 }
 pxa2xx_i2c_update(s);
 
diff --git a/hw/display/sii9022.c b/hw/display/sii9022.c
index b591a58789..664fd4046d 100644
--- a/hw/display/sii9022.c
+++ b/hw/display/sii9022.c
@@ -76,6 +76,8 @@ static int sii9022_event(I2CSlave *i2c, enum i2c_event event)
 break;
 case I2C_NACK:
 break;
+default:
+return -1;
 }
 
 return 0;
diff --git a/hw/display/ssd0303.c b/hw/display/ssd0303.c
index aeae22da9c..d67b0ad7b5 100644
--- a/hw/display/ssd0303.c
+++ b/hw/display/ssd0303.c
@@ -196,6 +196,8 @@ static int ssd0303_event(I2CSlave *i2c, enum i2c_event 
event)
 case I2C_NACK:
 /* Nothing to do.  */
 break;
+default:
+return -1;
 }
 
 return 0;
diff --git a/hw/i2c/core.c b/hw/i2c/core.c
index 145dce6078..d4ba8146bf 100644
--- a/hw/i2c/core.c
+++ b/hw/i2c/core.c
@@ -161,7 +161,8 @@ static int i2c_do_start_transfer(I2CBus *bus, uint8_t 
address,
start condition.  */
 
 if (sc->event) {
-trace_i2c_event("start", s->address);
+trace_i2c_event(event == I2C_START_SEND ? "start" : "start_async",
+s->address);
 rv = sc->event(s, event);
 if (rv && !bus->broadcast) {
 if (bus_scanned) {
@@ -212,6 +213,11 @@ int i2c_start_send(I2CBus *bus, uint8_t address)
 return i2c_do_start_transfer(bus, address, I2C_START_SEND);
 }
 
+int i2c_start_send_async(I2CBus *bus, uint8_t address)
+{
+return i2c_do_start_transfer(bus, address, I2C_START_SEND_ASYNC);
+}
+
 void i2c_end_transfer(I2CBus *bus)
 {
 I2CSlaveClass *sc;
@@ -261,6 +267,23 @@ int i2c_send(I2CBus *bus, uint8_t data)
 return ret ? -1 : 0;
 }
 
+int i2c_send_async(I2CBus *bus, uint8_t data)
+{
+I2CNode *node = QLIST_FIRST(&bus->current_devs);
+I2CSlave *slave = node->elt;
+I2CSlaveClass *sc = I2C_SLAVE_GET_CLASS(slave);
+
+if (!sc->send_async) {
+return -1;
+}
+
+trace_i2c_send_async(slave->address, data);
+
+sc->send_async(slave, data);
+
+return 0;
+}
+
 uint8_t i2c_recv(I2CBus *bus)
 {
 uint8_t data = 0xff;
@@ -297,6 +320,17 @@ void i2c_nack(I2CBus *bus)
 }
 }
 
+void i2c_ack(I2CBus *bus)
+{
+if (!bus->bh) {
+return;
+}
+
+trace_i2c_ack();
+
+qemu_bh_schedule(bus->bh);
+}
+
 static int i2c_slave_post_load(void *opaque, int version_id)
 {
 I2CSlave *dev = opaque;
diff --git a/hw/i2c/smbus_slave.c b/hw/i2c/smbus_slave.c
index 5d10e27664..feb3ec6333 100644
--- a/hw/i2c/smbus_slave.c
+++ b/hw/i2c/smbus_slave.c
@@ -143,6 +143,10 @@ static int smbus_i2c_event(I2CSlave *s, enum i2c_event 
event)
 dev->mode = SMBUS_CONFUSED;
 break;
 }
+break;
+
+default:
+return -1;
 }
 
 return 0;
diff --git a/hw/i2c/trace-events b/hw/i2c/trace-events
index 209275ed2d..af181d43ee 100644
--- a/hw/i2c/trace-events
+++ b/hw/i2c/trace-events
@@ -4,7 +4,9 @@
 
 i2c_event(const char *event, uint8_t address) "%s(addr:0x%02x)"
 i2c_send(uint8_t address, uint8_t data) "send(addr:0x%02x) data:0x%02x"
+i2c_send_async(uint8_t address, uint8_t data) "send_async(addr:0x%02x) 
data:0x%02x"
 i2c_recv(uint8_t address, uint8_t data) "recv(addr:0x%02x) data:0x%02x"
+i2c_ack(void) ""
 
 # aspeed_i2c.c
 
diff --git a/hw/nvram/eeprom_at24c.c b/hw/nvram/eeprom_at24c.c
index 01a3093600..d695f6ae89 100644
--- a/hw/nvram/eeprom_at24c.c
+++ b/hw/nvram/eeprom_at24c.c
@@ -75,6 +75,8 @@ int at24c_eeprom_event(I2CSlave *s, enum i2c_event event)
 break;
 case I2C_NACK:
 break;
+default:
+return -1;

[PATCH v2] hw/nvme: Use ioeventfd to handle doorbell updates

2022-06-29 Thread Jinhao Fan

Add property "ioeventfd" which is enabled by default. When this is
enabled, updates on the doorbell registers will cause KVM to signal
an event to the QEMU main loop to handle the doorbell updates.
Therefore, instead of letting the vcpu thread run both guest VM and
IO emulation, we now use the main loop thread to do IO emulation and
thus the vcpu thread has more cycles for the guest VM.

Since ioeventfd does not tell us the exact value that is written, it is
only useful when shadow doorbell buffer is enabled, where we check
for the value in the shadow doorbell buffer when we get the doorbell
update event.

IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)

qd   1   4  16  64
qemu35 121 176 153
ioeventfd   41 133 258 313

Changes since v1:
- Return value handling code cleanup

Signed-off-by: Jinhao Fan 
---
 hw/nvme/ctrl.c | 100 -
 hw/nvme/nvme.h |   5 +++
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index c952c34f94..fb5466678a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
NvmeRequest *req)
 
 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
-timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+if (req->sq->ioeventfd_enabled) {
+/* Post CQE directly since we are in main loop thread */
+nvme_post_cqes(cq);
+} else {
+/* Schedule the timer to post CQE later since we are in vcpu thread */
+timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+}
 }
 
 static void nvme_process_aers(void *opaque)
@@ -4195,10 +4202,76 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_cq_notifier(EventNotifier *e)
+{
+NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
+NvmeCtrl *n = cq->ctrl;
+
+event_notifier_test_and_clear(&cq->notifier);
+
+nvme_update_cq_head(cq);
+
+if (cq->tail == cq->head) {
+if (cq->irq_enabled) {
+n->cq_pending--;
+}
+
+nvme_irq_deassert(n, cq);
+}
+
+nvme_post_cqes(cq);
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
+{
+NvmeCtrl *n = cq->ctrl;
+uint16_t offset = (cq->cqid << 3) + (1 << 2);
+int ret;
+
+ret = event_notifier_init(&cq->notifier, 0);
+if (ret < 0) {
+return ret;
+}
+
+event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+memory_region_add_eventfd(&n->iomem,
+  0x1000 + offset, 4, false, 0, &cq->notifier);
+
+return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
+
+event_notifier_test_and_clear(&sq->notifier);
+
+nvme_process_sq(sq);
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
+{
+NvmeCtrl *n = sq->ctrl;
+uint16_t offset = sq->sqid << 3;
+int ret;
+
+ret = event_notifier_init(&sq->notifier, 0);
+if (ret < 0) {
+return ret;
+}
+
+event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+memory_region_add_eventfd(&n->iomem,
+  0x1000 + offset, 4, false, 0, &sq->notifier);
+
+return 0;
+}
+
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 {
 n->sq[sq->sqid] = NULL;
 timer_free(sq->timer);
+event_notifier_cleanup(&sq->notifier);
 g_free(sq->io_req);
 if (sq->sqid) {
 g_free(sq);
@@ -4271,6 +4344,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
uint64_t dma_addr,
 if (n->dbbuf_enabled) {
 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+
+if (n->params.ioeventfd && sq->sqid != 0) {
+if (!nvme_init_sq_ioeventfd(sq)) {
+sq->ioeventfd_enabled = true;
+}
+}
 }
 
 assert(n->cq[cqid]);
@@ -4577,6 +4656,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
 n->cq[cq->cqid] = NULL;
 timer_free(cq->timer);
+event_notifier_cleanup(&cq->notifier);
 if (msix_enabled(&n->parent_obj)) {
 msix_vector_unuse(&n->parent_obj, cq->vector);
 }
@@ -4635,6 +4715,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
uint64_t dma_addr,
 if (n->dbbuf_enabled) {
 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+if (n->params.ioeventfd && cqid != 0) {
+if (!nvme_init_cq_ioeventfd(cq)) {
+cq->ioeventfd_enabled = true;
+}
+}
 }
 n->cq[cqid] = cq;
 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -5793,6 +5879,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
 uint64_t eis_addr =

[PATCH] target/s390x: Exit tb after executing ex_value

2022-06-29 Thread Richard Henderson

When EXECUTE sets ex_value to interrupt the constructed instruction,
we implicitly disable interrupts so that the value is not corrupted.
Exit to the main loop after execution, so that we re-evaluate any
pending interrupts.

Reported-by: Sven Schnelle 
Signed-off-by: Richard Henderson 
---

Hi Sven.  Will you test this vs your testcase?  Thanks,

r~

---
 target/s390x/tcg/translate.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index fd2433d625..e52c2a4a6f 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -6620,11 +6620,18 @@ static void s390x_tr_translate_insn(DisasContextBase 
*dcbase, CPUState *cs)
 
 dc->base.is_jmp = translate_one(env, dc);
 if (dc->base.is_jmp == DISAS_NEXT) {
-uint64_t page_start;
-
-page_start = dc->base.pc_first & TARGET_PAGE_MASK;
-if (dc->base.pc_next - page_start >= TARGET_PAGE_SIZE || dc->ex_value) 
{
-dc->base.is_jmp = DISAS_TOO_MANY;
+if (unlikely(dc->ex_value)) {
+/*
+ * Because ex_value was set, s390_cpu_exec_interrupt may
+ * have skipped an interrupt.  Exit to the main loop to
+ * re-evaluate interrupts, as we do for LCTL.
+ */
+dc->base.is_jmp = DISAS_PC_STALE_NOCHAIN;
+} else {
+uint64_t page_start = dc->base.pc_first & TARGET_PAGE_MASK;
+if (dc->base.pc_next - page_start >= TARGET_PAGE_SIZE) {
+dc->base.is_jmp = DISAS_TOO_MANY;
+}
 }
 }
 }
-- 
2.34.1

Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates

2022-06-29 Thread Jinhao Fan

at 4:13 AM, Klaus Jensen  wrote:

> On Jun 27 18:48, Jinhao Fan wrote:
>> Add property "ioeventfd" which is enabled by default. When this is
>> enabled, updates on the doorbell registers will cause KVM to signal
>> an event to the QEMU main loop to handle the doorbell updates.
>> Therefore, instead of letting the vcpu thread run both guest VM and
>> IO emulation, we now use the main loop thread to do IO emulation and
>> thus the vcpu thread has more cycles for the guest VM.
>> 
>> Since ioeventfd does not tell us the exact value that is written, it is
>> only useful when shadow doorbell buffer is enabled, where we check
>> for the value in the shadow doorbell buffer when we get the doorbell
>> update event.
>> 
>> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
>> 
>> qd   1   4  16  64
>> qemu35 121 176 153
>> ioeventfd   41 133 258 313
>> 
>> Signed-off-by: Jinhao Fan 
>> ---
>> hw/nvme/ctrl.c | 97 +-
>> hw/nvme/nvme.h |  5 +++
>> 2 files changed, 101 insertions(+), 1 deletion(-)
>> 
>> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> index c952c34f94..787b89f7d3 100644
>> --- a/hw/nvme/ctrl.c
>> +++ b/hw/nvme/ctrl.c
>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue 
>> *cq, NvmeRequest *req)
>> 
>> QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>> QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
>> -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +
>> +if (req->sq->ioeventfd_enabled) {
>> +/* Post CQE directly since we are in main loop thread */
>> +nvme_post_cqes(cq);
>> +} else {
>> +/* Schedule the timer to post CQE later since we are in vcpu thread 
>> */
>> +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +}
>> }
>> 
>> static void nvme_process_aers(void *opaque)
>> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
>> *req)
>> return NVME_INVALID_OPCODE | NVME_DNR;
>> }
>> 
>> +static void nvme_cq_notifier(EventNotifier *e)
>> +{
>> +NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
>> +NvmeCtrl *n = cq->ctrl;
>> +
>> +event_notifier_test_and_clear(&cq->notifier);
>> +
>> +nvme_update_cq_head(cq);
>> +
>> +if (cq->tail == cq->head) {
>> +if (cq->irq_enabled) {
>> +n->cq_pending--;
>> +}
>> +
>> +nvme_irq_deassert(n, cq);
>> +}
>> +
>> +nvme_post_cqes(cq);
>> +}
>> +
>> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
>> +{
>> +NvmeCtrl *n = cq->ctrl;
>> +uint16_t offset = (cq->cqid << 3) + (1 << 2);
>> +int ret;
>> +
>> +if ((ret = event_notifier_init(&cq->notifier, 0))) {
>> +return ret;
>> +}
> 
> Don't assign in conditionals and rely on the implicit value. It's too
> error prone. Split into
> 
>  ret = event_notifier_init(&cq->notifier, 0);
>  if (ret < 0) {
>return ret;
>  }
> 
>> +
>> +event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
>> +memory_region_add_eventfd(&n->iomem,
>> +  0x1000 + offset, 4, false, 0, &cq->notifier);
>> +
>> +return 0;
>> +}
>> +
>> +static void nvme_sq_notifier(EventNotifier *e)
>> +{
>> +NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
>> +
>> +event_notifier_test_and_clear(&sq->notifier);
>> +
>> +nvme_process_sq(sq);
>> +}
>> +
>> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
>> +{
>> +NvmeCtrl *n = sq->ctrl;
>> +uint16_t offset = sq->sqid << 3;
>> +int ret;
>> +
>> +if ((ret = event_notifier_init(&sq->notifier, 0))) {
>> +return ret;
>> +}
> 
> Same as above.
> 
>> +
>> +event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
>> +memory_region_add_eventfd(&n->iomem,
>> +  0x1000 + offset, 4, false, 0, &sq->notifier);
>> +
>> +return 0;
>> +}
>> +
>> static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>> {
>> n->sq[sq->sqid] = NULL;
>> timer_free(sq->timer);
>> +event_notifier_cleanup(&sq->notifier);
>> g_free(sq->io_req);
>> if (sq->sqid) {
>> g_free(sq);
>> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
>> uint64_t dma_addr,
>>  uint16_t sqid, uint16_t cqid, uint16_t size)
>> {
>> int i;
>> +int ret;
>> NvmeCQueue *cq;
>> 
>> sq->ctrl = n;
>> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
>> uint64_t dma_addr,
>> if (n->dbbuf_enabled) {
>> sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>> sq->ei_addr = n->dbbuf_eis + (sqid << 3);
>> +
>> +if (n->params.ioeventfd && sq->sqid != 0) {
>> +ret = nvme_init_sq_ioeventfd(sq);
>> +sq->ioeventfd_enabled = ret == 0;
>> +}
> 
> Not using ret for anything really, so
> 
>  if (!nvme_init_sq_ioeventfd(sq)) {
>sq->ioeventfd_enabled = true;
>  }
> 
> should do.
> 
>> }
>> 
>> assert(n->cq[cqid]);
>> @@

Re: qemu-system-s390x hang in tcg

2022-06-29 Thread Richard Henderson


On 6/29/22 16:16, Sven Schnelle wrote:

Thanks, that was very helpful. I added debugging and it turned out
that the TB is left because of a pending irq. The code then calls
s390_cpu_exec_interrupt:

bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
{
 if (interrupt_request & CPU_INTERRUPT_HARD) {
 S390CPU *cpu = S390_CPU(cs);
 CPUS390XState *env = &cpu->env;

 if (env->ex_value) {
 /* Execution of the target insn is indivisible from
the parent EXECUTE insn.  */
 return false;
 }
 if (s390_cpu_has_int(cpu)) {
 s390_cpu_do_interrupt(cs);
 return true;
 }
 if (env->psw.mask & PSW_MASK_WAIT) {
 /* Woken up because of a floating interrupt but it has already
  * been delivered. Go back to sleep. */
 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
 }
 }
 return false;
}

Note the 'if (env->ex_value) { }' check. It looks like this function
just returns false in case tcg is executing an EX instruction. After
that the information that the TB should be exited because of an
interrupt is gone. So the TB's are never exited again, although the
interrupt wasn't handled. At least that's my assumption now, if i'm
wrong please tell me.


Ah, yes, I see.

We wanted to treat ex_value != 0 as if interrupts are disabled, because we have no way of 
stacking that value for re-execution after the interrupt (which itself could use EXECUTE).


One solution might be to zap ex_value and arrange to re-execute the EXECUTE instruction 
after the interrupt.


Another solution is to generate an exit from any TB translating ex_value, so that 
interrupts are re-examined.  This is probably cleanest.  I'll prepare a patch.



r~

Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates

2022-06-29 Thread Jinhao Fan



> That looks correct: we don't need the ioeventfd, since it is an optional
> optimization.
> 
> I would just suggest making this easier to read. For example, in
> nvme_init_sq_ioeventfd(), instead of assigning within a conditional:
> 
>if ((ret = event_notifier_init(&sq->notifier, 0))) {
> 
> Do each part separately:
> 
>ret = event_notifier_init(&sq->notifier, 0);
>if (ret) {

Thanks for the comment! Will change in the next version.

> 
>> I’ve also been wondering whether using irqfd for sending interrupts can
>> bring some benefits. I’m not familiar with how QEMU emulates interrupts.
>> What do you think of irqfd’s?
> 
> Not sure about this mechanism, I'll need to look into it.

Since irqfd is internally a counter, I guess it may be able to “coalesce”
interrupts so that performance can be improved. I will try it tomorrow.

Re: [PATCH v2 08/13] hw/i2c/pmbus: Reset out buf after switching pages

2022-06-29 Thread Peter Delevoryas



> On Jun 29, 2022, at 11:28 AM, Peter Delevoryas  wrote:
> 
> 
> 
>> On Jun 29, 2022, at 11:05 AM, Titus Rwantare  wrote:
>> 
>> On Tue, 28 Jun 2022 at 20:36, Peter Delevoryas
>>  wrote:
>>> 
>>> When a pmbus device switches pages, it should clear its output buffer so
>>> that the next transaction doesn't emit data from the previous page.
>>> 
>>> Fixes: 3746d5c15e70570b ("hw/i2c: add support for PMBus”)
>>> Signed-off-by: Peter Delevoryas 
>>> ---
>>> hw/i2c/pmbus_device.c | 1 +
>>> 1 file changed, 1 insertion(+)
>>> 
>>> diff --git a/hw/i2c/pmbus_device.c b/hw/i2c/pmbus_device.c
>>> index 62885fa6a1..efddc36fd9 100644
>>> --- a/hw/i2c/pmbus_device.c
>>> +++ b/hw/i2c/pmbus_device.c
>>> @@ -1088,6 +1088,7 @@ static int pmbus_write_data(SMBusDevice *smd, uint8_t 
>>> *buf, uint8_t len)
>>> 
>>> if (pmdev->code == PMBUS_PAGE) {
>>> pmdev->page = pmbus_receive8(pmdev);
>>> + pmdev->out_buf_len = 0;
>>> return 0;
>>> }
>>> 
>> 
>> I suspect you were running into this because ic_device_id was putting
>> too much data in the output buffer. Still, I wouldn't want the buffer
>> cleared if the page hasn't changed. Some drivers write the same page
>> before every read.
> 
> Yes you’re correct: this is the code that was querying the ic_device_id [1]:
> 
> memset(&msg, 0, sizeof(msg));
> msg.bus = sensor_config[index].port;
> msg.target_addr = sensor_config[index].target_addr;
> msg.tx_len = 1;
> msg.rx_len = 7;
> msg.data[0] = PMBUS_IC_DEVICE_ID;
> if (i2c_master_read(&msg, retry)) {
> printf("Failed to read VR IC_DEVICE_ID: register(0x%x)\n", 
> PMBUS_IC_DEVICE_ID);
> return;
> }
> 
> By sending a buffer that was way larger than the rx buffer of 7, it was
> leaving stuff lying around for the next query.
> 
> I’ll test it out and see what happens if I fix the IC_DEVICE_ID length
> transmitted by the ISL69259 to 4, maybe we don’t need this patch. But,
> at the very least, I’ll make sure to gate this on the page value changing,
> not just being set.

Hmmm, actually I’m not going to change this. It seems that our Zephyr code
is actually querying one of our devices multiple times, setting the page
to zero before each read, and expecting it to return the device ID without
any wrapping. If it was only resetting the output buffer on page
commands that change the value of the page, then the Zephyr code
wouldn’t work. I also added some printing and tested it on some hardware:

check_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff]
endpoint
check_vr_type: i2c4 76 page 01 [04 00 81 d2 49 56 ff]

[00:00:00.
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff]
003,000]
heck_vr_type: i2c4 60 page 01 [04 00 81 d2 49 3c ff]
m usb_d
check_vr_type: i2c4 62 page 00 [04 00 81 d2 49 d4 ff] <
c_aspeed: se
check_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff]
lect ep[0x3]
check_vr_type: i2c4 76 page 01 [04 00 81 d2 49 56 ff]
 as OUT endp
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff]
oint
[0
check_vr_type: i2c4 60 page 01 [04 00 81 d2 49 3c ff]
0:00:00.059,
check_vr_type: i2c4 62 page 00 [04 00 81 d2 49 d4 ff] <
000]  kcs_aspee
check_vr_type: i2c4 76 page 01 [04 00 81 d2 49 56 ff]
d: KCS3: add
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff]
r=0xca2, idr
check_vr_type: i2c4 60 page 01 [04 00 81 d2 49 3c ff]
=0x2c, odr=0
check_vr_type: i2c4 62 page 00 [04 00 81 d2 49 d4 ff]
x38, str=0x4
check_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff]
4

[00:
check_vr_type: i2c4 76 page 01 [04 00 81 d2 49 56 ff]
00:00.059,00
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff]
0]  usb_dc_asp
check_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff f
f ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff]
eed: select ep[0x3] as OUT endpoint
heck_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff]
m
[00:00:00.059,000]  kcs_asp
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff f
f ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff]
eed: KCS3: addr=0xca2, idr=0x2c, odr=0
check_vr_type: i2c4 60 page 00 [04 00 81 d2 49 3c ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff f
f ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff]
x38, str=0x44

[00:00:00.060,000]
check_vr_type: i2c4 62 page 00 [04 00 81 d2 49 d4 ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff f
f ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff]
  spi_nor_multi_dev: [1216
check_vr_type: i2c4 76 page 00 [04 00 81 d2 49 56 ff ff ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff f
f ff ff ff ff ff ff

[PATCH v2] io_uring: fix short read slow path

2022-06-29 Thread Dominique Martinet

sqeq.off here is the offset to read within the disk image, so obviously
not 'nread' (the amount we just read), but as the author meant to write
its current value incremented by the amount we just read.

Normally recent versions of linux will not issue short reads,
but it can happen so we should fix this.

This led to weird image corruption when a short read happened

Fixes: 6663a0a33764 ("block/io_uring: implements interfaces for io_uring")
Link: https://lkml.kernel.org/r/yrrfgo4a1js0g...@atmark-techno.com
Signed-off-by: Dominique Martinet 
---
v1 -> v2: also updated total_read to use += as suggested by Kevin,
thank you!

I've tested this quickly by making short reads "recursive", e.g. added
the following to luring_resubmit_short_read() after setting 'remaining':
if (remaining > 4096) remaining -= 4096;

so when we ask for more we issue an extra short read, making sure we go
through the two short reads path.
(Unfortunately I wasn't quite sure what to fiddle with to issue short
reads in the first place, I tried cutting one of the iovs short in
luring_do_submit() but I must not have been doing it properly as I ended
up with 0 return values which are handled by filling in with 0 (reads
after eof) and that didn't work well)

Anyway, this looks OK to me now.

Thanks,
Dominique

 block/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/io_uring.c b/block/io_uring.c
index d48e472e74cb..b238661740f5 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -89,7 +89,7 @@ static void luring_resubmit_short_read(LuringState *s, 
LuringAIOCB *luringcb,
 trace_luring_resubmit_short_read(s, luringcb, nread);
 
 /* Update read position */
-luringcb->total_read = nread;
+luringcb->total_read += nread;
 remaining = luringcb->qiov->size - luringcb->total_read;
 
 /* Shorten qiov */
@@ -103,7 +103,7 @@ static void luring_resubmit_short_read(LuringState *s, 
LuringAIOCB *luringcb,
   remaining);
 
 /* Update sqe */
-luringcb->sqeq.off = nread;
+luringcb->sqeq.off += nread;
 luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
 luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
 
-- 
2.35.1

[PATCH v3 2/2] ui/gtk: a new array param monitor to specify the target displays

2022-06-29 Thread Dongwon Kim

New integer array parameter, 'monitor' is for specifying the target
monitors where individual GTK windows are placed upon launching.

Monitor numbers in the array are associated with virtual consoles
in the order of [VC0, VC1, VC2 ... VCn].

Every GTK window containing each VC will be placed in the region
of corresponding monitors.

Usage: -display gtk,monitor.<id of VC>=<target monitor>,..
   ex)-display gtk,monitor.0=1,monitor.1=0

v3: - Revised commit message
- Rewrote description of the new parameter (Markus Armbruster)
- Replaced unnecessary 'for' loop with 'if' condition
  (Markus Armbruster)

Cc: Daniel P. Berrangé 
Cc: Markus Armbruster 
Cc: Philippe Mathieu-Daudé 
Cc: Paolo Bonzini 
Cc: Gerd Hoffmann 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 qapi/ui.json|  9 -
 qemu-options.hx |  3 ++-
 ui/gtk.c| 31 +--
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/qapi/ui.json b/qapi/ui.json
index 413371d5e8..7b4c098bb4 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
@@ -1195,12 +1195,19 @@
 #   assuming the guest will resize the display to match
 #   the window size then.  Otherwise it defaults to "off".
 #   Since 3.1
+# @monitor: Array of numbers, each of which represents physical
+#   monitor where GTK window containing a given VC will be
+#   placed. Each monitor number in the array will be
+#   associated with a virtual-console starting from VC0.
+#
+#   since 7.1
 #
 # Since: 2.12
 ##
 { 'struct'  : 'DisplayGTK',
   'data': { '*grab-on-hover' : 'bool',
-'*zoom-to-fit'   : 'bool'  } }
+'*zoom-to-fit'   : 'bool',
+'*monitor'   : ['uint16']  } }
 
 ##
 # @DisplayEGLHeadless:
diff --git a/qemu-options.hx b/qemu-options.hx
index 377d22fbd8..aabdfb0636 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1938,7 +1938,8 @@ DEF("display", HAS_ARG, QEMU_OPTION_display,
 #endif
 #if defined(CONFIG_GTK)
 "-display gtk[,full-screen=on|off][,gl=on|off][,grab-on-hover=on|off]\n"
-"[,show-cursor=on|off][,window-close=on|off]\n"
+"[,monitor.=][,show-cursor=on|off]"
+"[,window-close=on|off]\n"
 #endif
 #if defined(CONFIG_VNC)
 "-display vnc=[,]\n"
diff --git a/ui/gtk.c b/ui/gtk.c
index e6878c3209..935176e614 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -2316,6 +2316,10 @@ static void gtk_display_init(DisplayState *ds, 
DisplayOptions *opts)
 GtkDisplayState *s = g_malloc0(sizeof(*s));
 GdkDisplay *window_display;
 GtkIconTheme *theme;
+GtkWidget *win;
+GdkRectangle dest;
+uint16List *mon;
+int n_mon;
 int i;
 char *dir;
 
@@ -2393,10 +2397,33 @@ static void gtk_display_init(DisplayState *ds, 
DisplayOptions *opts)
 gtk_menu_item_activate(GTK_MENU_ITEM(s->untabify_item));
 }
 }
-if (opts->has_full_screen &&
-opts->full_screen) {
+
+if (opts->u.gtk.has_monitor) {
+i = 0;
+n_mon = gdk_display_get_n_monitors(window_display);
+for (mon = opts->u.gtk.monitor; mon; mon = mon->next) {
+if (mon->value < n_mon && i < s->nb_vcs) {
+win = s->vc[i].window ? s->vc[i].window : s->window;
+if (opts->has_full_screen && opts->full_screen) {
+gtk_window_fullscreen_on_monitor(
+GTK_WINDOW(win),
+gdk_display_get_default_screen(window_display),
+mon->value);
+} else {
+gdk_monitor_get_geometry(
+gdk_display_get_monitor(window_display, mon->value),
+);
+gtk_window_move(GTK_WINDOW(win),
+dest.x, dest.y);
+}
+i++;
+}
+}
+} else if (opts->has_full_screen &&
+   opts->full_screen) {
 gtk_menu_item_activate(GTK_MENU_ITEM(s->full_screen_item));
 }
+
 if (opts->u.gtk.has_grab_on_hover &&
 opts->u.gtk.grab_on_hover) {
 gtk_menu_item_activate(GTK_MENU_ITEM(s->grab_on_hover_item));
-- 
2.20.1

[PATCH v3 1/3] ui/gtk: detach VCs for additional guest displays

2022-06-29 Thread Dongwon Kim

Detaching any additional guest displays in case multiple displays are
assigned to the guest OS (e.g. max_outputs=n) so that all of them are
visible upon launching.

v2: - making sure type of VC is GD_VC_GFX before qemu_console_is_graphic
  (Gerd Hoffmann)
- vc[0] is always primary guest display so we won't need n_gfx_vcs
  (Gerd Hoffmann)
- making sure detached window's size same as original surface size
  (Daniel P. Berrangé)

Cc: Daniel P. Berrangé 
Cc: Markus Armbruster 
Cc: Philippe Mathieu-Daudé 
Cc: Paolo Bonzini 
Cc: Gerd Hoffmann 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 ui/gtk.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ui/gtk.c b/ui/gtk.c
index 2a791dd2aa..e6878c3209 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -1361,6 +1361,11 @@ static void gd_menu_untabify(GtkMenuItem *item, void 
*opaque)
 
 g_signal_connect(vc->window, "delete-event",
  G_CALLBACK(gd_tab_window_close), vc);
+
+gtk_window_set_default_size(GTK_WINDOW(vc->window),
+surface_width(vc->gfx.ds),
+surface_height(vc->gfx.ds));
+
 gtk_widget_show_all(vc->window);
 
 if (qemu_console_is_graphic(vc->gfx.dcl.con)) {
@@ -2311,6 +2316,7 @@ static void gtk_display_init(DisplayState *ds, 
DisplayOptions *opts)
 GtkDisplayState *s = g_malloc0(sizeof(*s));
 GdkDisplay *window_display;
 GtkIconTheme *theme;
+int i;
 char *dir;
 
 if (!gtkinit) {
@@ -2381,7 +2387,12 @@ static void gtk_display_init(DisplayState *ds, 
DisplayOptions *opts)
 gtk_widget_set_sensitive(s->copy_item,
  vc && vc->type == GD_VC_VTE);
 #endif
-
+for (i = 1; i < s->nb_vcs; i++) {
+if (vc->type == GD_VC_GFX &&
+qemu_console_is_graphic(s->vc[i].gfx.dcl.con)) {
+gtk_menu_item_activate(GTK_MENU_ITEM(s->untabify_item));
+}
+}
 if (opts->has_full_screen &&
 opts->full_screen) {
 gtk_menu_item_activate(GTK_MENU_ITEM(s->full_screen_item));
-- 
2.30.2

[PATCH v3 0/2] handling guest multiple displays

2022-06-29 Thread Dongwon Kim

This patch series is for adding some useful features for the guest os with
multi-displays. First patch is to make all of guest displays visible
when guest os is launched using "detach". Second patch is for providing
a method to assign each guest display to specific physical monitor,
which would be useful if someone wants to directly full-screen individual
guest scanouts to host's physical displays.

Changes in v3:

* ui/gtk: a new array param monitor to specify the target

  - Revised commit message
  - Rewrote description of the new parameter (Markus Armbruster)
  - Replaced unnecessary 'for' loop with 'if' condition
(Markus Armbruster)

Changes in v2:

* ui/gtk: detach VCS for additional guest displays

  - check if the type of VC is GD_VC_GFX before qemu_console_is_graphic
(Gerd Hoffmann)
  - vc[0] is always primary guest display so we won't need n_gfx_vcs
(Gerd Hoffmann)
  - making sure detached window's size same as original surface size
(Daniel P. Berrangé)

Dongwon Kim (2):
  ui/gtk: detach VCS for additional guest displays (v3)
  ui/gtk: a new array param monitor to specify the target displays (v3)

 qapi/ui.json|  7 ++-
 qemu-options.hx |  2 +-
 ui/gtk.c| 43 +--
 3 files changed, 48 insertions(+), 4 deletions(-)

-- 
2.30.2

Re: [PULL 00/11] Trivial branch for 7.1 patches

2022-06-29 Thread Richard Henderson


On 6/29/22 14:38, Laurent Vivier wrote:

The following changes since commit ad4c7f529a279685da84297773b4ec8080153c2d:

   Merge tag 'pull-semi-20220628' of https://gitlab.com/rth7680/qemu into 
staging (2022-06-28 10:24:31 +0530)

are available in the Git repository at:

   https://gitlab.com/laurent_vivier/qemu.git 
tags/trivial-branch-for-7.1-pull-request

for you to fetch changes up to c379bd7551f34e42c4c935783c0c08bab41d70c1:

   hw/i386/xen/xen-hvm: Inline xen_piix_pci_write_config_client() and remove it 
(2022-06-29 00:24:59 +0200)


trivial patches pull request 20220629


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/7.1 as 
appropriate.


r~






Bernhard Beschow (3):
   hw/pci-host/i440fx: Remove unused parameter from i440fx_init()
   hw/i386/xen/xen-hvm: Allow for stubbing xen_set_pci_link_route()
   hw/i386/xen/xen-hvm: Inline xen_piix_pci_write_config_client() and
 remove it

Dr. David Alan Gilbert (2):
   Trivial: 3 char repeat typos
   trivial typos: namesapce

Eugenio Pérez (1):
   util: Return void on iova_tree_remove

Guo Zhi (1):
   vga: avoid crash if no default vga card

Lev Kujawski (1):
   hw/ide/atapi.c: Correct typos (CD-CDROM -> CD-ROM)

Markus Armbruster (1):
   MAINTAINERS: Add softmmu/runstate.c to "Main loop"

Philippe Mathieu-Daudé (1):
   qom/object: Remove circular include dependency

Thomas Huth (1):
   common-user: Only compile the common user code if have_user is set

  MAINTAINERS  |  1 +
  common-user/meson.build  |  4 
  hw/9pfs/9p-xattr-user.c  |  8 
  hw/acpi/nvdimm.c |  2 +-
  hw/i386/pc_piix.c|  3 ---
  hw/i386/xen/xen-hvm.c| 17 ++---
  hw/ide/atapi.c   |  4 ++--
  hw/intc/openpic.c|  2 +-
  hw/isa/piix3.c   | 15 ++-
  hw/net/imx_fec.c |  2 +-
  hw/nvme/ctrl.c   |  2 +-
  hw/pci-host/i440fx.c |  4 +---
  hw/pci/pcie_aer.c|  2 +-
  hw/pci/shpc.c|  3 ++-
  hw/ppc/spapr_caps.c  |  2 +-
  hw/scsi/spapr_vscsi.c|  2 +-
  include/hw/pci-host/i440fx.h |  1 -
  include/hw/xen/xen.h |  2 +-
  include/hw/xen/xen_common.h  |  6 --
  include/qemu/iova-tree.h |  4 +---
  include/qom/object.h |  1 -
  qapi/net.json|  2 +-
  softmmu/vl.c |  3 ++-
  stubs/xen-hw-stub.c  |  3 ++-
  tools/virtiofsd/passthrough_ll.c |  2 +-
  ui/input.c   |  2 +-
  util/iova-tree.c |  4 +---
  27 files changed, 47 insertions(+), 56 deletions(-)

[PATCH v4 3/3] tests/qtest/i440fx-test.c: Enable full test of i440FX PAM operation

2022-06-29 Thread Lev Kujawski

With the prior patch in this series adding support for RE^WE PAM
semantics, the '#ifndef BROKEN' segments of test_i440fx_pam can now be
enabled.

* tests/qtest/i440fx-test.c
test_i440fx_pam:
- Verify that changing attributes does not affect the initial contents
  of the PAM region;
- Verify that the first new mask is written before switching
  attributes;
- Verify that just PAM_WE works by writing a new mask;
- Switch back to PAM_RE after PAM_WE to read the new mask;
- Tighten logic of the !WE write test because we know what the
  original contents were; and
- Write the last mask before testing for it.

Signed-off-by: Lev Kujawski 
---
(v4) Minor tweaks:
- Use spacing to delineate the functionality being tested.
- Ensure that PAM_WE is working by writing a mask within
  that portion.
(v2-v3) No changes

 tests/qtest/i440fx-test.c | 31 +--
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/tests/qtest/i440fx-test.c b/tests/qtest/i440fx-test.c
index 6d7d4d8d8f..b761ae53cb 100644
--- a/tests/qtest/i440fx-test.c
+++ b/tests/qtest/i440fx-test.c
@@ -229,6 +229,7 @@ static void test_i440fx_pam(gconstpointer opaque)
 
 g_test_message("Checking area 0x%05x..0x%05x",
pam_area[i].start, pam_area[i].end);
+
 /* Switch to RE for the area */
 pam_set(dev, i, PAM_RE);
 /* Verify the RAM is all zeros */
@@ -236,33 +237,35 @@ static void test_i440fx_pam(gconstpointer opaque)
 
 /* Switch to WE for the area */
 pam_set(dev, i, PAM_RE | PAM_WE);
+/* Verify the RAM is still all zeros */
+g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0));
 /* Write out a non-zero mask to the full area */
 write_area(pam_area[i].start, pam_area[i].end, 0x42);
-
-#ifndef BROKEN
-/* QEMU only supports a limited form of PAM */
+/* Verify the area contains the new mask */
+g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0x42));
 
 /* Switch to !RE for the area */
 pam_set(dev, i, PAM_WE);
 /* Verify the area is not our mask */
 g_assert(!verify_area(pam_area[i].start, pam_area[i].end, 0x42));
-#endif
-
-/* Verify the area is our new mask */
-g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0x42));
+/* Write out a new mask */
+write_area(pam_area[i].start, pam_area[i].end, 0xaa);
+/* Verify the area is not our new mask */
+g_assert(!verify_area(pam_area[i].start, pam_area[i].end, 0xaa));
 
+/* Switch to !WE for the area */
+pam_set(dev, i, PAM_RE);
+/* Verify the area is the new mask */
+g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0xaa));
 /* Write out a new mask */
 write_area(pam_area[i].start, pam_area[i].end, 0x82);
-
-#ifndef BROKEN
-/* QEMU only supports a limited form of PAM */
-
-/* Verify the area is not our mask */
-g_assert(!verify_area(pam_area[i].start, pam_area[i].end, 0x82));
+/* Verify the area is not the new mask */
+g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0xaa));
 
 /* Switch to RE for the area */
 pam_set(dev, i, PAM_RE | PAM_WE);
-#endif
+/* Write out a new mask again */
+write_area(pam_area[i].start, pam_area[i].end, 0x82);
 /* Verify the area is our new mask */
 g_assert(verify_area(pam_area[i].start, pam_area[i].end, 0x82));
 
-- 
2.34.1

[PATCH v4 1/3] hw/pci-host/pam.c: Fully support RE^WE semantics of i440FX PAM

2022-06-29 Thread Lev Kujawski

The Programmable Attribute Registers (PAM) of QEMU's emulated i440FX
chipset now fully support the exclusive Read Enable (RE) and Write
Enable (WE) modes by forwarding reads of the applicable PAM region to
RAM and writes to the bus or vice versa, respectively.  This chipset
functionality is often used by x86 firmware for shadowing ROM.

For the WE case, the prior behavior was to create a RAM alias, but
reads were not forwarded to the bus.  This prevents the classic BIOS
shadowing mechanism, which is executing from flash ROM while copying
the contents to the equivalent location in RAM.

Support for PAM_WE involved adding a ROMD mode to QEMU's Memory
Sections, similar to the existing support for read-only sections.
When a write is made to a read-only memory region within a ROMD
section, QEMU will conduct a downwards hierarchical search from the
root for a ROMD region that is marked read-only (unlike normal ROMD
regions); this region receives the write as an MMIO operation.

* accel/kvm/kvm-all.c
- kvm_set_phys_mem: Also ignore read-only memory regions that are not
  backed by RAM.

* accel/tcg/cputlb.c
- tlb_set_page_with_attrs: Handle the case of RAM regions within ROMD
  sections.
- io_writex: Search for the actual ROMD memory region when writing.

* hw/i386/pc.c
- Split the RAM into conventional and extended areas, so as to avoid
  double-aliasing with PAM.
- Create a new E820 entry to account for the resultant gap over the
  ISA MMIO area [0xA0000-0xFFFFF], which firmware or the operating
  system can fill with cache/shadow memory if desired.

* include/hw/pci-host/i440fx.h
* include/hw/pci-host/q35.h
- Add address spaces for both RAM and MMIO to the PCI host state.

* hw/pci-host/i440fx.c
* hw/pci-host/q35.c
- Initialize RAM and MMIO address spaces for use by pam.c.
- Adjust init_pam and pam_update calls for updated parameters.
- Now that RAM is not normally exposed within the ISA MMIO area,
  invert the logic of enabling/disabling the SMRAM region.

* hw/pci-host/pam.c
- pam_rmem_write: Write to the PCI address space to forward ISA area
  MMIO writes.
- pam_wmem_write: Write to the RAM address space, aborting when there
  are memory transaction errors.
- Make the PAM memory region a read-only ROMD container in the PAM_RE
  and PAM_WE modes.  Reads will pass through to any underlying ROMs,
  enabling the traditional execute-in-place behavior.
- Remove PAM aliases entirely when mode 0 (the default) is active, as
  it is no longer necessary to hide underlying RAM.

* include/exec/memory.h
- Add romd_mode to MemoryRegionSection, check when testing for
  equivalency.
- Add a prototype for the hierarchical search function
  memory_region_find_romd_container.

* softmmu/memory.c
- render_memory_region: Mark romd_mode when a read-only ROMD memory
  region container is encountered.
- Define the hierarchical search function
  memory_region_find_romd_container.

* softmmu/physmem.c
- flatview_translate: Search for the controlling ROMD memory region
  when writing to a read-only section marked for romd_mode.

Tested with SeaBIOS and AMIBIOS.

Signed-off-by: Lev Kujawski 
---
(v4) Revamp to support execution in place for PCI ROMs in WE mode (2)
 using the new romd memory section support in the QEMU MMU.

 The romd memory section support obviates the need for a ROMD
 region or the flushing in v3.
(v3) Relocate ownership of the RAM address space into the respective
 PAM-utilizing chipsets to reduce memory usage and eliminate mtree
 duplicates.
 Avoid changing the PAM region if possible.
 Flush ROM after writing.
(v2) Write to an AddressSpace mapped over ram_memory instead of using
 a pointer, as it surprisingly may not be backed by RAM on, e.g.,
 NUMA configurations.

 accel/kvm/kvm-all.c  |   2 +-
 accel/tcg/cputlb.c   |  14 ++-
 hw/i386/pc.c |  19 +++-
 hw/pci-host/i440fx.c |  34 +++---
 hw/pci-host/pam.c| 193 +--
 hw/pci-host/q35.c|  50 +
 include/exec/memory.h|  19 
 include/hw/pci-host/i440fx.h |   2 +
 include/hw/pci-host/pam.h|  19 +++-
 include/hw/pci-host/q35.h|   2 +
 softmmu/memory.c |  58 ++-
 softmmu/physmem.c|   5 +
 12 files changed, 336 insertions(+), 81 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ba3210b1c1..a8ef0605a1 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1353,7 +1353,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
 void *ram;
 
 if (!memory_region_is_ram(mr)) {
-if (writable || !kvm_readonly_mem_allowed) {
+if (!mr->ram_block || writable || !kvm_readonly_mem_allowed) {
 return;
 } else if (!mr->romd_mode) {
 /* If the memory device is not in romd_mode, then we actually want
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index f90f4312ea..2e8df6a906 100644
--- a/accel/tcg/cputlb.c
+++

[PATCH v4 2/3] tests/data/acpi/q35/SSDT.dimmpxm: Account for new E820 entry

2022-06-29 Thread Lev Kujawski

As detailed in the first patch of this series (8e64de1c), the
previously contiguous RAM aliased for the i440FX and Q35 machines was
broken into conventional [0-0xA0000] and extended regions
[0x100000-...], creating a need for a new E820 entry to account for
the ISA MMIO area gap.  This new entry slightly reduces the available
RAM, thus in the Q35's DSDT, the following change occurred:

< Name (MEMA, 0x07FFF000)
> Name (MEMA, 0x07FFE000)

accompanied by a corresponding change of checksum.  This patch updates
the above-referenced file to forestall the BIOS table test failure.

Signed-off-by: Lev Kujawski 
---
(v4) New patch.

 tests/data/acpi/q35/SSDT.dimmpxm | Bin 734 -> 734 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/tests/data/acpi/q35/SSDT.dimmpxm b/tests/data/acpi/q35/SSDT.dimmpxm
index 
98e6f0e3f3bb02dd419e36bdd1db9b94c728c406..ac55387d57e48adb99eb738a102308688a262fb8
 100644
GIT binary patch
delta 23
fcmcb|dXJSWIM^lR9uortW7tNni%iT9{<8xBTM-An

delta 23
fcmcb|dXJSWIM^lR9uortquWNVi%iTP{<8xBTCoSc

-- 
2.34.1

[PATCH v4 0/3] Full PAM emulation (RE^WE) for i440FX and Q35

2022-06-29 Thread Lev Kujawski

Hello,

This patch series (v4) implements full PAM emulation for the i440FX
and Q35 x86 platforms (see commit log for details.)  Prior versions
did not support executing code from ROMs within the ISA MMIO range
when mode 2 was active (PAM_WE), but this series adds the requisite
support for new romd_mode memory sections to the QEMU MMU.  It appears
to be working well on TCG and KVM, but I would appreciate any feedback
or testing on other accelerators.  For testing purposes, I have
attached a patch to SeaBIOS that removes its special handling for
QEMU, but unaltered versions run fine as well.

Kind regards,
Lev Kujawski

--- a/src/fw/shadow.c
+++ b/src/fw/shadow.c
@@ -28,7 +28,7 @@ union pamdata_u {

 // Enable shadowing and copy bios.
 static void
-__make_bios_writable_intel(u16 bdf, u32 pam0)
+make_bios_writable_intel(u16 bdf, u32 pam0)
 {
 // Read in current PAM settings from pci config space
 union pamdata_u pamdata;
@@ -39,11 +39,11 @@ __make_bios_writable_intel(u16 bdf, u32 pam0)
 // Make ram from 0xc-0xf writable
 int i;
 for (i=0; i<6; i++)
-pam[i + 1] = 0x33;
+pam[i + 1] = 0x22;

 // Make ram from 0xf-0x10 writable
 int ram_present = pam[0] & 0x10;
-pam[0] = 0x30;
+pam[0] = 0x20;

 // Write PAM settings back to pci config space
 pci_ioconfig_writel(bdf, ALIGN_DOWN(pam0, 4), pamdata.data32[0]);
@@ -54,24 +54,17 @@ __make_bios_writable_intel(u16 bdf, u32 pam0)
 memcpy(VSYMBOL(code32flat_start)
, VSYMBOL(code32flat_start) + BIOS_SRC_OFFSET
, SYMBOL(code32flat_end) - SYMBOL(code32flat_start));
-}

-static void
-make_bios_writable_intel(u16 bdf, u32 pam0)
-{
-int reg = pci_ioconfig_readb(bdf, pam0);
-if (!(reg & 0x10)) {
-// QEMU doesn't fully implement the piix shadow capabilities -
-// if ram isn't backing the bios segment when shadowing is
-// disabled, the code itself won't be in memory.  So, run the
-// code from the high-memory flash location.
-u32 pos = (u32)__make_bios_writable_intel + BIOS_SRC_OFFSET;
-void (*func)(u16 bdf, u32 pam0) = (void*)pos;
-func(bdf, pam0);
-return;
-}
-// Ram already present - just enable writes
-__make_bios_writable_intel(bdf, pam0);
+// Make ram from 0xc-0xf writable
+for (i=0; i<6; i++)
+pam[i + 1] = 0x33;
+
+// Make ram from 0xf-0x10 writable
+pam[0] = 0x30;
+
+// Write PAM settings back to pci config space
+pci_ioconfig_writel(bdf, ALIGN_DOWN(pam0, 4), pamdata.data32[0]);
+pci_ioconfig_writel(bdf, ALIGN_DOWN(pam0, 4) + 4, pamdata.data32[1]);
 }

Lev Kujawski (3):
  hw/pci-host/pam.c: Fully support RE^WE semantics of i440FX PAM
  tests/data/acpi/q35/SSDT.dimmpxm: Account for new E820 entry
  tests/qtest/i440fx-test.c: Enable full test of i440FX PAM operation

 accel/kvm/kvm-all.c  |   2 +-
 accel/tcg/cputlb.c   |  14 ++-
 hw/i386/pc.c |  19 ++-
 hw/pci-host/i440fx.c |  34 +++---
 hw/pci-host/pam.c| 193 ++-
 hw/pci-host/q35.c|  50 
 include/exec/memory.h|  19 +++
 include/hw/pci-host/i440fx.h |   2 +
 include/hw/pci-host/pam.h|  19 ++-
 include/hw/pci-host/q35.h|   2 +
 softmmu/memory.c |  58 +-
 softmmu/physmem.c|   5 +
 tests/data/acpi/q35/SSDT.dimmpxm | Bin 734 -> 734 bytes
 tests/qtest/i440fx-test.c|  31 ++---
 14 files changed, 353 insertions(+), 95 deletions(-)

-- 
2.34.1

Re: [PATCH 0/2] AIA draft v0.3.0 support for QEMU RISC-V

2022-06-29 Thread Alistair Francis

On Thu, Jun 16, 2022 at 1:17 PM Anup Patel  wrote:
>
> The latest AIA draft v0.3.0 addresses comments from the architecture
> review committee.
> (Refer, https://github.com/riscv/riscv-aia/releases/tag/0.3.0-draft.31)
>
> There are primarily two changes:
> 1) Removing various [m|s|vs]seteienum, [m|s|vs]clreienum, [m|s|vs]seteipnum,
>and [m|s|vs]clreipnum CSRs because these CSRs were mostly for software
>convenience.
> 2) Simplifying the default priority assignment for local interrupts
>
> These patches can also be found in riscv_aia_update_v1 branch at:
> https://github.com/avpatel/qemu.git
>
> Corresponding changes in OpenSBI and Linux were small and these can be
> found at:
>  riscv_aia_update_v1 branch of https://github.com/avpatel/opensbi.git
>  riscv_aia_v1 branch of https://github.com/avpatel/linux.git
>
> Anup Patel (2):
>   target/riscv: Remove CSRs that set/clear an IMSIC interrupt file bits
>   target/riscv: Update default priority table for local interrupts

Thanks!

Applied to riscv-to-apply.next

Alistair

>
>  target/riscv/cpu_bits.h   |  26 +--
>  target/riscv/cpu_helper.c | 134 +-
>  target/riscv/csr.c| 150 +-
>  3 files changed, 72 insertions(+), 238 deletions(-)
>
> --
> 2.34.1
>
>

Re: [PATCH 2/2] python/qemu/machine: accept QMP connection asynchronously

2022-06-29 Thread John Snow

On Tue, Jun 28, 2022 at 10:17 AM Daniel P. Berrangé  wrote:
>
> On Tue, Jun 28, 2022 at 05:49:39PM +0400, marcandre.lur...@redhat.com wrote:
> > From: Marc-André Lureau 
> >
> > QMP accept is currently synchronous. If qemu dies before the connection
> > is established, it will wait there. Instead turn the code to do
> > concurrently accept() and wait(). Returns when the first task is
> > completed to determine whether a connection was established.
>
> If the spawned QEMU process was given -daemonize, won't this code
> mistakenly think the subprocess has quit ?

Do we use daemonize with this code anywhere? Is it important that we
are able to?

Many of the shutdown routines I wrote expect to work directly with a
launched process ... at least, that expectation exists in my head. I
suppose a lot of this code may actually just coincidentally work with
-daemonize and I wouldn't have noticed. I certainly haven't been
testing it explicitly. I definitely make no accommodations for it, so
I would expect some stale processes in various cases at a minimum.

If we want to expand to accommodate this feature, can we do that
later? Machine needs a bit of a remodel anyway. (I want to write an
'idiomatic' asyncio version to match the QMP lib. I have some
questions to work out WRT which portions of this appliance can be
upstreamed and which need to remain only in our testing tree. We can
talk about those pieces later, just throwing it out there that it's on
my list.)

--js

Re: [PATCH 2/2] target/riscv: Update default priority table for local interrupts

2022-06-29 Thread Alistair Francis

On Thu, Jun 16, 2022 at 1:17 PM Anup Patel  wrote:
>
> The latest AIA draft v0.3.0 defines a relatively simpler scheme for
> default priority assignments where:
> 1) local interrupts 24 to 31 and 48 to 63 are reserved for custom use
>and have implementation specific default priority.
> 2) remaining local interrupts 0 to 23 and 32 to 47 have a recommended
>(not mandatory) priority assignments.
>
> We update the default priority table and hviprio mapping as-per above.
>
> Signed-off-by: Anup Patel 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu_bits.h   |   2 +-
>  target/riscv/cpu_helper.c | 134 ++
>  2 files changed, 66 insertions(+), 70 deletions(-)
>
> diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
> index 01608f86e5..63ba867379 100644
> --- a/target/riscv/cpu_bits.h
> +++ b/target/riscv/cpu_bits.h
> @@ -773,7 +773,7 @@ typedef enum RISCVException {
>  #define IPRIO_IRQ_BITS 8
>  #define IPRIO_MMAXIPRIO 255
>  #define IPRIO_DEFAULT_UPPER 4
> -#define IPRIO_DEFAULT_MIDDLE   (IPRIO_DEFAULT_UPPER + 24)
> +#define IPRIO_DEFAULT_MIDDLE   (IPRIO_DEFAULT_UPPER + 12)
>  #define IPRIO_DEFAULT_M IPRIO_DEFAULT_MIDDLE
>  #define IPRIO_DEFAULT_S (IPRIO_DEFAULT_M + 3)
>  #define IPRIO_DEFAULT_SGEXT (IPRIO_DEFAULT_S + 3)
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index 3c8ebecf84..063a1403db 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -169,17 +169,17 @@ void riscv_cpu_update_mask(CPURISCVState *env)
>   * 14 "
>   * 15 "
>   * 16 "
> - * 18 Debug/trace interrupt
> - * 20 (Reserved interrupt)
> + * 17 "
> + * 18 "
> + * 19 "
> + * 20 "
> + * 21 "
>   * 22 "
> - * 24 "
> - * 26 "
> - * 28 "
> - * 30 (Reserved for standard reporting of bus or system errors)
> + * 23 "
>   */
>
>  static const int hviprio_index2irq[] = {
> -0, 1, 4, 5, 8, 13, 14, 15, 16, 18, 20, 22, 24, 26, 28, 30 };
> +0, 1, 4, 5, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
>  static const int hviprio_index2rdzero[] = {
>  1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
>
> @@ -208,50 +208,60 @@ int riscv_cpu_hviprio_index2irq(int index, int 
> *out_irq, int *out_rdzero)
>   *  Default  |
>   *  Priority | Major Interrupt Numbers
>   * 
> - *  Highest  | 63 (3f), 62 (3e), 31 (1f), 30 (1e), 61 (3d), 60 (3c),
> - *   | 59 (3b), 58 (3a), 29 (1d), 28 (1c), 57 (39), 56 (38),
> - *   | 55 (37), 54 (36), 27 (1b), 26 (1a), 53 (35), 52 (34),
> - *   | 51 (33), 50 (32), 25 (19), 24 (18), 49 (31), 48 (30)
> + *  Highest  | 47, 23, 46, 45, 22, 44,
> + *   | 43, 21, 42, 41, 20, 40
>   *   |
>   *   | 11 (0b),  3 (03),  7 (07)
>   *   |  9 (09),  1 (01),  5 (05)
>   *   | 12 (0c)
>   *   | 10 (0a),  2 (02),  6 (06)
>   *   |
> - *   | 47 (2f), 46 (2e), 23 (17), 22 (16), 45 (2d), 44 (2c),
> - *   | 43 (2b), 42 (2a), 21 (15), 20 (14), 41 (29), 40 (28),
> - *   | 39 (27), 38 (26), 19 (13), 18 (12), 37 (25), 36 (24),
> - *  Lowest   | 35 (23), 34 (22), 17 (11), 16 (10), 33 (21), 32 (20)
> + *   | 39, 19, 38, 37, 18, 36,
> + *  Lowest   | 35, 17, 34, 33, 16, 32
>   * 
>   */
>  static const uint8_t default_iprio[64] = {
> - [63] = IPRIO_DEFAULT_UPPER,
> - [62] = IPRIO_DEFAULT_UPPER + 1,
> - [31] = IPRIO_DEFAULT_UPPER + 2,
> - [30] = IPRIO_DEFAULT_UPPER + 3,
> - [61] = IPRIO_DEFAULT_UPPER + 4,
> - [60] = IPRIO_DEFAULT_UPPER + 5,
> -
> - [59] = IPRIO_DEFAULT_UPPER + 6,
> - [58] = IPRIO_DEFAULT_UPPER + 7,
> - [29] = IPRIO_DEFAULT_UPPER + 8,
> - [28] = IPRIO_DEFAULT_UPPER + 9,
> - [57] = IPRIO_DEFAULT_UPPER + 10,
> - [56] = IPRIO_DEFAULT_UPPER + 11,
> -
> - [55] = IPRIO_DEFAULT_UPPER + 12,
> - [54] = IPRIO_DEFAULT_UPPER + 13,
> - [27] = IPRIO_DEFAULT_UPPER + 14,
> - [26] = IPRIO_DEFAULT_UPPER + 15,
> - [53] = IPRIO_DEFAULT_UPPER + 16,
> - [52] = IPRIO_DEFAULT_UPPER + 17,
> -
> - [51] = IPRIO_DEFAULT_UPPER + 18,
> - [50] = IPRIO_DEFAULT_UPPER + 19,
> - [25] = IPRIO_DEFAULT_UPPER + 20,
> - [24] = IPRIO_DEFAULT_UPPER + 21,
> - [49] = IPRIO_DEFAULT_UPPER + 22,
> - [48] = IPRIO_DEFAULT_UPPER + 23,
> + /* Custom interrupts 48 to 63 */
> + [63] = IPRIO_MMAXIPRIO,
> + [62] = IPRIO_MMAXIPRIO,
> + [61] = IPRIO_MMAXIPRIO,
> + [60] = IPRIO_MMAXIPRIO,
> + [59] = IPRIO_MMAXIPRIO,
> + [58] = IPRIO_MMAXIPRIO,
> + [57] = IPRIO_MMAXIPRIO,
> + [56] = IPRIO_MMAXIPRIO,
> + [55] = IPRIO_MMAXIPRIO,
> + [54] = IPRIO_MMAXIPRIO,
> + [53] = IPRIO_MMAXIPRIO,
> + [52] = IPRIO_MMAXIPRIO,
> + [51] = IPRIO_MMAXIPRIO,
> + [50] = IPRIO_MMAXIPRIO,
> + [49] = IPRIO_MMAXIPRIO,
> + [48] = IPRIO_MMAXIPRIO,
> +
> + /* Custom interrupts 24 to 31 */
> + [31] = IPRIO_MMAXIPRIO,
> + [30] =

Re: [PATCH 1/2] target/riscv: Remove CSRs that set/clear an IMSIC interrupt file bits

2022-06-29 Thread Alistair Francis

On Thu, Jun 16, 2022 at 1:18 PM Anup Patel  wrote:
>
> Based on architecture review committee feedback, the [m|s|vs]seteienum,
> [m|s|vs]clreienum, [m|s|vs]seteipnum, and [m|s|vs]clreipnum CSRs are
> removed in the latest AIA draft v0.3.0 specification.
> (Refer, https://github.com/riscv/riscv-aia/releases/tag/0.3.0-draft.31)
>
> These CSRs were mostly for software convenience and software can always
> use [m|s|vs]iselect and [m|s|vs]ireg CSRs to update the IMSIC interrupt
> file bits.
>
> We update the IMSIC CSR emulation as-per above to match the latest AIA
> draft specification.
>
> Signed-off-by: Anup Patel 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu_bits.h |  24 +--
>  target/riscv/csr.c  | 150 +---
>  2 files changed, 6 insertions(+), 168 deletions(-)
>
> diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
> index 4a55c6a709..01608f86e5 100644
> --- a/target/riscv/cpu_bits.h
> +++ b/target/riscv/cpu_bits.h
> @@ -177,14 +177,8 @@
>  #define CSR_MIREG   0x351
>
>  /* Machine-Level Interrupts (AIA) */
> -#define CSR_MTOPI   0xfb0
> -
> -/* Machine-Level IMSIC Interface (AIA) */
> -#define CSR_MSETEIPNUM  0x358
> -#define CSR_MCLREIPNUM  0x359
> -#define CSR_MSETEIENUM  0x35a
> -#define CSR_MCLREIENUM  0x35b
>  #define CSR_MTOPEI  0x35c
> +#define CSR_MTOPI   0xfb0
>
>  /* Virtual Interrupts for Supervisor Level (AIA) */
>  #define CSR_MVIEN   0x308
> @@ -224,14 +218,8 @@
>  #define CSR_SIREG   0x151
>
>  /* Supervisor-Level Interrupts (AIA) */
> -#define CSR_STOPI   0xdb0
> -
> -/* Supervisor-Level IMSIC Interface (AIA) */
> -#define CSR_SSETEIPNUM  0x158
> -#define CSR_SCLREIPNUM  0x159
> -#define CSR_SSETEIENUM  0x15a
> -#define CSR_SCLREIENUM  0x15b
>  #define CSR_STOPEI  0x15c
> +#define CSR_STOPI   0xdb0
>
>  /* Supervisor-Level High-Half CSRs (AIA) */
>  #define CSR_SIEH0x114
> @@ -282,14 +270,8 @@
>  #define CSR_VSIREG  0x251
>
>  /* VS-Level Interrupts (H-extension with AIA) */
> -#define CSR_VSTOPI  0xeb0
> -
> -/* VS-Level IMSIC Interface (H-extension with AIA) */
> -#define CSR_VSSETEIPNUM 0x258
> -#define CSR_VSCLREIPNUM 0x259
> -#define CSR_VSSETEIENUM 0x25a
> -#define CSR_VSCLREIENUM 0x25b
>  #define CSR_VSTOPEI 0x25c
> +#define CSR_VSTOPI  0xeb0
>
>  /* Hypervisor and VS-Level High-Half CSRs (H-extension with AIA) */
>  #define CSR_HIDELEGH0x613
> diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> index 409a209f14..a4890ebc70 100644
> --- a/target/riscv/csr.c
> +++ b/target/riscv/csr.c
> @@ -1040,14 +1040,6 @@ static int aia_xlate_vs_csrno(CPURISCVState *env, int 
> csrno)
>  return CSR_VSISELECT;
>  case CSR_SIREG:
>  return CSR_VSIREG;
> -case CSR_SSETEIPNUM:
> -return CSR_VSSETEIPNUM;
> -case CSR_SCLREIPNUM:
> -return CSR_VSCLREIPNUM;
> -case CSR_SSETEIENUM:
> -return CSR_VSSETEIENUM;
> -case CSR_SCLREIENUM:
> -return CSR_VSCLREIENUM;
>  case CSR_STOPEI:
>  return CSR_VSTOPEI;
>  default:
> @@ -1202,124 +1194,6 @@ done:
>  return RISCV_EXCP_NONE;
>  }
>
> -static int rmw_xsetclreinum(CPURISCVState *env, int csrno, target_ulong *val,
> -target_ulong new_val, target_ulong wr_mask)
> -{
> -int ret = -EINVAL;
> -bool set, pend, virt;
> -target_ulong priv, isel, vgein, xlen, nval, wmask;
> -
> -/* Translate CSR number for VS-mode */
> -csrno = aia_xlate_vs_csrno(env, csrno);
> -
> -/* Decode register details from CSR number */
> -virt = set = pend = false;
> -switch (csrno) {
> -case CSR_MSETEIPNUM:
> -priv = PRV_M;
> -set = true;
> -pend = true;
> -break;
> -case CSR_MCLREIPNUM:
> -priv = PRV_M;
> -pend = true;
> -break;
> -case CSR_MSETEIENUM:
> -priv = PRV_M;
> -set = true;
> -break;
> -case CSR_MCLREIENUM:
> -priv = PRV_M;
> -break;
> -case CSR_SSETEIPNUM:
> -priv = PRV_S;
> -set = true;
> -pend = true;
> -break;
> -case CSR_SCLREIPNUM:
> -priv = PRV_S;
> -pend = true;
> -break;
> -case CSR_SSETEIENUM:
> -priv = PRV_S;
> -set = true;
> -break;
> -case CSR_SCLREIENUM:
> -priv = PRV_S;
> -break;
> -case CSR_VSSETEIPNUM:
> -priv = PRV_S;
> -virt = true;
> -set = true;
> -pend = true;
> -break;
> -case CSR_VSCLREIPNUM:
> -priv = PRV_S;
> -virt = true;
> -pend = true;
> -break;
> -case CSR_VSSETEIENUM:
> -priv = PRV_S;
> -virt = true;
> -set = true;
> -break;
> -case CSR_VSCLREIENUM:
> -priv = PRV_S;
> -virt = true;
> -break;
> -

Re: [RFC PATCH v3] RISC-V: Add Zawrs ISA extension support

2022-06-29 Thread Alistair Francis

On Mon, Jun 27, 2022 at 6:16 PM Christoph Müllner
 wrote:
>
>
>
> On Mon, Jun 27, 2022 at 7:20 AM Alistair Francis  wrote:
>>
>> On Fri, Jun 24, 2022 at 1:31 AM Christoph Muellner
>>  wrote:
>> >
>> > This patch adds support for the Zawrs ISA extension.
>> > Given the current (incomplete) implementation of reservation sets
>> > there seems to be no way to provide a full emulation of the WRS
>> > instruction (wake on reservation set invalidation or timeout or
>> > interrupt). Therefore, we just pretend that an interrupt occurred,
>> > exit the execution loop and finally continue execution.
>> >
>> > The specification can be found here:
>> > https://github.com/riscv/riscv-zawrs/blob/main/zawrs.adoc
>> >
>> > Note, that the Zawrs extension is not frozen or ratified yet.
>> > Therefore this patch is an RFC and not intended to get merged.
>> >
>> > Changes since v2:
>> > * Adjustments according to a specification change
>> > * Inline REQUIRE_ZAWRS() since it has only one user
>> >
>> > Changes since v1:
>> > * Adding zawrs to the ISA string that is passed to the kernel
>> >
>> > Signed-off-by: Christoph Müllner 
>> > ---
>> >  target/riscv/cpu.c  |  2 +
>> >  target/riscv/cpu.h  |  1 +
>> >  target/riscv/insn32.decode  |  4 ++
>> >  target/riscv/insn_trans/trans_rvzawrs.c.inc | 54 +
>> >  target/riscv/translate.c|  1 +
>> >  5 files changed, 62 insertions(+)
>> >  create mode 100644 target/riscv/insn_trans/trans_rvzawrs.c.inc
>> >
>> > diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
>> > index 05e6521351..6cb00fadff 100644
>> > --- a/target/riscv/cpu.c
>> > +++ b/target/riscv/cpu.c
>> > @@ -882,6 +882,7 @@ static Property riscv_cpu_extensions[] = {
>> >  DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters, true),
>> >  DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
>> >  DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
>> > +DEFINE_PROP_BOOL("zawrs", RISCVCPU, cfg.ext_zawrs, true),
>>
>> Would this be enabled by default?
>
>
> The "true" was a personal preference (I prefer to keep the argument list for 
> QEMU short)
> and I did not see any conflicts with existing behavior (no code should break).
> If you prefer otherwise or if I missed a policy I will change it.
>
>>
>>
>> >  DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
>> >  DEFINE_PROP_BOOL("Zfhmin", RISCVCPU, cfg.ext_zfhmin, false),
>> >  DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
>> > @@ -1075,6 +1076,7 @@ static void riscv_isa_string_ext(RISCVCPU *cpu, char 
>> > **isa_str, int max_str_len)
>> >  ISA_EDATA_ENTRY(zicsr, ext_icsr),
>> >  ISA_EDATA_ENTRY(zifencei, ext_ifencei),
>> >  ISA_EDATA_ENTRY(zmmul, ext_zmmul),
>> > +ISA_EDATA_ENTRY(zawrs, ext_zawrs),
>> >  ISA_EDATA_ENTRY(zfh, ext_zfh),
>> >  ISA_EDATA_ENTRY(zfhmin, ext_zfhmin),
>> >  ISA_EDATA_ENTRY(zfinx, ext_zfinx),
>> > diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
>> > index 7d6397acdf..a22bc0fa09 100644
>> > --- a/target/riscv/cpu.h
>> > +++ b/target/riscv/cpu.h
>> > @@ -380,6 +380,7 @@ struct RISCVCPUConfig {
>> >  bool ext_h;
>> >  bool ext_j;
>> >  bool ext_v;
>> > +bool ext_zawrs;
>> >  bool ext_zba;
>> >  bool ext_zbb;
>> >  bool ext_zbc;
>> > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
>> > index 4033565393..513ea227fe 100644
>> > --- a/target/riscv/insn32.decode
>> > +++ b/target/riscv/insn32.decode
>> > @@ -711,6 +711,10 @@ vsetvli 0 ... . 111 . 1010111 
>> >  @r2_zimm11
>> >  vsetivli11 .. . 111 . 1010111  @r2_zimm10
>> >  vsetvl  100 . . 111 . 1010111  @r
>> >
>> > +# *** Zawrs Standard Extension ***
>> > +wrs_nto1101 0 000 0 1110011
>> > +wrs_sto00011101 0 000 0 1110011
>> > +
>> >  # *** RV32 Zba Standard Extension ***
>> >  sh1add 001 .. 010 . 0110011 @r
>> >  sh2add 001 .. 100 . 0110011 @r
>> > diff --git a/target/riscv/insn_trans/trans_rvzawrs.c.inc 
>> > b/target/riscv/insn_trans/trans_rvzawrs.c.inc
>> > new file mode 100644
>> > index 00..d0df56378e
>> > --- /dev/null
>> > +++ b/target/riscv/insn_trans/trans_rvzawrs.c.inc
>> > @@ -0,0 +1,54 @@
>> > +/*
>> > + * RISC-V translation routines for the RISC-V Zawrs Extension.
>> > + *
>> > + * Copyright (c) 2022 Christoph Muellner, christoph.muell...@vrull.io
>> > + *
>> > + * This program is free software; you can redistribute it and/or modify it
>> > + * under the terms and conditions of the GNU General Public License,
>> > + * version 2 or later, as published by the Free Software Foundation.
>> > + *
>> > + * This program is distributed in the hope it will be useful, but WITHOUT
>> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> > + * FITNESS

[PATCH 2/2] target/riscv: Ibex: Support priv version 1.11

2022-06-29 Thread Alistair Francis

From: Alistair Francis 

The Ibex CPU supports version 1.11 of the priv spec [1], so let's
correct that in QEMU as well.

1: https://ibex-core.readthedocs.io/en/latest/01_overview/compliance.html

Signed-off-by: Alistair Francis 
---
 target/riscv/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 05e6521351..178b4de51f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -237,7 +237,7 @@ static void rv32_ibex_cpu_init(Object *obj)
 RISCVCPU *cpu = RISCV_CPU(obj);
 
 set_misa(env, MXL_RV32, RVI | RVM | RVC | RVU);
-set_priv_version(env, PRIV_VERSION_1_10_0);
+set_priv_version(env, PRIV_VERSION_1_11_0);
 cpu->cfg.mmu = false;
 cpu->cfg.epmp = true;
 }
-- 
2.36.1

[PATCH 1/2] target/riscv: Fixup MSECCFG minimum priv check

2022-06-29 Thread Alistair Francis

From: Alistair Francis 

There is nothing in the RISC-V spec that mandates version 1.12 is
required for ePMP and there is currently hardware [1] that implements
ePMP (a draft version though) with the 1.11 priv spec.

1: https://ibex-core.readthedocs.io/en/latest/01_overview/compliance.html

Fixes: a4b2fa433125af0305b0695d7f8dda61db3364b0 target/riscv: Introduce 
privilege version field in the CSR ops.
Signed-off-by: Alistair Francis 
---
 target/riscv/csr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 6dbe9b541f..6379bef5a5 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -3561,7 +3561,7 @@ riscv_csr_operations csr_ops[CSR_TABLE_SIZE] = {
 
 /* Physical Memory Protection */
 [CSR_MSECCFG]= { "mseccfg",  epmp, read_mseccfg, write_mseccfg,
- .min_priv_ver = PRIV_VERSION_1_12_0 },
+ .min_priv_ver = PRIV_VERSION_1_11_0 },
 [CSR_PMPCFG0]= { "pmpcfg0",   pmp, read_pmpcfg,  write_pmpcfg  },
 [CSR_PMPCFG1]= { "pmpcfg1",   pmp, read_pmpcfg,  write_pmpcfg  },
 [CSR_PMPCFG2]= { "pmpcfg2",   pmp, read_pmpcfg,  write_pmpcfg  },
-- 
2.36.1

[PATCH 0/2] target/riscv: Fixes for Ibex and OpenTitan

2022-06-29 Thread Alistair Francis

From: Alistair Francis 

This fixes some issues discovered on the Ibex SoC when running OpenTitan tests.

Alistair Francis (2):
  target/riscv: Fixup MSECCFG minimum priv check
  target/riscv: Ibex: Support priv version 1.11

 target/riscv/cpu.c | 2 +-
 target/riscv/csr.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

-- 
2.36.1

Re: [PULL v2 0/9] Block jobs & NBD patches

2022-06-29 Thread Richard Henderson


On 6/29/22 13:45, Vladimir Sementsov-Ogievskiy wrote:

The following changes since commit ad4c7f529a279685da84297773b4ec8080153c2d:

   Merge tag 'pull-semi-20220628' of https://gitlab.com/rth7680/qemu into 
staging (2022-06-28 10:24:31 +0530)

are available in the Git repository at:

   https://gitlab.com/vsementsov/qemu.git tags/pull-block-2022-06-14-v2

for you to fetch changes up to 1b8f777673985af366de099ad4e41d334b36fb12:

   block: use 'unsigned' for in_flight field on driver state (2022-06-29 
10:57:02 +0300)


Block jobs & NBD patches

v2: - add arguments to QEMUMachine constructor in test, to make it work
   on arm in gitlab pipeline
 - use bdrv_inc_in_flight() / bdrv_dec_in_flight() instead of direct
   manipulation with bs->in_flight


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/7.1 as 
appropriate.


r~




- add new options for copy-before-write filter
- new trace points for NBD
- prefer unsigned type for some 'in_flight' fields

Denis V. Lunev (2):
   nbd: trace long NBD operations
   block: use 'unsigned' for in_flight field on driver state

Vladimir Sementsov-Ogievskiy (7):
   block/copy-before-write: refactor option parsing
   block/copy-before-write: add on-cbw-error open parameter
   iotests: add copy-before-write: on-cbw-error tests
   util: add qemu-co-timeout
   block/block-copy: block_copy(): add timeout_ns parameter
   block/copy-before-write: implement cbw-timeout option
   iotests: copy-before-write: add cases for cbw-timeout option

  block/block-copy.c|  33 ++-
  block/copy-before-write.c | 110 ++---
  block/mirror.c|   2 +-
  block/nbd.c   |   8 +-
  block/trace-events|   2 +
  include/block/block-copy.h|   4 +-
  include/qemu/coroutine.h  |  13 ++
  nbd/client-connection.c   |   2 +
  nbd/trace-events  |   3 +
  qapi/block-core.json  |  31 ++-
  tests/qemu-iotests/pylintrc   |   5 +
  tests/qemu-iotests/tests/copy-before-write| 216 ++
  .../qemu-iotests/tests/copy-before-write.out  |   5 +
  util/meson.build  |   1 +
  util/qemu-co-timeout.c|  89 
  15 files changed, 482 insertions(+), 42 deletions(-)
  create mode 100755 tests/qemu-iotests/tests/copy-before-write
  create mode 100644 tests/qemu-iotests/tests/copy-before-write.out
  create mode 100644 util/qemu-co-timeout.c

Re: [PATCH V8 30/39] vfio-pci: recover from unmap-all-vaddr failure

2022-06-29 Thread Alex Williamson

On Wed, 15 Jun 2022 07:52:17 -0700
Steve Sistare  wrote:

> If vfio_cpr_save fails to unmap all vaddr's, then recover by walking all
> flat sections to restore the vaddr for each.  Do so by invoking the
> vfio listener callback, and passing a new "replay" flag that tells it
> to replay a mapping without re-allocating new userland data structures.

Is this comment accurate?  I thought we had unwind in the kernel for
vaddr invalidation, and the notifier here is hooked up to any fault, so
it's at least misleading regarding vaddr.  The replay option really
needs some documentation in comments.

> Signed-off-by: Steve Sistare 
> ---
>  hw/vfio/common.c  | 66 
> ---
>  hw/vfio/cpr.c | 29 +++
>  include/hw/vfio/vfio-common.h |  2 +-
>  3 files changed, 80 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index c7d73b6..5f2bd50 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -895,15 +895,35 @@ static bool 
> vfio_known_safe_misalignment(MemoryRegionSection *section)
>  return true;
>  }
>  
> +static VFIORamDiscardListener *vfio_find_ram_discard_listener(
> +VFIOContainer *container, MemoryRegionSection *section)
> +{
> +VFIORamDiscardListener *vrdl = NULL;

This initialization was copied from current code, but...

#define QLIST_FOREACH(var, head, field) \
for ((var) = ((head)->lh_first);\
   ...

it doesn't look necessary.  Thanks,

Alex

> +
> +QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
> +if (vrdl->mr == section->mr &&
> +vrdl->offset_within_address_space ==
> +section->offset_within_address_space) {
> +break;
> +}
> +}
> +
> +if (!vrdl) {
> +hw_error("vfio: Trying to sync missing RAM discard listener");
> +/* does not return */
> +}
> +return vrdl;
> +}
> +
>  static void vfio_listener_region_add(MemoryListener *listener,
>   MemoryRegionSection *section)
>  {
>  VFIOContainer *container = container_of(listener, VFIOContainer, 
> listener);
> -vfio_container_region_add(container, section);
> +vfio_container_region_add(container, section, false);
>  }
>  
>  void vfio_container_region_add(VFIOContainer *container,
> -   MemoryRegionSection *section)
> +   MemoryRegionSection *section, bool replay)
>  {
>  hwaddr iova, end;
>  Int128 llend, llsize;
> @@ -1033,6 +1053,23 @@ void vfio_container_region_add(VFIOContainer 
> *container,
>  int iommu_idx;
>  
>  trace_vfio_listener_region_add_iommu(iova, end);
> +
> +if (replay) {
> +hwaddr as_offset = section->offset_within_address_space;
> +hwaddr iommu_offset = as_offset - section->offset_within_region;
> +
> +QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> +if (giommu->iommu_mr == iommu_mr &&
> +giommu->iommu_offset == iommu_offset) {
> +memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
> +return;
> +}
> +}
> +error_report("Container cannot find iommu region %s offset %lx",
> +memory_region_name(section->mr), iommu_offset);
> +goto fail;
> +}
> +
>  /*
>   * FIXME: For VFIO iommu types which have KVM acceleration to
>   * avoid bouncing all map/unmaps through qemu this way, this
> @@ -1083,7 +1120,15 @@ void vfio_container_region_add(VFIOContainer 
> *container,
>   * about changes.
>   */
>  if (memory_region_has_ram_discard_manager(section->mr)) {
> -vfio_register_ram_discard_listener(container, section);
> +if (replay)  {
> +VFIORamDiscardListener *vrdl =
> +vfio_find_ram_discard_listener(container, section);
> +if (vfio_ram_discard_notify_populate(&vrdl->listener, section)) {
> +error_report("ram_discard_manager_replay_populated failed");
> +}
> +} else {
> +vfio_register_ram_discard_listener(container, section);
> +}
>  return;
>  }
>  
> @@ -1417,19 +1462,8 @@ static int 
> vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
> MemoryRegionSection 
> *section)
>  {
>  RamDiscardManager *rdm = 
> memory_region_get_ram_discard_manager(section->mr);
> -VFIORamDiscardListener *vrdl = NULL;
> -
> -QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
> -if (vrdl->mr == section->mr &&
> -vrdl->offset_within_address_space ==
> -section->offset_within_address_space) {
> -break;
> -}
> -}
> -
> -if (!vrdl) {
> -hw_error("vfio: Trying to sync missing

Re: [PATCH] target/ppc: Add error reporting when opening file fails

2022-06-29 Thread Daniel Henrique Barboza





On 6/29/22 02:56, Markus Armbruster wrote:

jianchunfu  writes:


Add error reporting before return when opening file fails.

Signed-off-by: jianchunfu 
---
  target/ppc/kvm.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index dc93b99189..ef9a871411 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -1798,6 +1798,7 @@ static int read_cpuinfo(const char *field, char *value, 
int len)
  

static int read_cpuinfo(const char *field, char *value, int len)
{
FILE *f;
int ret = -1;
int field_len = strlen(field);
char line[512];


  f = fopen("/proc/cpuinfo", "r");
  if (!f) {
+fprintf(stderr, "Error opening /proc/cpuinfo: %s\n", strerror(errno));
  return -1;
  }


do {
if (!fgets(line, sizeof(line), f)) {
break;
}
if (!strncmp(line, field, field_len)) {
pstrcpy(value, len, line);
ret = 0;
break;
}
} while (*line);

fclose(f);

return ret;
}

This function now reports an error on one out of two failures.  The
caller can't tell whether it reported or not.

Please use error_report() for errors, warn_report() for warnings, and
info_report() for informational messages.

But is it an error?  Here's the only caller:

 static uint32_t kvmppc_get_tbfreq_procfs(void)
 {
 char line[512];
 char *ns;
 uint32_t tbfreq_fallback = NANOSECONDS_PER_SECOND;
 uint32_t tbfreq_procfs;

 if (read_cpuinfo("timebase", line, sizeof(line))) {
--->return tbfreq_fallback;
 }

 ns = strchr(line, ':');
 if (!ns) {
--->return tbfreq_fallback;
 }

 tbfreq_procfs = atoi(++ns);

 /* 0 is certainly not acceptable by the guest, return fallback value */
--->return tbfreq_procfs ? tbfreq_procfs : tbfreq_fallback;
 }

I marked the three spots that handle errors.  All quietly return
NANOSECONDS_PER_SECOND.  The caller can't tell whether that happened.

Reporting an error when we don't actually fail is confusing.  Better
would be something like "Can't open /proc/cpuinfo, assuming timebase X",
where X is the value you assume.

Reporting this only in one out of several cases where we assume feels
wrong.  If it's worth reporting in one case, why isn't it worth
reporting in the other cases?  Is it worth reporting?

Aside: the use of atoi() silently maps a timebase of 0 to
NANOSECONDS_PER_SECOND.  Not fond of this function.  Not your patch's
problem, of course.

  
@@ -1906,6 +1907,7 @@ static uint64_t kvmppc_read_int_dt(const char *filename)
  
  f = fopen(filename, "rb");

  if (!f) {
+fprintf(stderr, "Error opening %s: %s\n", filename, strerror(errno));
  return -1;
  }


Preexisting: this function returns -1 when fopen() fails, 0 when fread()
fails or read less data than expected.  Its caller
kvmppc_read_int_cpu_dt() passes on the return value.  However, it is
documented to return "0 if anything goes wrong".  Bug.  Not your patch's
fault, but it needs fixing.


It does need fixing, but it'll require some work because this code is
called in a lot of places. I'll handle this part personally because it's less
trivial than the error you reported above with read_cpuinfo() and
kvmppc_get_tbfreq_procfs().


I'll apply the relevant bits of jianchunfu's patch in the cleanup as well.


Thanks,


Daniel



Similar issue as above: you make the function emit an error message on
some, but not all failures.  If it's worth reporting in one case, why
isn't it worth reporting in the other cases?  Is it worth reporting?

Re: Slowness with multi-thread TCG?

2022-06-29 Thread Cédric Le Goater


On 6/29/22 19:13, Alex Bennée wrote:


"Matheus K. Ferst"  writes:


On 29/06/2022 12:36, Frederic Barrat wrote:

[E-MAIL EXTERNO] Não clique em links ou abra anexos, a menos que
você possa confirmar o remetente e saber que o conteúdo é seguro. Em
caso de e-mail suspeito entre imediatamente em contato com o DTI.
On 29/06/2022 00:17, Alex Bennée wrote:

If you run the sync-profiler (via the HMP "sync-profile on") you can
then get a breakdown of which mutexes are being held and for how long
("info sync-profile").

Alex, a huge thank you!
For the record, the "info sync-profile" showed:
Type       Object          Call site                     Wait Time (s)     Count  Average (us)
----------------------------------------------------------------------------------------------
BQL mutex  0x55eb89425540  accel/tcg/cpu-exec.c:744           96.31578  73589937          1.31
BQL mutex  0x55eb89425540  target/ppc/helper_regs.c:207        0.00150      1178          1.27
And it points to a lock in the interrupt delivery path, in
cpu_handle_interrupt().
I now understand the root cause. The interrupt signal for the
decrementer interrupt remains set because the interrupt is not being
delivered, per the config. I'm not quite sure what the proper fix is yet
(there seem to be several implementations of the decrementer on ppc),
but at least I understand why we are so slow.



To summarize what we discussed elsewhere:
1 - The threads that are not decompressing the kernel have a pending
PPC_INTERRUPT_DECR, and cs->interrupt_request is CPU_INTERRUPT_HARD;


I think ppc_set_irq should be doing some gating before calling to set
cs->interrupt_request.


2 - cpu_handle_interrupt calls ppc_cpu_exec_interrupt, that calls
ppc_hw_interrupt to handle the interrupt;
3 - ppc_cpu_exec_interrupt decides that the interrupt cannot be
delivered immediately, so the corresponding bit in
env->pending_interrupts is not reset;


Is the logic controlled by ppc_hw_interrupt()? The stuff around
async_deliver?

I think maybe some of the logic needs to be factored out and checked
above. Also anywhere where env->msr is updated would need to check if
we've just enabled a load of pending interrupts and then call
ppc_set_irq.

However I'm not super familiar with the PPC code so I'll defer to the
maintainers here ;-)



That part is a nightmare with a lot of history. It needs a rewrite.
We have a good testing environment and we should catch regressions.
Not for 7.1 though.






4 - ppc_cpu_exec_interrupt does not change cs->interrupt_request
because pending_interrupts != 0, so cpu_handle_interrupt will be
called again.

This loop will acquire and release qemu_mutex_lock_iothread, slowing
down other threads that need this lock.


With a quick hack, I could verify that by moving that signal out of the
way, the decompression time of the kernel is now peanuts, no matter the
number of cpus. Even with one cpu, the 15 seconds measured before was
already a huge waste, so it was not really a multiple-cpus problem.
Multiple cpus were just highlighting it.
Thanks again!
    Fred

Re: [PATCH V8 29/39] vfio-pci: cpr part 3 (intx)

2022-06-29 Thread Alex Williamson

On Wed, 15 Jun 2022 07:52:16 -0700
Steve Sistare  wrote:

> Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
> follows:
>   pin : Recover this from the vfio config in kernel space
>   interrupt : Preserve its eventfd descriptor across exec.
>   unmask : Ditto
>   route.irq : This could perhaps be recovered in vfio_pci_post_load by
> calling pci_device_route_intx_to_irq(pin), whose implementation reads
> config space for a bridge device such as ich9.  However, there is no
> guarantee that the bridge vmstate is read before vfio vmstate.  Rather
> than fiddling with MigrationPriority for vmstate handlers, explicitly
> save route.irq in vfio vmstate.
>   pending : save in vfio vmstate.
>   mmap_timeout, mmap_timer : Re-initialize
>   bool kvm_accel : Re-initialize
> 
> In vfio_realize, defer calling vfio_intx_enable until the vmstate
> is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
> vfio_intx_kvm_enable to skip vfio initialization, but still perform
> kvm initialization.
> 
> Signed-off-by: Steve Sistare 
> ---
>  hw/vfio/pci.c | 92 
> +--
>  1 file changed, 83 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 2fd7121..b8aee91 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -173,14 +173,45 @@ static void vfio_intx_eoi(VFIODevice *vbasedev)
>  vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
>  }
>  
> +#ifdef CONFIG_KVM
> +static bool vfio_no_kvm_intx(VFIOPCIDevice *vdev)
> +{
> +return vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
> +   vdev->intx.route.mode != PCI_INTX_ENABLED ||
> +   !kvm_resamplefds_enabled();
> +}
> +#endif
> +
> +static void vfio_intx_reenable_kvm(VFIOPCIDevice *vdev, Error **errp)
> +{
> +#ifdef CONFIG_KVM
> +if (vfio_no_kvm_intx(vdev)) {
> +return;
> +}
> +
> +if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) {
> +error_setg(errp, "vfio_notifier_init intx-unmask failed");
> +return;
> +}
> +
> +if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
> +   &vdev->intx.interrupt,
> +   &vdev->intx.unmask,
> +   vdev->intx.route.irq)) {
> +error_setg_errno(errp, errno, "failed to setup resample irqfd");


Does not unwind with vfio_notifier_cleanup().  This also exactly
duplicates code in vfio_intx_enable_kvm(), which suggests it needs
further refactoring to a common helper.



> +return;
> +}
> +
> +vdev->intx.kvm_accel = true;
> +#endif
> +}
> +
>  static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
>  {
>  #ifdef CONFIG_KVM
>  int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
>  
> -if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
> -vdev->intx.route.mode != PCI_INTX_ENABLED ||
> -!kvm_resamplefds_enabled()) {
> +if (vfio_no_kvm_intx(vdev)) {
>  return;
>  }
>  
> @@ -328,7 +359,13 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
> **errp)
>  return 0;
>  }
>  
> -vfio_disable_interrupts(vdev);
> +/*
> + * Do not alter interrupt state during vfio_realize and cpr-load.  The
> + * reused flag is cleared thereafter.
> + */
> +if (!vdev->vbasedev.reused) {
> +vfio_disable_interrupts(vdev);
> +}
>  
>  vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>  pci_config_set_interrupt_pin(vdev->pdev.config, pin);
> @@ -353,6 +390,11 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
> **errp)
>  fd = event_notifier_get_fd(&vdev->intx.interrupt);
>  qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
>  
> +if (vdev->vbasedev.reused) {
> +vfio_intx_reenable_kvm(vdev, &err);
> +goto finish;
> +}
> +

This only jumps over the vfio_set_irq_signaling() and
vfio_intx_enable_kvm(), largely replacing the latter with chunks of
code taken from it.  Doesn't seem like the right factoring.

>  if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
> VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
>  qemu_set_fd_handler(fd, NULL, NULL, vdev);
> @@ -365,6 +407,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
> **errp)
>  warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>  }
>  
> +finish:
>  vdev->interrupt = VFIO_INT_INTx;
>  
>  trace_vfio_intx_enable(vdev->vbasedev.name);
> @@ -3195,9 +3238,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>   vfio_intx_routing_notifier);
>  vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
>  kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
> -ret = vfio_intx_enable(vdev, errp);
> -if (ret) {
> -goto out_deregister;
> +
> +

Re: [PATCH V8 28/39] vfio-pci: cpr part 2 (msi)

2022-06-29 Thread Alex Williamson

On Wed, 15 Jun 2022 07:52:15 -0700
Steve Sistare  wrote:

> Finish cpr for vfio-pci MSI/MSI-X devices by preserving eventfd's and
> vector state.
> 
> Signed-off-by: Steve Sistare 
> ---
>  hw/vfio/pci.c | 122 
> +-
>  1 file changed, 121 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 237231b..2fd7121 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -53,17 +53,53 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>  static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
>  
> +#define EVENT_FD_NAME(vdev, name)   \
> +g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
> +
> +static int save_event_fd(VFIOPCIDevice *vdev, const char *name, int nr,
> + EventNotifier *ev)
> +{
> +int fd = event_notifier_get_fd(ev);
> +
> +if (fd >= 0) {
> +Error *err;
> +g_autofree char *fdname = EVENT_FD_NAME(vdev, name);
> +
> +if (cpr_resave_fd(fdname, nr, fd, &err)) {
> +error_report_err(err);
> +return 1;


Preferably -1, but the caller doesn't actually test the return value
anyway :-\


> +}
> +}
> +return 0;
> +}
> +
> +static int load_event_fd(VFIOPCIDevice *vdev, const char *name, int nr)
> +{
> +g_autofree char *fdname = EVENT_FD_NAME(vdev, name);
> +int fd = cpr_find_fd(fdname, nr);
> +return fd;


return cpr_find_fd(EVENT_FD_NAME(vdev, name), nr);


> +}
> +
> +static void delete_event_fd(VFIOPCIDevice *vdev, const char *name, int nr)
> +{
> +g_autofree char *fdname = EVENT_FD_NAME(vdev, name);
> +cpr_delete_fd(fdname, nr);


cpr_delete_fd(EVENT_FD_NAME(vdev, name), nr);


> +}
> +
>  /* Create new or reuse existing eventfd */
>  static int vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
>const char *name, int nr)
>  {
> -int fd = -1;   /* placeholder until a subsequent patch */
>  int ret = 0;
> +int fd = load_event_fd(vdev, name, nr);
>  
>  if (fd >= 0) {
>  event_notifier_init_fd(e, fd);
>  } else {
>  ret = event_notifier_init(e, 0);
> +if (!ret) {
> +save_event_fd(vdev, name, nr, e);


Return value not tested.  The function generates an error report if it
fails, but it doesn't seem that actually blocks a cpr attempt.  Do we
just wind up with that error report as a breadcrumb to why cpr breaks
with a missing fd down the road?


> +}
>  }
>  return ret;
>  }
> @@ -71,6 +107,7 @@ static int vfio_notifier_init(VFIOPCIDevice *vdev, 
> EventNotifier *e,
>  static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
>const char *name, int nr)
>  {
> +delete_event_fd(vdev, name, nr);
>  event_notifier_cleanup(e);
>  }
>  
> @@ -511,6 +548,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
> unsigned int nr,
>  VFIOMSIVector *vector;
>  int ret;
>  
> +/*
> + * Ignore the callback from msix_set_vector_notifiers during resume.
> + * The necessary subset of these actions is called from 
> vfio_claim_vectors
> + * during post load.
> + */
> +if (vdev->vbasedev.reused) {
> +return 0;
> +}
> +
>  trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
>  
>  vector = &vdev->msi_vectors[nr];
> @@ -2784,6 +2830,11 @@ static void vfio_register_err_notifier(VFIOPCIDevice 
> *vdev)
>  fd = event_notifier_get_fd(&vdev->err_notifier);
>  qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
>  
> +/* Do not alter irq_signaling during vfio_realize for cpr */
> +if (vdev->vbasedev.reused) {
> +return;
> +}
> +
>  if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
> VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>  error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> @@ -2849,6 +2900,12 @@ static void vfio_register_req_notifier(VFIOPCIDevice 
> *vdev)
>  fd = event_notifier_get_fd(&vdev->req_notifier);
>  qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
>  
> +/* Do not alter irq_signaling during vfio_realize for cpr */
> +if (vdev->vbasedev.reused) {
> +vdev->req_enabled = true;
> +return;
> +}


vfio_notifier_init() transparently gets the old fd or creates a new
one, how do we know which has occurred to know that this eventfd is
already configured?

Don't we also have the same issue relative to vdev->pci_aer for the
error handler?

> +
>  if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
> VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>  error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> @@ -3357,6 +3414,43 @@ static Property vfio_pci_dev_properties[] = {
>  DEFINE_PROP_END_OF_LIST(),
>  };
>  
>

Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates

2022-06-29 Thread Klaus Jensen

On Jun 27 18:48, Jinhao Fan wrote:
> Add property "ioeventfd" which is enabled by default. When this is
> enabled, updates on the doorbell registers will cause KVM to signal
> an event to the QEMU main loop to handle the doorbell updates.
> Therefore, instead of letting the vcpu thread run both guest VM and
> IO emulation, we now use the main loop thread to do IO emulation and
> thus the vcpu thread has more cycles for the guest VM.
> 
> Since ioeventfd does not tell us the exact value that is written, it is
> only useful when shadow doorbell buffer is enabled, where we check
> for the value in the shadow doorbell buffer when we get the doorbell
> update event.
> 
> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
> 
> qd   1   4  16  64
> qemu35 121 176 153
> ioeventfd   41 133 258 313
> 
> Signed-off-by: Jinhao Fan 
> ---
>  hw/nvme/ctrl.c | 97 +-
>  hw/nvme/nvme.h |  5 +++
>  2 files changed, 101 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index c952c34f94..787b89f7d3 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue 
> *cq, NvmeRequest *req)
>  
>  QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>  QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +
> +if (req->sq->ioeventfd_enabled) {
> +/* Post CQE directly since we are in main loop thread */
> +nvme_post_cqes(cq);
> +} else {
> +/* Schedule the timer to post CQE later since we are in vcpu thread 
> */
> +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +}
>  }
>  
>  static void nvme_process_aers(void *opaque)
> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
> *req)
>  return NVME_INVALID_OPCODE | NVME_DNR;
>  }
>  
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> +NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
> +NvmeCtrl *n = cq->ctrl;
> +
> +event_notifier_test_and_clear(&cq->notifier);
> +
> +nvme_update_cq_head(cq);
> +
> +if (cq->tail == cq->head) {
> +if (cq->irq_enabled) {
> +n->cq_pending--;
> +}
> +
> +nvme_irq_deassert(n, cq);
> +}
> +
> +nvme_post_cqes(cq);
> +}
> +
> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> +{
> +NvmeCtrl *n = cq->ctrl;
> +uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +int ret;
> +
> +if ((ret = event_notifier_init(&cq->notifier, 0))) {
> +return ret;
> +}

Don't assign in conditionals and rely on the implicit value. It's too
error prone. Split into

  ret = event_notifier_init(&cq->notifier, 0);
  if (ret < 0) {
return ret;
  }

> +
> +event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +memory_region_add_eventfd(&n->iomem,
> +  0x1000 + offset, 4, false, 0, &cq->notifier);
> +
> +return 0;
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> +NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
> +
> +event_notifier_test_and_clear(&sq->notifier);
> +
> +nvme_process_sq(sq);
> +}
> +
> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> +{
> +NvmeCtrl *n = sq->ctrl;
> +uint16_t offset = sq->sqid << 3;
> +int ret;
> +
> +if ((ret = event_notifier_init(&sq->notifier, 0))) {
> +return ret;
> +}

Same as above.

> +
> +event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +memory_region_add_eventfd(&n->iomem,
> +  0x1000 + offset, 4, false, 0, &sq->notifier);
> +
> +return 0;
> +}
> +
>  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>  {
>  n->sq[sq->sqid] = NULL;
>  timer_free(sq->timer);
> +event_notifier_cleanup(&sq->notifier);
>  g_free(sq->io_req);
>  if (sq->sqid) {
>  g_free(sq);
> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
> uint64_t dma_addr,
>   uint16_t sqid, uint16_t cqid, uint16_t size)
>  {
>  int i;
> +int ret;
>  NvmeCQueue *cq;
>  
>  sq->ctrl = n;
> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
> uint64_t dma_addr,
>  if (n->dbbuf_enabled) {
>  sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>  sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +
> +if (n->params.ioeventfd && sq->sqid != 0) {
> +ret = nvme_init_sq_ioeventfd(sq);
> +sq->ioeventfd_enabled = ret == 0;
> +}

Not using ret for anything really, so

  if (!nvme_init_sq_ioeventfd(sq)) {
sq->ioeventfd_enabled = true;
  }

should do.

>  }
>  
>  assert(n->cq[cqid]);
> @@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>  {
>  n->cq[cq->cqid] = NULL;
>  timer_free(cq->timer);
> +event_notifier_cleanup(&cq->notifier);
>

Re: [PATCH v2] target/i386: Add unaccepted memory configuration

2022-06-29 Thread Gupta, Pankaj





For SEV-SNP, an OS is "SEV-SNP capable" without supporting this UEFI
v2.9 memory type. In order for OVMF to be able to avoid pre-validating
potentially hundreds of gibibytes of data before booting, it needs to
know if the guest OS can support its use of the new type of memory in
the memory map.

Cc: Xu, Min M 
Cc: Xiaoyao Li 
Cc: Thomas Lendacky 
Cc: Gerd Hoffman 
Signed-off-by: Dionna Glaze 
---


Wondering what changed in v2. Did I miss change log?


  hw/i386/fw_cfg.c  |  6 ++
  target/i386/sev.c | 49 +++
  target/i386/sev.h |  2 ++
  3 files changed, 57 insertions(+)

diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index a283785a8d..9c069ddebe 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -23,6 +23,7 @@
  #include "e820_memory_layout.h"
  #include "kvm/kvm_i386.h"
  #include "qapi/error.h"
+#include "target/i386/sev.h"
  #include CONFIG_DEVICES
  
  struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};

@@ -131,6 +132,11 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms,
   &e820_reserve, sizeof(e820_reserve));
  fw_cfg_add_file(fw_cfg, "etc/e820", e820_table,
  sizeof(struct e820_entry) * e820_get_num_entries());
+if (sev_has_accept_all_memory(ms->cgs)) {
+bool accept_all = sev_accept_all_memory(ms->cgs);
+fw_cfg_add_file(fw_cfg, "opt/ovmf/AcceptAllMemory",
+&accept_all, sizeof(accept_all));
+}
  
  fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg));

  /* allocate memory for the NUMA channel: one (64bit) word for the number
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 32f7dbac4e..01399a304c 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -64,6 +64,7 @@ struct SevGuestState {
  uint32_t cbitpos;
  uint32_t reduced_phys_bits;
  bool kernel_hashes;
+int accept_all_memory;
  
  /* runtime state */

  uint32_t handle;
@@ -155,6 +156,15 @@ static const char *const sev_fw_errlist[] = {
  [SEV_RET_SECURE_DATA_INVALID]= "Part-specific integrity check 
failure",
  };
  
+static QEnumLookup memory_acceptance_lookup = {

+.array = (const char *const[]) {
+"default",
+"true",
+"false",
+},
+.size = 3,
+};
+
  #define SEV_FW_MAX_ERROR  ARRAY_SIZE(sev_fw_errlist)
  
  static int

@@ -353,6 +363,21 @@ static void sev_guest_set_kernel_hashes(Object *obj, bool 
value, Error **errp)
  sev->kernel_hashes = value;
  }
  
+static int sev_guest_get_accept_all_memory(Object *obj, Error **errp)

+{
+SevGuestState *sev = SEV_GUEST(obj);
+
+return sev->accept_all_memory;
+}
+
+static void
+sev_guest_set_accept_all_memory(Object *obj, int value, Error **errp)
+{
+SevGuestState *sev = SEV_GUEST(obj);
+
+sev->accept_all_memory = value;
+}
+
  static void
  sev_guest_class_init(ObjectClass *oc, void *data)
  {
@@ -376,6 +401,14 @@ sev_guest_class_init(ObjectClass *oc, void *data)
 sev_guest_set_kernel_hashes);
  object_class_property_set_description(oc, "kernel-hashes",
  "add kernel hashes to guest firmware for measured Linux boot");
+object_class_property_add_enum(oc, "accept-all-memory",
+   "MemoryAcceptance",
+   &memory_acceptance_lookup,
+sev_guest_get_accept_all_memory, sev_guest_set_accept_all_memory);
+object_class_property_set_description(
+oc, "accept-all-memory",
+"false: Accept all memory, true: Accept up to 4G and leave the rest 
unaccepted (UEFI"
+" v2.9 memory type), default: default firmware behavior.");
  }
  
  static void

@@ -906,6 +939,22 @@ sev_vm_state_change(void *opaque, bool running, RunState 
state)
  }
  }
  
+int sev_has_accept_all_memory(ConfidentialGuestSupport *cgs)

+{
+SevGuestState *sev
+= (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
+
+return sev && sev->accept_all_memory != 0;
+}
+
+int sev_accept_all_memory(ConfidentialGuestSupport *cgs)
+{
+SevGuestState *sev
+= (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
+
+return sev && sev->accept_all_memory == 1;
+}
+
  int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
  {
  SevGuestState *sev
diff --git a/target/i386/sev.h b/target/i386/sev.h
index 7b1528248a..d61b6e9443 100644
--- a/target/i386/sev.h
+++ b/target/i386/sev.h
@@ -58,5 +58,7 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t 
flash_size);
  void sev_es_set_reset_vector(CPUState *cpu);
  
  int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);

+int sev_has_accept_all_memory(ConfidentialGuestSupport *cgs);
+int sev_accept_all_memory(ConfidentialGuestSupport *cgs);
  
  #endif

[PATCH v2] target/i386: Add unaccepted memory configuration

2022-06-29 Thread Dionna Glaze

For SEV-SNP, an OS is "SEV-SNP capable" without supporting this UEFI
v2.9 memory type. In order for OVMF to be able to avoid pre-validating
potentially hundreds of gibibytes of data before booting, it needs to
know if the guest OS can support its use of the new type of memory in
the memory map.

Cc: Xu, Min M 
Cc: Xiaoyao Li 
Cc: Thomas Lendacky 
Cc: Gerd Hoffman 
Signed-off-by: Dionna Glaze 
---
 hw/i386/fw_cfg.c  |  6 ++
 target/i386/sev.c | 49 +++
 target/i386/sev.h |  2 ++
 3 files changed, 57 insertions(+)

diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index a283785a8d..9c069ddebe 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -23,6 +23,7 @@
 #include "e820_memory_layout.h"
 #include "kvm/kvm_i386.h"
 #include "qapi/error.h"
+#include "target/i386/sev.h"
 #include CONFIG_DEVICES
 
 struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
@@ -131,6 +132,11 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms,
  &e820_reserve, sizeof(e820_reserve));
 fw_cfg_add_file(fw_cfg, "etc/e820", e820_table,
 sizeof(struct e820_entry) * e820_get_num_entries());
+if (sev_has_accept_all_memory(ms->cgs)) {
+bool accept_all = sev_accept_all_memory(ms->cgs);
+fw_cfg_add_file(fw_cfg, "opt/ovmf/AcceptAllMemory",
+&accept_all, sizeof(accept_all));
+}
 
 fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg));
 /* allocate memory for the NUMA channel: one (64bit) word for the number
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 32f7dbac4e..01399a304c 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -64,6 +64,7 @@ struct SevGuestState {
 uint32_t cbitpos;
 uint32_t reduced_phys_bits;
 bool kernel_hashes;
+int accept_all_memory;
 
 /* runtime state */
 uint32_t handle;
@@ -155,6 +156,15 @@ static const char *const sev_fw_errlist[] = {
 [SEV_RET_SECURE_DATA_INVALID]= "Part-specific integrity check failure",
 };
 
+static QEnumLookup memory_acceptance_lookup = {
+.array = (const char *const[]) {
+"default",
+"true",
+"false",
+},
+.size = 3,
+};
+
 #define SEV_FW_MAX_ERROR  ARRAY_SIZE(sev_fw_errlist)
 
 static int
@@ -353,6 +363,21 @@ static void sev_guest_set_kernel_hashes(Object *obj, bool 
value, Error **errp)
 sev->kernel_hashes = value;
 }
 
+static int sev_guest_get_accept_all_memory(Object *obj, Error **errp)
+{
+SevGuestState *sev = SEV_GUEST(obj);
+
+return sev->accept_all_memory;
+}
+
+static void
+sev_guest_set_accept_all_memory(Object *obj, int value, Error **errp)
+{
+SevGuestState *sev = SEV_GUEST(obj);
+
+sev->accept_all_memory = value;
+}
+
 static void
 sev_guest_class_init(ObjectClass *oc, void *data)
 {
@@ -376,6 +401,14 @@ sev_guest_class_init(ObjectClass *oc, void *data)
sev_guest_set_kernel_hashes);
 object_class_property_set_description(oc, "kernel-hashes",
 "add kernel hashes to guest firmware for measured Linux boot");
+object_class_property_add_enum(oc, "accept-all-memory",
+   "MemoryAcceptance",
+   &memory_acceptance_lookup,
+sev_guest_get_accept_all_memory, sev_guest_set_accept_all_memory);
+object_class_property_set_description(
+oc, "accept-all-memory",
+"false: Accept all memory, true: Accept up to 4G and leave the rest 
unaccepted (UEFI"
+" v2.9 memory type), default: default firmware behavior.");
 }
 
 static void
@@ -906,6 +939,22 @@ sev_vm_state_change(void *opaque, bool running, RunState 
state)
 }
 }
 
+int sev_has_accept_all_memory(ConfidentialGuestSupport *cgs)
+{
+SevGuestState *sev
+= (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
+
+return sev && sev->accept_all_memory != 0;
+}
+
+int sev_accept_all_memory(ConfidentialGuestSupport *cgs)
+{
+SevGuestState *sev
+= (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
+
+return sev && sev->accept_all_memory == 1;
+}
+
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
 SevGuestState *sev
diff --git a/target/i386/sev.h b/target/i386/sev.h
index 7b1528248a..d61b6e9443 100644
--- a/target/i386/sev.h
+++ b/target/i386/sev.h
@@ -58,5 +58,7 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t 
flash_size);
 void sev_es_set_reset_vector(CPUState *cpu);
 
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
+int sev_has_accept_all_memory(ConfidentialGuestSupport *cgs);
+int sev_accept_all_memory(ConfidentialGuestSupport *cgs);
 
 #endif
-- 
2.37.0.rc0.161.g10f37bed90-goog

Re: [PATCH V8 27/39] vfio-pci: cpr part 1 (fd and dma)

2022-06-29 Thread Alex Williamson

On Wed, 15 Jun 2022 07:52:14 -0700
Steve Sistare  wrote:

> Enable vfio-pci devices to be saved and restored across an exec restart
> of qemu.
> 
> At vfio creation time, save the value of vfio container, group, and device
> descriptors in cpr state.
> 
> In the container pre_save handler, suspend the use of virtual addresses in
> DMA mappings with VFIO_DMA_UNMAP_FLAG_VADDR, because guest ram will be
> remapped at a different VA after exec.  DMA to already-mapped pages
> continues.  Save the msi message area as part of vfio-pci vmstate, save the
> interrupt and notifier eventfd's in cpr state, and clear the close-on-exec
> flag for the vfio descriptors.  The flag is not cleared earlier because the
> descriptors should not persist across miscellaneous fork and exec calls
> that may be performed during normal operation.
> 
> On qemu restart, vfio_realize() finds the saved descriptors, uses
> the descriptors, and notes that the device is being reused.  Device and
> iommu state is already configured, so operations in vfio_realize that
> would modify the configuration are skipped for a reused device, including
> vfio ioctl's and writes to PCI configuration space.  Vfio PCI device reset
> is also suppressed. The result is that vfio_realize constructs qemu data
> structures that reflect the current state of the device.  However, the
> reconstruction is not complete until cpr-load is called. cpr-load loads the
> msi data.  The vfio post_load handler finds eventfds in cpr state, rebuilds
> vector data structures, and attaches the interrupts to the new KVM instance.
> The container post_load handler then invokes the main vfio listener
> callback, which walks the flattened ranges of the vfio address space and
> calls VFIO_DMA_MAP_FLAG_VADDR to inform the kernel of the new VA's.  Lastly,
> cpr-load starts the VM.
> 
> This functionality is delivered by 3 patches for clarity.  Part 1 handles
> device file descriptors and DMA.  Part 2 adds eventfd and MSI/MSI-X vector
> support.  Part 3 adds INTX support.
> 
> Signed-off-by: Steve Sistare 
> ---
>  MAINTAINERS   |   1 +
>  hw/pci/pci.c  |  12 
>  hw/vfio/common.c  | 151 
> +++---
>  hw/vfio/cpr.c | 119 +
>  hw/vfio/meson.build   |   1 +
>  hw/vfio/pci.c |  44 
>  hw/vfio/trace-events  |   1 +
>  include/hw/vfio/vfio-common.h |  11 +++
>  include/migration/vmstate.h   |   1 +
>  9 files changed, 317 insertions(+), 24 deletions(-)
>  create mode 100644 hw/vfio/cpr.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 74a43e6..864aec6 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3156,6 +3156,7 @@ CPR
>  M: Steve Sistare 
>  M: Mark Kanda 
>  S: Maintained
> +F: hw/vfio/cpr.c
>  F: include/migration/cpr.h
>  F: migration/cpr.c
>  F: qapi/cpr.json
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 6e70153..a3b19eb 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -32,6 +32,7 @@
>  #include "hw/pci/pci_host.h"
>  #include "hw/qdev-properties.h"
>  #include "hw/qdev-properties-system.h"
> +#include "migration/cpr.h"
>  #include "migration/qemu-file-types.h"
>  #include "migration/vmstate.h"
>  #include "monitor/monitor.h"
> @@ -341,6 +342,17 @@ static void pci_reset_regions(PCIDevice *dev)
>  
>  static void pci_do_device_reset(PCIDevice *dev)
>  {
> +/*
> + * A PCI device that is resuming for cpr is already configured, so do
> + * not reset it here when we are called from qemu_system_reset prior to
> + * cpr-load, else interrupts may be lost for vfio-pci devices.  It is
> + * safe to skip this reset for all PCI devices, because cpr-load will set
> + * all fields that would have been set here.
> + */
> +if (cpr_get_mode() == CPR_MODE_RESTART) {
> +return;
> +}
> +
>  pci_device_deassert_intx(dev);
>  assert(dev->irq_state == 0);
>  
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ace9562..c7d73b6 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -31,6 +31,7 @@
>  #include "exec/memory.h"
>  #include "exec/ram_addr.h"
>  #include "hw/hw.h"
> +#include "migration/cpr.h"
>  #include "qemu/error-report.h"
>  #include "qemu/main-loop.h"
>  #include "qemu/range.h"
> @@ -460,6 +461,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
>  .size = size,
>  };
>  
> +assert(!container->reused);
> +
>  if (iotlb && container->dirty_pages_supported &&
>  vfio_devices_all_running_and_saving(container)) {
>  return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
> @@ -496,12 +499,24 @@ static int vfio_dma_map(VFIOContainer *container, 
> hwaddr iova,
>  {
>  struct vfio_iommu_type1_dma_map map = {
>  .argsz = sizeof(map),
> -.flags = VFIO_DMA_MAP_FLAG_READ,
>  .vaddr = (__u64)(uintptr_t)vaddr,
>  .iova = iova,
>  .size = size,
>  };
>  
> +

Re: [PATCH v2 09/13] hw/i2c/pmbus: Add read-only IC_DEVICE_ID support

2022-06-29 Thread Peter Delevoryas



> On Jun 29, 2022, at 11:04 AM, Titus Rwantare  wrote:
> 
> On Tue, 28 Jun 2022 at 20:36, Peter Delevoryas
>  wrote:
>> 
>> Signed-off-by: Peter Delevoryas 
>> ---
> 
>> --- a/hw/i2c/pmbus_device.c
>> +++ b/hw/i2c/pmbus_device.c
>> @@ -984,6 +984,11 @@ static uint8_t pmbus_receive_byte(SMBusDevice *smd)
>> }
>> break;
>> 
>> +case PMBUS_IC_DEVICE_ID:
>> +pmbus_send(pmdev, pmdev->pages[index].ic_device_id,
>> +   sizeof(pmdev->pages[index].ic_device_id));
>> +break;
>> +
> 
> I don't think it's a good idea to add this here because this sends 16
> bytes for all PMBus devices. I have at least one device that formats
> IC_DEVICE_ID differently that I've not got permission to upstream.
> The spec leaves the size and format up to the manufacturer, so this is
> best done in isl_pmbus_vr.c in isl_pmbus_vr_read_byte().
> Look at the adm1272_read_byte() which is more interesting than
> isl_pmbus_vr one as an example.

Argh, yes, you’re absolutely right. I didn’t read the spec in very
much detail, I see now that the length is device-specific. For the
ISL69259 I see that it’s 4 bytes, which makes sense now. This
is not the exact datasheet for the ISL69259, but I think the IC_DEVICE_ID
part is the same.

https://www.renesas.com/us/en/document/dst/isl68229-isl68239-datasheet

Putting the logic in isl_pmbus_vr_read_byte() is a good idea, I hadn’t
seen the implementation in adm1272_read_byte(), that looks great,
I’ll just add a switch on the command code and move the error message
to the default case.

> 
> 
>> case PMBUS_CLEAR_FAULTS:  /* Send Byte */
>> case PMBUS_PAGE_PLUS_WRITE:   /* Block Write-only */
>> case PMBUS_STORE_DEFAULT_ALL: /* Send Byte */
>> diff --git a/hw/sensor/isl_pmbus_vr.c b/hw/sensor/isl_pmbus_vr.c
>> index e11e028884..b12c46ab6d 100644
>> --- a/hw/sensor/isl_pmbus_vr.c
>> +++ b/hw/sensor/isl_pmbus_vr.c
>> @@ -218,6 +218,28 @@ static void isl_pmbus_vr_class_init(ObjectClass *klass, 
>> void *data,
>> k->device_num_pages = pages;
>> }
>> 
>> +static void isl69259_init(Object *obj)
>> +{
>> +static const uint8_t ic_device_id[] = {0x04, 0x00, 0x81, 0xD2, 0x49};
>> +PMBusDevice *pmdev = PMBUS_DEVICE(obj);
>> +int i;
>> +
>> +raa22xx_init(obj);
>> +for (i = 0; i < pmdev->num_pages; i++) {
>> +memcpy(pmdev->pages[i].ic_device_id, ic_device_id,
>> +   sizeof(ic_device_id));
>> +}
>> +}
>> +
> 
> We tend to set default register values in exit_reset() calls. You can
> do something like in raa228000_exit_reset()

Oh got it. If I can move the logic to isl_pmbus_vr_read_byte perhaps
I can avoid this whole function though.

> 
> 
>> diff --git a/include/hw/i2c/pmbus_device.h b/include/hw/i2c/pmbus_device.h
>> index 0f4d6b3fad..aed7809841 100644
>> --- a/include/hw/i2c/pmbus_device.h
>> +++ b/include/hw/i2c/pmbus_device.h
>> @@ -407,6 +407,7 @@ typedef struct PMBusPage {
>> uint16_t mfr_max_temp_1;   /* R/W word */
>> uint16_t mfr_max_temp_2;   /* R/W word */
>> uint16_t mfr_max_temp_3;   /* R/W word */
>> +uint8_t ic_device_id[16];  /* Read-Only block-read */
> 
> You wouldn't be able to do this here either, since the size could be
> anything for other devices.

Right, yeah I see what you mean.

> 
> Thanks for the new device. It helps me see where to expand on PMBus.

Thanks for adding the whole pmbus infrastructure! It’s really useful.
And thanks for the review.

Off-topic, but I’ve been meaning to reach out to you guys (Google
engineers working on QEMU for OpenBMC) about your Nuvoton NPCM845R
series, my team is interested in using it. I was just curious about
the status of it and if there’s any features missing and what plans
you have for the future, maybe we can collaborate.

Thanks!
Peter

> 
> 
> Titus

Re: [PATCH v2 08/13] hw/i2c/pmbus: Reset out buf after switching pages

2022-06-29 Thread Peter Delevoryas



> On Jun 29, 2022, at 11:05 AM, Titus Rwantare  wrote:
> 
> On Tue, 28 Jun 2022 at 20:36, Peter Delevoryas
>  wrote:
>> 
>> When a pmbus device switches pages, it should clear its output buffer so
>> that the next transaction doesn't emit data from the previous page.
>> 
>> Fixes: 3746d5c15e70570b ("hw/i2c: add support for PMBus")
>> Signed-off-by: Peter Delevoryas 
>> ---
>> hw/i2c/pmbus_device.c | 1 +
>> 1 file changed, 1 insertion(+)
>> 
>> diff --git a/hw/i2c/pmbus_device.c b/hw/i2c/pmbus_device.c
>> index 62885fa6a1..efddc36fd9 100644
>> --- a/hw/i2c/pmbus_device.c
>> +++ b/hw/i2c/pmbus_device.c
>> @@ -1088,6 +1088,7 @@ static int pmbus_write_data(SMBusDevice *smd, uint8_t 
>> *buf, uint8_t len)
>> 
>> if (pmdev->code == PMBUS_PAGE) {
>> pmdev->page = pmbus_receive8(pmdev);
>> +pmdev->out_buf_len = 0;
>> return 0;
>> }
>> 
> 
> I suspect you were running into this because ic_device_id was putting
> too much data in the output buffer. Still, I wouldn't want the buffer
> cleared if the page hasn't changed. Some drivers write the same page
> before every read.

Yes you’re correct: this is the code that was querying the ic_device_id [1]:

memset(&msg, 0, sizeof(msg));
msg.bus = sensor_config[index].port;
msg.target_addr = sensor_config[index].target_addr;
msg.tx_len = 1;
msg.rx_len = 7;
msg.data[0] = PMBUS_IC_DEVICE_ID;
if (i2c_master_read(&msg, retry)) {
printf("Failed to read VR IC_DEVICE_ID: register(0x%x)\n", 
PMBUS_IC_DEVICE_ID);
return;
}

By sending a buffer that was way larger than the rx buffer of 7, it was
leaving stuff lying around for the next query.

I’ll test it out and see what happens if I fix the IC_DEVICE_ID length
transmitted by the ISL69259 to 4, maybe we don’t need this patch. But,
at the very least, I’ll make sure to gate this on the page value changing,
not just being set.

Thanks for the review, this was really useful. I’m not very familiar
with pmbus devices.

[1] 
https://github.com/facebook/OpenBIC/blob/cda4c00b032b1d9c9b94c45faa2c0eca4886a0a3/meta-facebook/yv35-cl/src/platform/plat_sensor_table.c#L332-L355

> 
> Titus

Re: [PATCH 12/14] aspeed: Make aspeed_board_init_flashes public

2022-06-29 Thread Alex Bennée



Cédric Le Goater  writes:

> On 6/29/22 16:14, Alex Bennée wrote:
>> Cédric Le Goater  writes:
>> 
>>> On 6/24/22 18:50, Cédric Le Goater wrote:
 On 6/23/22 20:43, Peter Delevoryas wrote:
>
>
>> On Jun 23, 2022, at 8:09 AM, Cédric Le Goater  wrote:
>>
>> On 6/23/22 12:26, Peter Delevoryas wrote:
>>> Signed-off-by: Peter Delevoryas 
>>
>> Let's start simple without flash support. We should be able to
>> load FW blobs in each CPU address space using loader devices.
>
> Actually, I was unable to do this, perhaps because the fb OpenBMC
> boot sequence is a little weird. I specifically _needed_ to have
> a flash device which maps the firmware in at 0x2000_0000, because
> the fb OpenBMC U-Boot SPL jumps to that address to start executing
> from flash? I think this is also why fb OpenBMC machines can be so slow.
>
> $ ./build/qemu-system-arm -machine fby35 \
>   -device loader,file=fby35.mtd,addr=0,cpu-num=0 -nographic \
>   -d int -drive file=fby35.mtd,format=raw,if=mtd
 Ideally we should be booting from the flash device directly using
 the machine option '-M ast2600-evb,execute-in-place=true' like HW
 does. Instructions are fetched using SPI transfers. But the amount
 of code generated is tremendous.
>> Yeah because there is a potential race when reading from HW so we
>> throw
>> away TB's after executing them because we have no way of knowing if it
>> has changed under our feet. See 873d64ac30 (accel/tcg: re-factor non-RAM
>> execution code) which cleaned up this handling.
>> 
 See some profiling below for a
 run which barely reaches DRAM training in U-Boot.
>>>
>>> Some more profiling on both ast2500 and ast2600 machines shows :
>>>
>>>
>>> * ast2600-evb,execute-in-place=true :
>>>
>>> Type       Object          Call site                Wait Time (s)     Count  Average (us)
>>> -----------------------------------------------------------------------------------------
>>> BQL mutex  0x564dc03922e0  accel/tcg/cputlb.c:1365       14.21443  32909927          0.43
>> This is unavoidable as a HW access needs the BQL held so we will go
>> through this cycle every executed instruction.
>> Did I miss why the flash contents are not mapped into the physical
>> address space? Isn't that how it appear to the processor?
>
>
> There are two modes :
>   if (ASPEED_MACHINE(machine)->mmio_exec) {
>       memory_region_init_alias(boot_rom, NULL, "aspeed.boot_rom",
>                                &fl->mmio, 0, size);
>       memory_region_add_subregion(get_system_memory(), FIRMWARE_ADDR,
>                                   boot_rom);
>   } else {
>       memory_region_init_rom(boot_rom, NULL, "aspeed.boot_rom",
>                              size, &error_abort);
>       memory_region_add_subregion(get_system_memory(), FIRMWARE_ADDR,
>                                   boot_rom);
>       write_boot_rom(drive0, FIRMWARE_ADDR, size, &error_abort);
>   }
>
> The default boot mode uses the ROM. No issue.
>
> The "execute-in-place=true" option creates an alias on the region of
> the flash contents and each instruction is then fetched from the flash
> drive with SPI transactions.
>
> With old FW images, using an older U-boot, the machine boots in a couple
> of seconds. See the profiling below for a witherspoon-bmc machine using
> U-Boot 2016.07.
>
>   qemu-system-arm -M witherspoon-bmc,execute-in-place=true  -drive 
> file=./flash-witherspoon-bmc,format=raw,if=mtd -drive 
> file=./flash-witherspoon-bmc2,format=raw,if=mtd -nographic -nodefaults 
> -snapshot -serial mon:stdio -enable-sync-profile
>   ...
>   U-Boot 2016.07-00040-g8425e96e2e27-dirty (Jun 24 2022 - 23:21:57 +0200)
> Watchdog enabled
>   DRAM:  496 MiB
>   Flash: 32 MiB
>   In:serial
>   Out:   serial
>   Err:   serial
>   Net:
>   (qemu) info sync-profile
>   Type       Object          Call site                Wait Time (s)     Count  Average (us)
>   -----------------------------------------------------------------------------------------
>   BQL mutex  0x56189610b2e0  accel/tcg/cputlb.c:1365        0.25311  12346237          0.02
>   condvar    0x5618970cf220  softmmu/cpus.c:423             0.05506         2      27530.78
>   BQL mutex  0x56189610b2e0  util/rcu.c:269                 0.04709         2      23544.26
>   condvar    0x561896d0fc78  util/thread-pool.c:90          0.01340        83        161.47
>   condvar    0x56189610b240  softmmu/cpus.c:571             0.00005         1         54.93
>   condvar    0x56189610b280  softmmu/cpus.c:642             0.00003         1         32.88
>   BQL mutex  0x56189610b2e0  util/main-loop.c:318           0.00003        34          0.76
>   mutex      0x561896eade00  tcg/region.c:204               0.00002       995          0.02
>   rec_mutex  [

Re: [PATCH v2 08/13] hw/i2c/pmbus: Reset out buf after switching pages

2022-06-29 Thread Titus Rwantare

On Tue, 28 Jun 2022 at 20:36, Peter Delevoryas
 wrote:
>
> When a pmbus device switches pages, it should clear its output buffer so
> that the next transaction doesn't emit data from the previous page.
>
> Fixes: 3746d5c15e70570b ("hw/i2c: add support for PMBus")
> Signed-off-by: Peter Delevoryas 
> ---
>  hw/i2c/pmbus_device.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/hw/i2c/pmbus_device.c b/hw/i2c/pmbus_device.c
> index 62885fa6a1..efddc36fd9 100644
> --- a/hw/i2c/pmbus_device.c
> +++ b/hw/i2c/pmbus_device.c
> @@ -1088,6 +1088,7 @@ static int pmbus_write_data(SMBusDevice *smd, uint8_t 
> *buf, uint8_t len)
>
>  if (pmdev->code == PMBUS_PAGE) {
>  pmdev->page = pmbus_receive8(pmdev);
> +pmdev->out_buf_len = 0;
>  return 0;
>  }
>

I suspect you were running into this because ic_device_id was putting
too much data in the output buffer. Still, I wouldn't want the buffer
cleared if the page hasn't changed. Some drivers write the same page
before every read.

Titus

Re: [PATCH v2 09/13] hw/i2c/pmbus: Add read-only IC_DEVICE_ID support

2022-06-29 Thread Titus Rwantare

On Tue, 28 Jun 2022 at 20:36, Peter Delevoryas
 wrote:
>
> Signed-off-by: Peter Delevoryas 
> ---

> --- a/hw/i2c/pmbus_device.c
> +++ b/hw/i2c/pmbus_device.c
> @@ -984,6 +984,11 @@ static uint8_t pmbus_receive_byte(SMBusDevice *smd)
>  }
>  break;
>
> +case PMBUS_IC_DEVICE_ID:
> +pmbus_send(pmdev, pmdev->pages[index].ic_device_id,
> +   sizeof(pmdev->pages[index].ic_device_id));
> +break;
> +

I don't think it's a good idea to add this here because this sends 16
bytes for all PMBus devices. I have at least one device that formats
IC_DEVICE_ID differently that I've not got permission to upstream.
The spec leaves the size and format up to the manufacturer, so this is
best done in isl_pmbus_vr.c in isl_pmbus_vr_read_byte().
Look at the adm1272_read_byte() which is more interesting than
isl_pmbus_vr one as an example.

>  case PMBUS_CLEAR_FAULTS:  /* Send Byte */
>  case PMBUS_PAGE_PLUS_WRITE:   /* Block Write-only */
>  case PMBUS_STORE_DEFAULT_ALL: /* Send Byte */
> diff --git a/hw/sensor/isl_pmbus_vr.c b/hw/sensor/isl_pmbus_vr.c
> index e11e028884..b12c46ab6d 100644
> --- a/hw/sensor/isl_pmbus_vr.c
> +++ b/hw/sensor/isl_pmbus_vr.c
> @@ -218,6 +218,28 @@ static void isl_pmbus_vr_class_init(ObjectClass *klass, 
> void *data,
>  k->device_num_pages = pages;
>  }
>
> +static void isl69259_init(Object *obj)
> +{
> +static const uint8_t ic_device_id[] = {0x04, 0x00, 0x81, 0xD2, 0x49};
> +PMBusDevice *pmdev = PMBUS_DEVICE(obj);
> +int i;
> +
> +raa22xx_init(obj);
> +for (i = 0; i < pmdev->num_pages; i++) {
> +memcpy(pmdev->pages[i].ic_device_id, ic_device_id,
> +   sizeof(ic_device_id));
> +}
> +}
> +

We tend to set default register values in exit_reset() calls. You can
do something like in raa228000_exit_reset()

> diff --git a/include/hw/i2c/pmbus_device.h b/include/hw/i2c/pmbus_device.h
> index 0f4d6b3fad..aed7809841 100644
> --- a/include/hw/i2c/pmbus_device.h
> +++ b/include/hw/i2c/pmbus_device.h
> @@ -407,6 +407,7 @@ typedef struct PMBusPage {
>  uint16_t mfr_max_temp_1;   /* R/W word */
>  uint16_t mfr_max_temp_2;   /* R/W word */
>  uint16_t mfr_max_temp_3;   /* R/W word */
> +uint8_t ic_device_id[16];  /* Read-Only block-read */

You wouldn't be able to do this here either, since the size could be
anything for other devices.

Thanks for the new device. It helps me see where to expand on PMBus.

Titus

Re: Slowness with multi-thread TCG?

2022-06-29 Thread Alex Bennée



"Matheus K. Ferst"  writes:

> On 29/06/2022 12:36, Frederic Barrat wrote:
>> [E-MAIL EXTERNO] Não clique em links ou abra anexos, a menos que
>> você possa confirmar o remetente e saber que o conteúdo é seguro. Em
>> caso de e-mail suspeito entre imediatamente em contato com o DTI.
>> On 29/06/2022 00:17, Alex Bennée wrote:
>>> If you run the sync-profiler (via the HMP "sync-profile on") you can
>>> then get a breakdown of which mutex's are being held and for how long
>>> ("info sync-profile").
>> Alex, a huge thank you!
>> For the record, the "info sync-profile" showed:
>> Type       Object          Call site                     Wait Time (s)     Count  Average (us)
>> ----------------------------------------------------------------------------------------------
>> BQL mutex  0x55eb89425540  accel/tcg/cpu-exec.c:744           96.31578  73589937          1.31
>> BQL mutex  0x55eb89425540  target/ppc/helper_regs.c:207       0.00150      1178          1.27
>> And it points to a lock in the interrupt delivery path, in
>> cpu_handle_interrupt().
>> I now understand the root cause. The interrupt signal for the
>> decrementer interrupt remains set because the interrupt is not being
>> delivered, per the config. I'm not quite sure what the proper fix is yet
>> (there seem to be several implementations of the decrementer on ppc),
>> but at least I understand why we are so slow.
>> 
>
> To summarize what we talked about elsewhere:
> 1 - The threads that are not decompressing the kernel have a pending
> PPC_INTERRUPT_DECR, and cs->interrupt_request is CPU_INTERRUPT_HARD;

I think ppc_set_irq should be doing some gating before calling to set
cs->interrupt_request.

> 2 - cpu_handle_interrupt calls ppc_cpu_exec_interrupt, that calls
> ppc_hw_interrupt to handle the interrupt;
> 3 - ppc_cpu_exec_interrupt decides that the interrupt cannot be
> delivered immediately, so the corresponding bit in
> env->pending_interrupts is not reset;

Is the logic controlled by ppc_hw_interrupt()? The stuff around
async_deliver?

I think maybe some of the logic needs to be factored out and checked
above. Also anywhere where env->msr is updated would need to check if
we've just enabled a load of pending interrupts and then call
ppc_set_irq.

However I'm not super familiar with the PPC code so I'll defer to the
maintainers here ;-)

> 4 - ppc_cpu_exec_interrupt does not change cs->interrupt_request
> because pending_interrupts != 0, so cpu_handle_interrupt will be
> called again.
>
> This loop will acquire and release qemu_mutex_lock_iothread, slowing
> down other threads that need this lock.
>
>> With a quick hack, I could verify that by moving that signal out of the
>> way, the decompression time of the kernel is now peanuts, no matter the
>> number of cpus. Even with one cpu, the 15 seconds measured before was
>> already a huge waste, so it was not really a multiple-cpus problem.
>> Multiple cpus were just highlighting it.
>> Thanks again!
>>    Fred


-- 
Alex Bennée

[PATCH 3/3] gitlab: honour QEMU_CI variable in edk2/opensbi jobs

2022-06-29 Thread Daniel P . Berrangé

To preserve contributor CI credits we don't want jobs to run by default
unless the QEMU_CI variable is set. For most jobs we can achieve this
using the base template, but the edk2/opensbi jobs are a little special
as they have some complex conditions we can't easily model in the base
template.

We duplicate existing rules and put them under control of QEMU_CI
variable, such that QEMU_CI=1 creates manual jobs and QEMU_CI=2
immediately runs jobs.

Signed-off-by: Daniel P. Berrangé 
---
 .gitlab-ci.d/edk2.yml| 23 +++
 .gitlab-ci.d/opensbi.yml | 23 +++
 2 files changed, 46 insertions(+)

diff --git a/.gitlab-ci.d/edk2.yml b/.gitlab-ci.d/edk2.yml
index 905e02440f..970bdbd315 100644
--- a/.gitlab-ci.d/edk2.yml
+++ b/.gitlab-ci.d/edk2.yml
@@ -1,6 +1,29 @@
 # All jobs needing docker-edk2 must use the same rules it uses.
 .edk2_job_rules:
   rules:
+# Forks don't get pipelines unless QEMU_CI=1 or QEMU_CI=2 is set
+- if: '$QEMU_CI != "1" && $QEMU_CI != "2" && $CI_PROJECT_NAMESPACE != 
"qemu-project"'
+  when: never
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if any of the files affecting the build are touched
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project"'
+  changes:
+- .gitlab-ci.d/edk2.yml
+- .gitlab-ci.d/edk2/Dockerfile
+- roms/edk2/*
+  when: manual
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if the branch/tag starts with 'edk2'
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project" && 
$CI_COMMIT_REF_NAME =~ /^edk2/'
+  when: manual
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if last commit msg contains 'EDK2' (case insensitive)
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project" && 
$CI_COMMIT_MESSAGE =~ /edk2/i'
+  when: on_success
+
 # Run if any files affecting the build output are touched
 - changes:
 - .gitlab-ci.d/edk2.yml
diff --git a/.gitlab-ci.d/opensbi.yml b/.gitlab-ci.d/opensbi.yml
index 753a003f93..04ed5a3ea1 100644
--- a/.gitlab-ci.d/opensbi.yml
+++ b/.gitlab-ci.d/opensbi.yml
@@ -1,6 +1,29 @@
 # All jobs needing docker-opensbi must use the same rules it uses.
 .opensbi_job_rules:
   rules:
+# Forks don't get pipelines unless QEMU_CI=1 or QEMU_CI=2 is set
+- if: '$QEMU_CI != "1" && $QEMU_CI != "2" && $CI_PROJECT_NAMESPACE != 
"qemu-project"'
+  when: never
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if any files affecting the build output are touched
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project"'
+  changes:
+- .gitlab-ci.d/opensbi.yml
+- .gitlab-ci.d/opensbi/Dockerfile
+- roms/opensbi/*
+  when: manual
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if the branch/tag starts with 'opensbi'
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project" && 
$CI_COMMIT_REF_NAME =~ /^opensbi/'
+  when: manual
+
+# In forks, if QEMU_CI=1 is set, then create manual job
+# if the last commit msg contains 'OpenSBI' (case insensitive)
+- if: '$QEMU_CI == "1" && $CI_PROJECT_NAMESPACE != "qemu-project" && 
$CI_COMMIT_MESSAGE =~ /opensbi/i'
+  when: manual
+
 # Run if any files affecting the build output are touched
 - changes:
 - .gitlab-ci.d/opensbi.yml
-- 
2.36.1

[PATCH v5] xen/pass-through: merge emulated bits correctly

2022-06-29 Thread Chuck Zmudzinski

In xen_pt_config_reg_init(), there is an error in the merging of the
emulated data with the host value. With the current Qemu, instead of
merging the emulated bits with the host bits as defined by emu_mask,
the emulated bits are merged with the host bits as defined by the
inverse of emu_mask. In some cases, depending on the data in the
registers on the host, the way the registers are set up, and the
initial values of the emulated bits, the end result will be that
the register is initialized with the wrong value.

To correct this error, use the XEN_PT_MERGE_VALUE macro to help ensure
the merge is done correctly.

This correction is needed to resolve Qemu project issue #1061, which
describes the failure of Xen HVM Linux guests to boot in certain
configurations with passed through PCI devices, that is, when this error
disables instead of enables the PCI_STATUS_CAP_LIST bit of the
PCI_STATUS register of a passed through PCI device, which in turn
disables the MSI-X capability of the device in Linux guests with the end
result being that the Linux guest never completes the boot process.

Fixes: 2e87512eccf3 ("xen/pt: Sync up the dev.config and data values")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1061
Buglink: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988333

Signed-off-by: Chuck Zmudzinski 
Reviewed-by: Anthony PERARD 
---
v2: Edit the commit message to more accurately describe the cause
of the error.

v3: * Add Reviewed-By: Anthony Perard 
* Add qemu-sta...@nongnu.org to recipients to indicate the patch
  may be suitable for backport to Qemu stable

v4: * Add Fixed commit subject to Fixes: 2e87512eccf3

Sorry for the extra noise with v4 (I thought the Fixed commit subject
would be automatically added).

v5: * Coding style fix: move block comment leading /* and trailing */
  to separate lines

Again, sorry for the noise, but the style of the comment was wrong
before v5.

Thank you, Anthony, again, for taking the time to review this patch.

 hw/xen/xen_pt_config_init.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/xen/xen_pt_config_init.c b/hw/xen/xen_pt_config_init.c
index cad4aeba84..4758514ddf 100644
--- a/hw/xen/xen_pt_config_init.c
+++ b/hw/xen/xen_pt_config_init.c
@@ -1965,11 +1965,12 @@ static void 
xen_pt_config_reg_init(XenPCIPassthroughState *s,
 
 if ((data & host_mask) != (val & host_mask)) {
 uint32_t new_val;
-
-/* Mask out host (including past size). */
-new_val = val & host_mask;
-/* Merge emulated ones (excluding the non-emulated ones). */
-new_val |= data & host_mask;
+/*
+ * Merge the emulated bits (data) with the host bits (val)
+ * and mask out the bits past size to enable restoration
+ * of the proper value for logging below.
+ */
+new_val = XEN_PT_MERGE_VALUE(val, data, host_mask) & size_mask;
 /* Leave intact host and emulated values past the size - even 
though
  * we do not care as we write per reg->size granularity, but for 
the
  * logging below lets have the proper value. */
-- 
2.36.1

[PATCH 2/3] gitlab: tweak comments in edk2/opensbi jobs

2022-06-29 Thread Daniel P . Berrangé

Get rid of comments stating the obvious and re-arrange remaining
comments. The opensbi split of rules for file matches is also
merged into one rule.

Signed-off-by: Daniel P. Berrangé 
---
 .gitlab-ci.d/edk2.yml| 14 --
 .gitlab-ci.d/opensbi.yml | 15 ---
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.gitlab-ci.d/edk2.yml b/.gitlab-ci.d/edk2.yml
index fbe763a282..905e02440f 100644
--- a/.gitlab-ci.d/edk2.yml
+++ b/.gitlab-ci.d/edk2.yml
@@ -1,17 +1,19 @@
 # All jobs needing docker-edk2 must use the same rules it uses.
 .edk2_job_rules:
-  rules: # Only run this job when ...
+  rules:
+# Run if any files affecting the build output are touched
 - changes:
-# this file is modified
 - .gitlab-ci.d/edk2.yml
-# or the Dockerfile is modified
 - .gitlab-ci.d/edk2/Dockerfile
-# or roms/edk2/ is modified (submodule updated)
 - roms/edk2/*
   when: on_success
-- if: '$CI_COMMIT_REF_NAME =~ /^edk2/' # or the branch/tag starts with 
'edk2'
+
+# Run if the branch/tag starts with 'edk2'
+- if: '$CI_COMMIT_REF_NAME =~ /^edk2/'
   when: on_success
-- if: '$CI_COMMIT_MESSAGE =~ /edk2/i' # or last commit description 
contains 'EDK2'
+
+# Run if last commit msg contains 'EDK2' (case insensitive)
+- if: '$CI_COMMIT_MESSAGE =~ /edk2/i'
   when: on_success
 
 docker-edk2:
diff --git a/.gitlab-ci.d/opensbi.yml b/.gitlab-ci.d/opensbi.yml
index 0745ccdf10..753a003f93 100644
--- a/.gitlab-ci.d/opensbi.yml
+++ b/.gitlab-ci.d/opensbi.yml
@@ -1,18 +1,19 @@
 # All jobs needing docker-opensbi must use the same rules it uses.
 .opensbi_job_rules:
-  rules: # Only run this job when ...
+  rules:
+# Run if any files affecting the build output are touched
 - changes:
-# this file is modified
 - .gitlab-ci.d/opensbi.yml
-# or the Dockerfile is modified
 - .gitlab-ci.d/opensbi/Dockerfile
-  when: on_success
-- changes: # or roms/opensbi/ is modified (submodule updated)
 - roms/opensbi/*
   when: on_success
-- if: '$CI_COMMIT_REF_NAME =~ /^opensbi/' # or the branch/tag starts with 
'opensbi'
+
+# Run if the branch/tag starts with 'opensbi'
+- if: '$CI_COMMIT_REF_NAME =~ /^opensbi/'
   when: on_success
-- if: '$CI_COMMIT_MESSAGE =~ /opensbi/i' # or last commit description 
contains 'OpenSBI'
+
+# Run if the last commit msg contains 'OpenSBI' (case insensitive)
+- if: '$CI_COMMIT_MESSAGE =~ /opensbi/i'
   when: on_success
 
 docker-opensbi:
-- 
2.36.1

[PATCH 1/3] gitlab: normalize indentation in edk2/opensbi rules

2022-06-29 Thread Daniel P . Berrangé

The edk2/opensbi gitlab CI config was using single space indents
which is not consistent with the rest of the gitlab CI config
files.

Signed-off-by: Daniel P. Berrangé 
---
 .gitlab-ci.d/edk2.yml| 108 +++---
 .gitlab-ci.d/opensbi.yml | 110 +++
 2 files changed, 109 insertions(+), 109 deletions(-)

diff --git a/.gitlab-ci.d/edk2.yml b/.gitlab-ci.d/edk2.yml
index 13d0f8b019..fbe763a282 100644
--- a/.gitlab-ci.d/edk2.yml
+++ b/.gitlab-ci.d/edk2.yml
@@ -1,60 +1,60 @@
 # All jobs needing docker-edk2 must use the same rules it uses.
 .edk2_job_rules:
- rules: # Only run this job when ...
- - changes:
-   # this file is modified
-   - .gitlab-ci.d/edk2.yml
-   # or the Dockerfile is modified
-   - .gitlab-ci.d/edk2/Dockerfile
-   # or roms/edk2/ is modified (submodule updated)
-   - roms/edk2/*
-   when: on_success
- - if: '$CI_COMMIT_REF_NAME =~ /^edk2/' # or the branch/tag starts with 'edk2'
-   when: on_success
- - if: '$CI_COMMIT_MESSAGE =~ /edk2/i' # or last commit description contains 
'EDK2'
-   when: on_success
+  rules: # Only run this job when ...
+- changes:
+# this file is modified
+- .gitlab-ci.d/edk2.yml
+# or the Dockerfile is modified
+- .gitlab-ci.d/edk2/Dockerfile
+# or roms/edk2/ is modified (submodule updated)
+- roms/edk2/*
+  when: on_success
+- if: '$CI_COMMIT_REF_NAME =~ /^edk2/' # or the branch/tag starts with 
'edk2'
+  when: on_success
+- if: '$CI_COMMIT_MESSAGE =~ /edk2/i' # or last commit description 
contains 'EDK2'
+  when: on_success
 
 docker-edk2:
- extends: .edk2_job_rules
- stage: containers
- image: docker:19.03.1
- services:
- - docker:19.03.1-dind
- variables:
-  GIT_DEPTH: 3
-  IMAGE_TAG: $CI_REGISTRY_IMAGE:edk2-cross-build
-  # We don't use TLS
-  DOCKER_HOST: tcp://docker:2375
-  DOCKER_TLS_CERTDIR: ""
- before_script:
- - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- script:
- - docker pull $IMAGE_TAG || true
- - docker build --cache-from $IMAGE_TAG --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
---tag $IMAGE_TAG .gitlab-ci.d/edk2
- - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- - docker push $IMAGE_TAG
+  extends: .edk2_job_rules
+  stage: containers
+  image: docker:19.03.1
+  services:
+- docker:19.03.1-dind
+  variables:
+GIT_DEPTH: 3
+IMAGE_TAG: $CI_REGISTRY_IMAGE:edk2-cross-build
+# We don't use TLS
+DOCKER_HOST: tcp://docker:2375
+DOCKER_TLS_CERTDIR: ""
+  before_script:
+- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
+  script:
+- docker pull $IMAGE_TAG || true
+- docker build --cache-from $IMAGE_TAG --tag 
$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
+   --tag $IMAGE_TAG .gitlab-ci.d/edk2
+- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
+- docker push $IMAGE_TAG
 
 build-edk2:
- extends: .edk2_job_rules
- stage: build
- needs: ['docker-edk2']
- artifacts:
-   paths: # 'artifacts.zip' will contains the following files:
-   - pc-bios/edk2*bz2
-   - pc-bios/edk2-licenses.txt
-   - edk2-stdout.log
-   - edk2-stderr.log
- image: $CI_REGISTRY_IMAGE:edk2-cross-build
- variables:
-   GIT_DEPTH: 3
- script: # Clone the required submodules and build EDK2
- - git submodule update --init roms/edk2
- - git -C roms/edk2 submodule update --init --
- ArmPkg/Library/ArmSoftFloatLib/berkeley-softfloat-3
- BaseTools/Source/C/BrotliCompress/brotli
- CryptoPkg/Library/OpensslLib/openssl
- MdeModulePkg/Library/BrotliCustomDecompressLib/brotli
- - export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1))
- - echo "=== Using ${JOBS} simultaneous jobs ==="
- - make -j${JOBS} -C roms efi 2>&1 1>edk2-stdout.log | tee -a edk2-stderr.log 
>&2
+  extends: .edk2_job_rules
+  stage: build
+  needs: ['docker-edk2']
+  artifacts:
+paths: # 'artifacts.zip' will contains the following files:
+  - pc-bios/edk2*bz2
+  - pc-bios/edk2-licenses.txt
+  - edk2-stdout.log
+  - edk2-stderr.log
+  image: $CI_REGISTRY_IMAGE:edk2-cross-build
+  variables:
+GIT_DEPTH: 3
+  script: # Clone the required submodules and build EDK2
+- git submodule update --init roms/edk2
+- git -C roms/edk2 submodule update --init --
+   ArmPkg/Library/ArmSoftFloatLib/berkeley-softfloat-3
+   BaseTools/Source/C/BrotliCompress/brotli
+   CryptoPkg/Library/OpensslLib/openssl
+   MdeModulePkg/Library/BrotliCustomDecompressLib/brotli
+- export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1))
+- echo "=== Using ${JOBS} simultaneous jobs ==="
+- make -j${JOBS} -C roms efi 2>&1 1>edk2-stdout.log | tee -a 
edk2-stderr.log >&2
diff --git a/.gitlab-ci.d/opensbi.yml b/.gitlab-ci.d/opensbi.yml
index 29a22930d1..0745ccdf10 100644
--- a/.gitlab-ci.d/opensbi.yml
+++ b/.gitlab-ci.d/opensbi.yml
@@ -1,61 +1,61 @@
 # All jobs needing docker-opensbi must use the same rules it uses.

[PATCH 0/3] gitlab: stop edk2/opensbi jobs running in forks by default

2022-06-29 Thread Daniel P . Berrangé

When we introduced the QEMU_CI variable to control running of pipelines
in gitlab forks, we didn't include the edk2/opensbi jobs in the rules.
This caused pipelines to be unexpectedly created in some cases. This
addresses that oversight.

Daniel P. Berrangé (3):
  gitlab: normalize indentation in edk2/opensbi rules
  gitlab: tweak comments in edk2/opensbi jobs
  gitlab: honour QEMU_CI variable in edk2/opensbi jobs

 .gitlab-ci.d/edk2.yml| 133 ++
 .gitlab-ci.d/opensbi.yml | 134 +++
 2 files changed, 158 insertions(+), 109 deletions(-)

-- 
2.36.1

Re: [PATCH v2 10/13] hw/misc/aspeed: Add PECI controller

2022-06-29 Thread Cédric Le Goater


On 6/29/22 18:07, Peter Delevoryas wrote:




On Jun 29, 2022, at 2:20 AM, Cédric Le Goater  wrote:

On 6/29/22 05:36, Peter Delevoryas wrote:

This introduces a really basic PECI controller that responds to
commands by always setting the response code to success and then raising
an interrupt to indicate the command is done. This helps avoid getting
hit with constant errors if the driver continuously attempts to send a
command and keeps timing out.
The AST2400 and AST2500 only included registers up to 0x5C, not 0xFC.
They supported PECI 1.1, 2.0, and 3.0. The AST2600 and AST1030 support
PECI 4.0, which includes more read/write buffer registers from 0x80 to
0xFC to support 64-byte mode.
This patch doesn't attempt to handle that, or to create a different
version of the controller for the different generations, since it's only
implementing functionality that is common to all generations.
The basic sequence of events is that the firmware will read and write to
various registers and then trigger a command by setting the FIRE bit in
the command register (similar to the I2C controller).
Then the firmware waits for an interrupt from the PECI controller,
expecting the interrupt status register to be filled in with info on
what happened. If the command was transmitted and received successfully,
then response codes from the host CPU will be found in the data buffer
registers.
Signed-off-by: Peter Delevoryas 



LGTM. A few small comments below.



---
  hw/arm/aspeed_ast10x0.c   |  12 +++
  hw/arm/aspeed_ast2600.c   |  12 +++
  hw/arm/aspeed_soc.c   |  13 
  hw/misc/aspeed_peci.c | 136 ++
  hw/misc/meson.build   |   3 +-
  hw/misc/trace-events  |   4 +
  include/hw/arm/aspeed_soc.h   |   3 +
  include/hw/misc/aspeed_peci.h |  47 
  8 files changed, 229 insertions(+), 1 deletion(-)
  create mode 100644 hw/misc/aspeed_peci.c
  create mode 100644 include/hw/misc/aspeed_peci.h
diff --git a/hw/arm/aspeed_ast10x0.c b/hw/arm/aspeed_ast10x0.c
index 5df480a21f..56e8de3d89 100644
--- a/hw/arm/aspeed_ast10x0.c
+++ b/hw/arm/aspeed_ast10x0.c
@@ -47,6 +47,7 @@ static const hwaddr aspeed_soc_ast1030_memmap[] = {
  [ASPEED_DEV_UART13]= 0x7E790700,
  [ASPEED_DEV_WDT]   = 0x7E785000,
  [ASPEED_DEV_LPC]   = 0x7E789000,
+[ASPEED_DEV_PECI]  = 0x7E78B000,
  [ASPEED_DEV_I2C]   = 0x7E7B,
  };
  @@ -75,6 +76,7 @@ static const int aspeed_soc_ast1030_irqmap[] = {
  [ASPEED_DEV_TIMER8]= 23,
  [ASPEED_DEV_WDT]   = 24,
  [ASPEED_DEV_LPC]   = 35,
+[ASPEED_DEV_PECI]  = 38,
  [ASPEED_DEV_FMC]   = 39,
  [ASPEED_DEV_PWM]   = 44,
  [ASPEED_DEV_ADC]   = 46,
@@ -133,6 +135,8 @@ static void aspeed_soc_ast1030_init(Object *obj)
object_initialize_child(obj, "lpc", >lpc, TYPE_ASPEED_LPC);
  +object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
+
  object_initialize_child(obj, "sbc", >sbc, TYPE_ASPEED_SBC);
for (i = 0; i < sc->wdts_num; i++) {
@@ -206,6 +210,14 @@ static void aspeed_soc_ast1030_realize(DeviceState 
*dev_soc, Error **errp)
  sysbus_connect_irq(SYS_BUS_DEVICE(>i2c.busses[i]), 0, irq);
  }
  +/* PECI */
+if (!sysbus_realize(SYS_BUS_DEVICE(>peci), errp)) {
+return;
+}
+sysbus_mmio_map(SYS_BUS_DEVICE(>peci), 0, sc->memmap[ASPEED_DEV_PECI]);
+sysbus_connect_irq(SYS_BUS_DEVICE(>peci), 0,
+   aspeed_soc_get_irq(s, ASPEED_DEV_PECI));
+
  /* LPC */
  if (!sysbus_realize(SYS_BUS_DEVICE(>lpc), errp)) {
  return;
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index b0a4199b69..85178fabea 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -59,6 +59,7 @@ static const hwaddr aspeed_soc_ast2600_memmap[] = {
  [ASPEED_DEV_LPC]   = 0x1E789000,
  [ASPEED_DEV_IBT]   = 0x1E789140,
  [ASPEED_DEV_I2C]   = 0x1E78A000,
+[ASPEED_DEV_PECI]  = 0x1E78B000,
  [ASPEED_DEV_UART1] = 0x1E783000,
  [ASPEED_DEV_UART2] = 0x1E78D000,
  [ASPEED_DEV_UART3] = 0x1E78E000,
@@ -122,6 +123,7 @@ static const int aspeed_soc_ast2600_irqmap[] = {
  [ASPEED_DEV_LPC]   = 35,
  [ASPEED_DEV_IBT]   = 143,
  [ASPEED_DEV_I2C]   = 110,   /* 110 -> 125 */
+[ASPEED_DEV_PECI]  = 38,
  [ASPEED_DEV_ETH1]  = 2,
  [ASPEED_DEV_ETH2]  = 3,
  [ASPEED_DEV_HACE]  = 4,
@@ -180,6 +182,8 @@ static void aspeed_soc_ast2600_init(Object *obj)
  snprintf(typename, sizeof(typename), "aspeed.i2c-%s", socname);
  object_initialize_child(obj, "i2c", >i2c, typename);
  +object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
+
  snprintf(typename, sizeof(typename), "aspeed.fmc-%s", socname);
  object_initialize_child(obj, "fmc", >fmc, typename);
  @@ -388,6 +392,14 @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, 
Error **errp)

[PATCH 2/2] hw/ide/core.c: Implement ATA INITIALIZE_DEVICE_PARAMETERS command

2022-06-29 Thread Lev Kujawski

CHS-based disk utilities and operating systems may adjust the logical
geometry of a hard drive to cope with the expectations or limitations
of software using the ATA INITIALIZE_DEVICE_PARAMETERS command.

Prior to this patch, INITIALIZE_DEVICE_PARAMETERS was a nop that
always returned success, raising the possibility of data loss or
corruption if the CHS<->LBA translation redirected a write to the
wrong sector.

* hw/ide/core.c
ide_reset():
- Reset the logical CHS geometry of the hard disk when the power-on
  defaults feature is enabled.
cmd_specify():
- New function implementing INITIALIZE_DEVICE_PARAMETERS.
- Ignore calls for empty or ATAPI devices.
cmd_set_features():
- Implement the power-on defaults enable and disable features.
struct ide_cmd_table:
- Switch WIN_SPECIFY from cmd_nop() to cmd_specify().
ide_init_drive():
- Set new fields 'drive_heads' and 'drive_sectors' based upon the
  actual disk geometry.

* include/hw/ide/internal.h
struct IDEState:
- Store the actual drive CHS values within the new fields
  'drive_heads' and 'drive_sectors.'
- Track whether a soft IDE reset should also reset the logical CHS
  geometry of the hard disk within the new field 'reset_reverts'.
---
 hw/ide/core.c | 29 ++---
 include/hw/ide/internal.h |  3 +++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index b747191ebf..39afdc0006 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -1340,6 +1340,11 @@ static void ide_reset(IDEState *s)
 s->pio_aiocb = NULL;
 }
 
+if (s->reset_reverts) {
+s->reset_reverts = false;
+s->heads = s->drive_heads;
+s->sectors   = s->drive_sectors;
+}
 if (s->drive_kind == IDE_CFATA)
 s->mult_sectors = 0;
 else
@@ -1618,6 +1623,20 @@ static bool cmd_check_power_mode(IDEState *s, uint8_t 
cmd)
 return true;
 }
 
+/* INITIALIZE DEVICE PARAMETERS */
+static bool cmd_specify(IDEState *s, uint8_t cmd)
+{
+if (s->blk && s->drive_kind != IDE_CD) {
+s->heads = (s->select & (ATA_DEV_HS)) + 1;
+s->sectors = s->nsector;
+ide_set_irq(s->bus);
+} else {
+ide_abort_command(s);
+}
+
+return true;
+}
+
 static bool cmd_set_features(IDEState *s, uint8_t cmd)
 {
 uint16_t *identify_data;
@@ -1641,7 +1660,11 @@ static bool cmd_set_features(IDEState *s, uint8_t cmd)
 ide_flush_cache(s);
 return false;
 case 0xcc: /* reverting to power-on defaults enable */
+s->reset_reverts = true;
+return true;
 case 0x66: /* reverting to power-on defaults disable */
+s->reset_reverts = false;
+return true;
 case 0xaa: /* read look-ahead enable */
 case 0x55: /* read look-ahead disable */
 case 0x05: /* set advanced power management mode */
@@ -2051,7 +2074,7 @@ static const struct {
 [WIN_SEEK]= { cmd_seek, HD_CFA_OK | SET_DSC },
 [CFA_TRANSLATE_SECTOR]= { cmd_cfa_translate_sector, CFA_OK },
 [WIN_DIAGNOSE]= { cmd_exec_dev_diagnostic, ALL_OK },
-[WIN_SPECIFY] = { cmd_nop, HD_CFA_OK | SET_DSC },
+[WIN_SPECIFY] = { cmd_specify, HD_CFA_OK | SET_DSC },
 [WIN_STANDBYNOW2] = { cmd_nop, HD_CFA_OK },
 [WIN_IDLEIMMEDIATE2]  = { cmd_nop, HD_CFA_OK },
 [WIN_STANDBY2]= { cmd_nop, HD_CFA_OK },
@@ -2541,8 +2564,8 @@ int ide_init_drive(IDEState *s, BlockBackend *blk, 
IDEDriveKind kind,
 
 blk_get_geometry(blk, _sectors);
 s->cylinders = cylinders;
-s->heads = heads;
-s->sectors = secs;
+s->heads = s->drive_heads = heads;
+s->sectors = s->drive_sectors = secs;
 s->chs_trans = chs_trans;
 s->nb_sectors = nb_sectors;
 s->wwn = wwn;
diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h
index 97e7e59dc5..b17f36df95 100644
--- a/include/hw/ide/internal.h
+++ b/include/hw/ide/internal.h
@@ -375,6 +375,7 @@ struct IDEState {
 uint8_t unit;
 /* ide config */
 IDEDriveKind drive_kind;
+int drive_heads, drive_sectors;
 int cylinders, heads, sectors, chs_trans;
 int64_t nb_sectors;
 int mult_sectors;
@@ -401,6 +402,8 @@ struct IDEState {
 uint8_t select;
 uint8_t status;
 
+bool reset_reverts;
+
 /* set for lba48 access */
 uint8_t lba48;
 BlockBackend *blk;
-- 
2.34.1

[PATCH v3 09/11] target/ppc: implement addg6s

2022-06-29 Thread Víctor Colombo

From: Matheus Ferst 

Implements the following Power ISA v2.06 instruction:
addg6s: Add and Generate Sixes

Signed-off-by: Matheus Ferst 
Signed-off-by: Víctor Colombo 
Reviewed-by: Víctor Colombo 
---
 target/ppc/insn32.decode   |  4 +++
 target/ppc/translate/fixedpoint-impl.c.inc | 37 ++
 2 files changed, 41 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 400ca41bc6..36db427537 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -311,6 +311,10 @@ CNTTZDM 01 . . . 1000111011 -   @X
 PDEPD   01 . . . 0010011100 -   @X
 PEXTD   01 . . . 001000 -   @X
 
+## BCD Assist
+
+ADDG6S  01 . . . - 001001010 -  @X
+
 ### Float-Point Load Instructions
 
 LFS 11 . .  @D
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc 
b/target/ppc/translate/fixedpoint-impl.c.inc
index 1aab32be03..490e49cfc7 100644
--- a/target/ppc/translate/fixedpoint-impl.c.inc
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
@@ -492,3 +492,40 @@ static bool trans_PEXTD(DisasContext *ctx, arg_X *a)
 #endif
 return true;
 }
+
+static bool trans_ADDG6S(DisasContext *ctx, arg_X *a)
+{
+const uint64_t carry_bits = 0xULL;
+TCGv t0, t1, carry, zero = tcg_constant_tl(0);
+
+REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
+
+t0 = tcg_temp_new();
+t1 = tcg_const_tl(0);
+carry = tcg_const_tl(0);
+
+for (int i = 0; i < 16; i++) {
+tcg_gen_shri_tl(t0, cpu_gpr[a->ra], i * 4);
+tcg_gen_andi_tl(t0, t0, 0xf);
+tcg_gen_add_tl(t1, t1, t0);
+
+tcg_gen_shri_tl(t0, cpu_gpr[a->rb], i * 4);
+tcg_gen_andi_tl(t0, t0, 0xf);
+tcg_gen_add_tl(t1, t1, t0);
+
+tcg_gen_andi_tl(t1, t1, 0x10);
+tcg_gen_setcond_tl(TCG_COND_NE, t1, t1, zero);
+
+tcg_gen_shli_tl(t0, t1, i * 4);
+tcg_gen_or_tl(carry, carry, t0);
+}
+
+tcg_gen_xori_tl(carry, carry, (target_long)carry_bits);
+tcg_gen_muli_tl(cpu_gpr[a->rt], carry, 6);
+
+tcg_temp_free(t0);
+tcg_temp_free(t1);
+tcg_temp_free(carry);
+
+return true;
+}
-- 
2.25.1

[PATCH 1/2] qpci_device_enable: Allow for command bits hardwired to 0

2022-06-29 Thread Lev Kujawski

Devices like the PIIX3/4 IDE controller do not support certain modes
of operation, such as memory space accesses, and indicate this lack of
support by hardwiring the applicable bits to zero. The QEMU PCI device
testing framework is hereby extended to accommodate such devices.

* tests/qtest/libqos/pci.h: Add the command_disabled word to indicate
  bits hardwired to 0.
* tests/qtest/libqos/pci.c: Verify that hardwired bits are actually
  hardwired.
* tests/qtest/ide-test.c: Use the new command_disabled member to
  indicate that PCI_COMMAND_MEMORY is hardwired in the PIIX3/4
  IDE controller.

Signed-off-by: Lev Kujawski 
---
 tests/qtest/ide-test.c   |  1 +
 tests/qtest/libqos/pci.c | 13 +++--
 tests/qtest/libqos/pci.h |  1 +
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/qtest/ide-test.c b/tests/qtest/ide-test.c
index 5bcb75a7e5..85a3967063 100644
--- a/tests/qtest/ide-test.c
+++ b/tests/qtest/ide-test.c
@@ -173,6 +173,7 @@ static QPCIDevice *get_pci_device(QTestState *qts, QPCIBar 
*bmdma_bar,
 
 *ide_bar = qpci_legacy_iomap(dev, IDE_BASE);
 
+dev->command_disabled = PCI_COMMAND_MEMORY;
 qpci_device_enable(dev);
 
 return dev;
diff --git a/tests/qtest/libqos/pci.c b/tests/qtest/libqos/pci.c
index b23d72346b..4f3d28d8d9 100644
--- a/tests/qtest/libqos/pci.c
+++ b/tests/qtest/libqos/pci.c
@@ -220,18 +220,19 @@ int qpci_secondary_buses_init(QPCIBus *bus)
 
 void qpci_device_enable(QPCIDevice *dev)
 {
-uint16_t cmd;
+const uint16_t enable_bits =
+PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+uint16_t cmd, new_cmd;
 
 /* FIXME -- does this need to be a bus callout? */
 cmd = qpci_config_readw(dev, PCI_COMMAND);
-cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+cmd |= enable_bits;
 qpci_config_writew(dev, PCI_COMMAND, cmd);
 
 /* Verify the bits are now set. */
-cmd = qpci_config_readw(dev, PCI_COMMAND);
-g_assert_cmphex(cmd & PCI_COMMAND_IO, ==, PCI_COMMAND_IO);
-g_assert_cmphex(cmd & PCI_COMMAND_MEMORY, ==, PCI_COMMAND_MEMORY);
-g_assert_cmphex(cmd & PCI_COMMAND_MASTER, ==, PCI_COMMAND_MASTER);
+new_cmd = qpci_config_readw(dev, PCI_COMMAND);
+new_cmd &= enable_bits;
+g_assert_cmphex(new_cmd, ==, enable_bits & ~dev->command_disabled);
 }
 
 /**
diff --git a/tests/qtest/libqos/pci.h b/tests/qtest/libqos/pci.h
index 8389614523..eaedb98588 100644
--- a/tests/qtest/libqos/pci.h
+++ b/tests/qtest/libqos/pci.h
@@ -68,6 +68,7 @@ struct QPCIDevice
 bool msix_enabled;
 QPCIBar msix_table_bar, msix_pba_bar;
 uint64_t msix_table_off, msix_pba_off;
+uint16_t command_disabled;
 };
 
 struct QPCIAddress {
-- 
2.34.1

[PATCH v3 11/11] target/ppc: implement cdtbcd

2022-06-29 Thread Víctor Colombo

From: Matheus Ferst 

Implements the Convert Declets To Binary Coded Decimal instruction.
Since libdecnumber doesn't expose the methods for direct conversion
(decDigitsFromDPD, DPD2BCD, etc), a positive decimal32 with zero
exponent is used as an intermediate value to convert the declets.

Reviewed-by: Richard Henderson 
Signed-off-by: Matheus Ferst 
Signed-off-by: Víctor Colombo 
---
 target/ppc/dfp_helper.c| 26 ++
 target/ppc/helper.h|  1 +
 target/ppc/insn32.decode   |  1 +
 target/ppc/translate/fixedpoint-impl.c.inc |  7 ++
 4 files changed, 35 insertions(+)

diff --git a/target/ppc/dfp_helper.c b/target/ppc/dfp_helper.c
index db9e994c8c..5ba74b2124 100644
--- a/target/ppc/dfp_helper.c
+++ b/target/ppc/dfp_helper.c
@@ -1392,6 +1392,32 @@ DFP_HELPER_SHIFT(DSCLIQ, 128, 1)
 DFP_HELPER_SHIFT(DSCRI, 64, 0)
 DFP_HELPER_SHIFT(DSCRIQ, 128, 0)
 
+target_ulong helper_CDTBCD(target_ulong s)
+{
+uint64_t res = 0;
+uint32_t dec32, declets;
+uint8_t bcd[6];
+int i, w, sh;
+decNumber a;
+
+for (w = 1; w >= 0; w--) {
+res <<= 32;
+declets = extract64(s, 32 * w, 20);
+if (declets) {
+/* decimal32 with zero exponent and word "w" declets */
+dec32 = (0x225ULL << 20) | declets;
+decimal32ToNumber((decimal32 *), );
+decNumberGetBCD(, bcd);
+for (i = 0; i < a.digits; i++) {
+sh = 4 * (a.digits - 1 - i);
+res |= (uint64_t)bcd[i] << sh;
+}
+}
+}
+
+return res;
+}
+
 target_ulong helper_CBCDTD(target_ulong s)
 {
 uint64_t res = 0;
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 643bd69db8..b0fcebf8b5 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -54,6 +54,7 @@ DEF_HELPER_3(sraw, tl, env, tl, tl)
 DEF_HELPER_FLAGS_2(CFUGED, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(PDEPD, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(PEXTD, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_1(CDTBCD, TCG_CALL_NO_RWG_SE, tl, tl)
 DEF_HELPER_FLAGS_1(CBCDTD, TCG_CALL_NO_RWG_SE, tl, tl)
 #if defined(TARGET_PPC64)
 DEF_HELPER_FLAGS_2(cmpeqb, TCG_CALL_NO_RWG_SE, i32, tl, tl)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 5222d540b1..b673099eaa 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -317,6 +317,7 @@ PEXTD   01 . . . 001000 -   @X
 ## BCD Assist
 
 ADDG6S  01 . . . - 001001010 -  @X
+CDTBCD  01 . . - 0100011010 -   @X_sa
 CBCDTD  01 . . - 0100111010 -   @X_sa
 
 ### Float-Point Load Instructions
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc 
b/target/ppc/translate/fixedpoint-impl.c.inc
index 892c9d2568..cb0097bedb 100644
--- a/target/ppc/translate/fixedpoint-impl.c.inc
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
@@ -530,6 +530,13 @@ static bool trans_ADDG6S(DisasContext *ctx, arg_X *a)
 return true;
 }
 
+static bool trans_CDTBCD(DisasContext *ctx, arg_X_sa *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
+gen_helper_CDTBCD(cpu_gpr[a->ra], cpu_gpr[a->rs]);
+return true;
+}
+
 static bool trans_CBCDTD(DisasContext *ctx, arg_X_sa *a)
 {
 REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
-- 
2.25.1

[PATCH v3 07/11] tests/tcg/ppc64: Add mffsce test

2022-06-29 Thread Víctor Colombo

Add mffsce test to check both the return value and the new fpscr
stored in the cpu.

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 tests/tcg/ppc64/Makefile.target   |  1 +
 tests/tcg/ppc64le/Makefile.target |  1 +
 tests/tcg/ppc64le/mffsce.c| 37 +++
 3 files changed, 39 insertions(+)
 create mode 100644 tests/tcg/ppc64le/mffsce.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index babd209573..331fae628e 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -11,6 +11,7 @@ endif
 $(PPC64_TESTS): CFLAGS += -mpower8-vector
 
 PPC64_TESTS += mtfsf
+PPC64_TESTS += mffsce
 
 ifneq ($(CROSS_CC_HAS_POWER10),)
 PPC64_TESTS += byte_reverse sha512-vector
diff --git a/tests/tcg/ppc64le/Makefile.target 
b/tests/tcg/ppc64le/Makefile.target
index 5b0eb5e870..6ca3003f02 100644
--- a/tests/tcg/ppc64le/Makefile.target
+++ b/tests/tcg/ppc64le/Makefile.target
@@ -24,6 +24,7 @@ run-sha512-vector: QEMU_OPTS+=-cpu POWER10
 run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 
 PPC64LE_TESTS += mtfsf
+PPC64LE_TESTS += mffsce
 PPC64LE_TESTS += signal_save_restore_xer
 PPC64LE_TESTS += xxspltw
 
diff --git a/tests/tcg/ppc64le/mffsce.c b/tests/tcg/ppc64le/mffsce.c
new file mode 100644
index 00..20d882cb45
--- /dev/null
+++ b/tests/tcg/ppc64le/mffsce.c
@@ -0,0 +1,37 @@
+#include 
+#include 
+#include 
+
+#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB))
+#define MFFS(FRT) asm("mffs %0" : "=f" (FRT))
+#define MFFSCE(FRT) asm("mffsce %0" : "=f" (FRT))
+
+#define PPC_BIT_NR(nr) (63 - (nr))
+
+#define FP_VE  (1ull << PPC_BIT_NR(56))
+#define FP_UE  (1ull << PPC_BIT_NR(58))
+#define FP_ZE  (1ull << PPC_BIT_NR(59))
+#define FP_XE  (1ull << PPC_BIT_NR(60))
+#define FP_NI  (1ull << PPC_BIT_NR(61))
+#define FP_RN1 (1ull << PPC_BIT_NR(63))
+
+int main(void)
+{
+uint64_t frt, fpscr;
+uint64_t test_value = FP_VE | FP_UE | FP_ZE |
+  FP_XE | FP_NI | FP_RN1;
+MTFSF(0b, test_value); /* set test value to cpu fpscr */
+MFFSCE(frt);
+MFFS(fpscr); /* read the value that mffsce stored to cpu fpscr */
+
+/* the returned value should be as the cpu fpscr was before */
+assert((frt & 0xff) == test_value);
+
+/*
+ * the cpu fpscr last 3 bits should be unchanged
+ * and enable bits should be unset
+ */
+assert((fpscr & 0xff) == (test_value & 0x7));
+
+return 0;
+}
-- 
2.25.1

[PATCH v3 08/11] target/ppc: Add flag for ISA v2.06 BCDA instructions

2022-06-29 Thread Víctor Colombo

From: Matheus Ferst 

Adds an insns_flags2 for the BCD assist instructions introduced in
Power ISA 2.06. These instructions are not listed in the manuals for
e5500[1] and e6500[2], so the flag is only added for POWER7/8/9/10
models.

[1] https://www.nxp.com/files-static/32bit/doc/ref_manual/EREF_RM.pdf
[2] https://www.nxp.com/docs/en/reference-manual/E6500RM.pdf

Signed-off-by: Matheus Ferst 
Signed-off-by: Víctor Colombo 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h  |  5 -
 target/ppc/cpu_init.c | 10 ++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 6d78078f37..642bae311f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2277,6 +2277,8 @@ enum {
 PPC2_ISA310= 0x0010ULL,
 /*   lwsync instruction  */
 PPC2_MEM_LWSYNC= 0x0020ULL,
+/* ISA 2.06 BCD assist instructions  */
+PPC2_BCDA_ISA206   = 0x0040ULL,
 
 #define PPC_TCG_INSNS2 (PPC2_BOOKE206 | PPC2_VSX | PPC2_PRCNTL | PPC2_DBRX | \
 PPC2_ISA205 | PPC2_VSX207 | PPC2_PERM_ISA206 | \
@@ -2285,7 +2287,8 @@ enum {
 PPC2_BCTAR_ISA207 | PPC2_LSQ_ISA207 | \
 PPC2_ALTIVEC_207 | PPC2_ISA207S | PPC2_DFP | \
 PPC2_FP_CVT_S64 | PPC2_TM | PPC2_PM_ISA206 | \
-PPC2_ISA300 | PPC2_ISA310 | PPC2_MEM_LWSYNC)
+PPC2_ISA300 | PPC2_ISA310 | PPC2_MEM_LWSYNC | \
+PPC2_BCDA_ISA206)
 };
 
 /*/
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index c16cb8dbe7..bdfb1a5c6f 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5985,7 +5985,7 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 PPC2_PERM_ISA206 | PPC2_DIVE_ISA206 |
 PPC2_ATOMIC_ISA206 | PPC2_FP_CVT_ISA206 |
 PPC2_FP_TST_ISA206 | PPC2_FP_CVT_S64 |
-PPC2_PM_ISA206 | PPC2_MEM_LWSYNC;
+PPC2_PM_ISA206 | PPC2_MEM_LWSYNC | PPC2_BCDA_ISA206;
 pcc->msr_mask = (1ull << MSR_SF) |
 (1ull << MSR_VR) |
 (1ull << MSR_VSX) |
@@ -6159,7 +6159,8 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
 PPC2_FP_TST_ISA206 | PPC2_BCTAR_ISA207 |
 PPC2_LSQ_ISA207 | PPC2_ALTIVEC_207 |
 PPC2_ISA205 | PPC2_ISA207S | PPC2_FP_CVT_S64 |
-PPC2_TM | PPC2_PM_ISA206 | PPC2_MEM_LWSYNC;
+PPC2_TM | PPC2_PM_ISA206 | PPC2_MEM_LWSYNC |
+PPC2_BCDA_ISA206;
 pcc->msr_mask = (1ull << MSR_SF) |
 (1ull << MSR_HV) |
 (1ull << MSR_TM) |
@@ -6379,7 +6380,8 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
 PPC2_FP_TST_ISA206 | PPC2_BCTAR_ISA207 |
 PPC2_LSQ_ISA207 | PPC2_ALTIVEC_207 |
 PPC2_ISA205 | PPC2_ISA207S | PPC2_FP_CVT_S64 |
-PPC2_TM | PPC2_ISA300 | PPC2_PRCNTL | PPC2_MEM_LWSYNC;
+PPC2_TM | PPC2_ISA300 | PPC2_PRCNTL | PPC2_MEM_LWSYNC |
+PPC2_BCDA_ISA206;
 pcc->msr_mask = (1ull << MSR_SF) |
 (1ull << MSR_HV) |
 (1ull << MSR_TM) |
@@ -6597,7 +6599,7 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data)
 PPC2_LSQ_ISA207 | PPC2_ALTIVEC_207 |
 PPC2_ISA205 | PPC2_ISA207S | PPC2_FP_CVT_S64 |
 PPC2_TM | PPC2_ISA300 | PPC2_PRCNTL | PPC2_ISA310 |
-PPC2_MEM_LWSYNC;
+PPC2_MEM_LWSYNC | PPC2_BCDA_ISA206;
 pcc->msr_mask = (1ull << MSR_SF) |
 (1ull << MSR_HV) |
 (1ull << MSR_TM) |
-- 
2.25.1

[PATCH v3 04/11] target/ppc: Move mffsl to decodetree

2022-06-29 Thread Víctor Colombo

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 target/ppc/insn32.decode   |  1 +
 target/ppc/translate/fp-impl.c.inc | 38 +-
 target/ppc/translate/fp-ops.c.inc  |  2 --
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index b6a7a3a3ff..6d3b98a127 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -345,6 +345,7 @@ SETNBCR 01 . . - 00 -   
@X_bi
 MFFSCE  11 . 1 - 1001000111 -   @X_t
 MFFSCRN 11 . 10110 . 1001000111 -   @X_tb
 MFFSCRNI11 . 10111 ---.. 1001000111 -   @X_imm2
+MFFSL   11 . 11000 - 1001000111 -   @X_t
 
 ### Decimal Floating-Point Arithmetic Instructions
 
diff --git a/target/ppc/translate/fp-impl.c.inc 
b/target/ppc/translate/fp-impl.c.inc
index 64e26b9b42..4f4d57c611 100644
--- a/target/ppc/translate/fp-impl.c.inc
+++ b/target/ppc/translate/fp-impl.c.inc
@@ -633,28 +633,6 @@ static void gen_mffs(DisasContext *ctx)
 tcg_temp_free_i64(t0);
 }
 
-/* mffsl */
-static void gen_mffsl(DisasContext *ctx)
-{
-TCGv_i64 t0;
-
-if (unlikely(!(ctx->insns_flags2 & PPC2_ISA300))) {
-return gen_mffs(ctx);
-}
-
-if (unlikely(!ctx->fpu_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_FPU);
-return;
-}
-t0 = tcg_temp_new_i64();
-gen_reset_fpstatus();
-tcg_gen_extu_tl_i64(t0, cpu_fpscr);
-/* Mask everything except mode, status, and enables.  */
-tcg_gen_andi_i64(t0, t0, FP_DRN | FP_STATUS | FP_ENABLES | FP_RN);
-set_fpr(rD(ctx->opcode), t0);
-tcg_temp_free_i64(t0);
-}
-
 static TCGv_i64 place_from_fpscr(int rt, uint64_t mask)
 {
 TCGv_i64 fpscr = tcg_temp_new_i64();
@@ -739,6 +717,22 @@ static bool trans_MFFSCRNI(DisasContext *ctx, arg_X_imm2 
*a)
 return true;
 }
 
+static bool trans_MFFSL(DisasContext *ctx, arg_X_t *a)
+{
+TCGv_i64 fpscr;
+
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
+
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt,
+FP_DRN | FP_STATUS | FP_ENABLES | FP_NI | FP_RN);
+
+tcg_temp_free_i64(fpscr);
+
+return true;
+}
+
 /* mtfsb0 */
 static void gen_mtfsb0(DisasContext *ctx)
 {
diff --git a/target/ppc/translate/fp-ops.c.inc 
b/target/ppc/translate/fp-ops.c.inc
index a76943b8bf..f8c35124ae 100644
--- a/target/ppc/translate/fp-ops.c.inc
+++ b/target/ppc/translate/fp-ops.c.inc
@@ -75,8 +75,6 @@ GEN_HANDLER_E(fmrgew, 0x3F, 0x06, 0x1E, 0x0001, PPC_NONE, 
PPC2_VSX207),
 GEN_HANDLER_E(fmrgow, 0x3F, 0x06, 0x1A, 0x0001, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER(mcrfs, 0x3F, 0x00, 0x02, 0x0063F801, PPC_FLOAT),
 GEN_HANDLER_E_2(mffs, 0x3F, 0x07, 0x12, 0x00, 0x, PPC_FLOAT, PPC_NONE),
-GEN_HANDLER_E_2(mffsl, 0x3F, 0x07, 0x12, 0x18, 0x, PPC_FLOAT,
-PPC2_ISA300),
 GEN_HANDLER(mtfsb0, 0x3F, 0x06, 0x02, 0x001FF800, PPC_FLOAT),
 GEN_HANDLER(mtfsb1, 0x3F, 0x06, 0x01, 0x001FF800, PPC_FLOAT),
 GEN_HANDLER(mtfsf, 0x3F, 0x07, 0x16, 0x, PPC_FLOAT),
-- 
2.25.1

[PATCH v3 10/11] target/ppc: implement cbcdtd

2022-06-29 Thread Víctor Colombo

From: Matheus Ferst 

Implements the Convert Binary Coded Decimal To Declets instruction.
Since libdecnumber doesn't expose the methods for direct conversion
(decDigitsToDPD, BCD2DPD, etc.), the BCD values are converted to
decimal32 format, from which the declets are extracted.

Where the behavior is undefined, we try to match the result observed in
a POWER9 DD2.3.

Reviewed-by: Richard Henderson 
Signed-off-by: Matheus Ferst 
Signed-off-by: Víctor Colombo 
---
 target/ppc/dfp_helper.c| 39 ++
 target/ppc/helper.h|  1 +
 target/ppc/insn32.decode   |  4 +++
 target/ppc/translate/fixedpoint-impl.c.inc |  7 
 4 files changed, 51 insertions(+)

diff --git a/target/ppc/dfp_helper.c b/target/ppc/dfp_helper.c
index 0d01ac3de0..db9e994c8c 100644
--- a/target/ppc/dfp_helper.c
+++ b/target/ppc/dfp_helper.c
@@ -1391,3 +1391,42 @@ DFP_HELPER_SHIFT(DSCLI, 64, 1)
 DFP_HELPER_SHIFT(DSCLIQ, 128, 1)
 DFP_HELPER_SHIFT(DSCRI, 64, 0)
 DFP_HELPER_SHIFT(DSCRIQ, 128, 0)
+
+target_ulong helper_CBCDTD(target_ulong s)
+{
+uint64_t res = 0;
+uint32_t dec32;
+uint8_t bcd[6];
+int w, i, offs;
+decNumber a;
+decContext context;
+
+decContextDefault(, DEC_INIT_DECIMAL32);
+
+for (w = 1; w >= 0; w--) {
+res <<= 32;
+decNumberZero();
+/* Extract each BCD field of word "w" */
+for (i = 5; i >= 0; i--) {
+offs = 4 * (5 - i) + 32 * w;
+bcd[i] = extract64(s, offs, 4);
+if (bcd[i] > 9) {
+/*
+ * If the field value is greater than 9, the results are
+ * undefined. We could use a fixed value like 0 or 9, but
+ * an and with 9 seems to better match the hardware behavior.
+ */
+bcd[i] &= 9;
+}
+}
+
+/* Create a decNumber with the BCD values and convert to decimal32 */
+decNumberSetBCD(, bcd, 6);
+decimal32FromNumber((decimal32 *), , );
+
+/* Extract the two declets from the decimal32 value */
+res |= dec32 & 0xf;
+}
+
+return res;
+}
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index d627cfe6ed..643bd69db8 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -54,6 +54,7 @@ DEF_HELPER_3(sraw, tl, env, tl, tl)
 DEF_HELPER_FLAGS_2(CFUGED, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(PDEPD, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(PEXTD, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_1(CBCDTD, TCG_CALL_NO_RWG_SE, tl, tl)
 #if defined(TARGET_PPC64)
 DEF_HELPER_FLAGS_2(cmpeqb, TCG_CALL_NO_RWG_SE, i32, tl, tl)
 DEF_HELPER_FLAGS_1(popcntw, TCG_CALL_NO_RWG_SE, tl, tl)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 36db427537..5222d540b1 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -87,6 +87,9 @@
 _rc   rt ra rb rc:bool
 @X_rc   .. rt:5 ra:5 rb:5 .. rc:1   _rc
 
+_sa   rs ra
+@X_sa   .. rs:5 ra:5 . .. . _sa
+
 %x_frtp 22:4 !function=times_2
 %x_frap 17:4 !function=times_2
 %x_frbp 12:4 !function=times_2
@@ -314,6 +317,7 @@ PEXTD   01 . . . 001000 -   @X
 ## BCD Assist
 
 ADDG6S  01 . . . - 001001010 -  @X
+CBCDTD  01 . . - 0100111010 -   @X_sa
 
 ### Float-Point Load Instructions
 
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc 
b/target/ppc/translate/fixedpoint-impl.c.inc
index 490e49cfc7..892c9d2568 100644
--- a/target/ppc/translate/fixedpoint-impl.c.inc
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
@@ -529,3 +529,10 @@ static bool trans_ADDG6S(DisasContext *ctx, arg_X *a)
 
 return true;
 }
+
+static bool trans_CBCDTD(DisasContext *ctx, arg_X_sa *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
+gen_helper_CBCDTD(cpu_gpr[a->ra], cpu_gpr[a->rs]);
+return true;
+}
-- 
2.25.1

[PATCH v3 06/11] target/ppc: Implement mffscdrn[i] instructions

2022-06-29 Thread Víctor Colombo

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 target/ppc/insn32.decode   |  5 
 target/ppc/translate/fp-impl.c.inc | 41 ++
 2 files changed, 46 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 736a7c6f3f..400ca41bc6 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -133,6 +133,9 @@
 _imm2 rt imm
 @X_imm2 .. rt:5 . ... imm:2 .. ._imm2
 
+_imm3 rt imm
+@X_imm3 .. rt:5 . .. imm:3 .. . _imm3
+
 %x_xt   0:1 21:5
 _imm5 xt imm:uint8_t vrb
 @X_imm5 .. . imm:5 vrb:5 .. .   _imm5 
xt=%x_xt
@@ -348,7 +351,9 @@ SETNBCR 01 . . - 00 -   
@X_bi
 MFFS11 . 0 - 1001000111 .   @X_t_rc
 MFFSCE  11 . 1 - 1001000111 -   @X_t
 MFFSCRN 11 . 10110 . 1001000111 -   @X_tb
+MFFSCDRN11 . 10100 . 1001000111 -   @X_tb
 MFFSCRNI11 . 10111 ---.. 1001000111 -   @X_imm2
+MFFSCDRNI   11 . 10101 --... 1001000111 -   @X_imm3
 MFFSL   11 . 11000 - 1001000111 -   @X_t
 
 ### Decimal Floating-Point Arithmetic Instructions
diff --git a/target/ppc/translate/fp-impl.c.inc 
b/target/ppc/translate/fp-impl.c.inc
index d6231358f8..319513d001 100644
--- a/target/ppc/translate/fp-impl.c.inc
+++ b/target/ppc/translate/fp-impl.c.inc
@@ -696,6 +696,27 @@ static bool trans_MFFSCRN(DisasContext *ctx, arg_X_tb *a)
 return true;
 }
 
+static bool trans_MFFSCDRN(DisasContext *ctx, arg_X_tb *a)
+{
+TCGv_i64 t1, fpscr;
+
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
+
+t1 = tcg_temp_new_i64();
+get_fpr(t1, a->rb);
+tcg_gen_andi_i64(t1, t1, FP_DRN);
+
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, FP_DRN | FP_ENABLES | FP_NI | FP_RN);
+store_fpscr_masked(fpscr, FP_DRN, t1, 0x0100);
+
+tcg_temp_free_i64(t1);
+tcg_temp_free_i64(fpscr);
+
+return true;
+}
+
 static bool trans_MFFSCRNI(DisasContext *ctx, arg_X_imm2 *a)
 {
 TCGv_i64 t1, fpscr;
@@ -716,6 +737,26 @@ static bool trans_MFFSCRNI(DisasContext *ctx, arg_X_imm2 
*a)
 return true;
 }
 
+static bool trans_MFFSCDRNI(DisasContext *ctx, arg_X_imm3 *a)
+{
+TCGv_i64 t1, fpscr;
+
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
+
+t1 = tcg_temp_new_i64();
+tcg_gen_movi_i64(t1, (uint64_t)a->imm << FPSCR_DRN0);
+
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, FP_DRN | FP_ENABLES | FP_NI | FP_RN);
+store_fpscr_masked(fpscr, FP_DRN, t1, 0x0100);
+
+tcg_temp_free_i64(t1);
+tcg_temp_free_i64(fpscr);
+
+return true;
+}
+
 static bool trans_MFFSL(DisasContext *ctx, arg_X_t *a)
 {
 TCGv_i64 fpscr;
-- 
2.25.1

[PATCH v3 02/11] target/ppc: Move mffscrn[i] to decodetree

2022-06-29 Thread Víctor Colombo

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 target/ppc/insn32.decode   |  8 +++
 target/ppc/internal.h  |  3 --
 target/ppc/translate/fp-impl.c.inc | 83 +++---
 target/ppc/translate/fp-ops.c.inc  |  4 --
 4 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 8b723b5433..3b61c3a073 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -124,6 +124,9 @@
 _bfl  bf l:bool ra rb
 @X_bfl  .. bf:3 . l:1 ra:5 rb:5 .. ._bfl
 
+_imm2 rt imm
+@X_imm2 .. rt:5 . ... imm:2 .. ._imm2
+
 %x_xt   0:1 21:5
 _imm5 xt imm:uint8_t vrb
 @X_imm5 .. . imm:5 vrb:5 .. .   _imm5 
xt=%x_xt
@@ -334,6 +337,11 @@ SETBCR  01 . . - 011010 -   
@X_bi
 SETNBC  01 . . - 011100 -   @X_bi
 SETNBCR 01 . . - 00 -   @X_bi
 
+### Move To/From FPSCR
+
+MFFSCRN 11 . 10110 . 1001000111 -   @X_tb
+MFFSCRNI11 . 10111 ---.. 1001000111 -   @X_imm2
+
 ### Decimal Floating-Point Arithmetic Instructions
 
 DADD111011 . . . 10 .   @X_rc
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 2add128cd1..467f3046c8 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -159,9 +159,6 @@ EXTRACT_HELPER(FPL, 25, 1);
 EXTRACT_HELPER(FPFLM, 17, 8);
 EXTRACT_HELPER(FPW, 16, 1);
 
-/* mffscrni */
-EXTRACT_HELPER(RM, 11, 2);
-
 /* addpcis */
 EXTRACT_HELPER_SPLIT_3(DX, 10, 6, 6, 5, 16, 1, 1, 0, 0)
 #if defined(TARGET_PPC64)
diff --git a/target/ppc/translate/fp-impl.c.inc 
b/target/ppc/translate/fp-impl.c.inc
index f9b58b844e..bcb7ec2689 100644
--- a/target/ppc/translate/fp-impl.c.inc
+++ b/target/ppc/translate/fp-impl.c.inc
@@ -685,71 +685,72 @@ static void gen_mffsce(DisasContext *ctx)
 tcg_temp_free_i64(t0);
 }
 
-static void gen_helper_mffscrn(DisasContext *ctx, TCGv_i64 t1)
+static TCGv_i64 place_from_fpscr(int rt, uint64_t mask)
 {
-TCGv_i64 t0 = tcg_temp_new_i64();
-TCGv_i32 mask = tcg_const_i32(0x0001);
+TCGv_i64 fpscr = tcg_temp_new_i64();
+TCGv_i64 fpscr_masked = tcg_temp_new_i64();
 
-gen_reset_fpstatus();
-tcg_gen_extu_tl_i64(t0, cpu_fpscr);
-tcg_gen_andi_i64(t0, t0, FP_DRN | FP_ENABLES | FP_RN);
-set_fpr(rD(ctx->opcode), t0);
+tcg_gen_extu_tl_i64(fpscr, cpu_fpscr);
+tcg_gen_andi_i64(fpscr_masked, fpscr, mask);
+set_fpr(rt, fpscr_masked);
 
-/* Mask FPSCR value to clear RN.  */
-tcg_gen_andi_i64(t0, t0, ~FP_RN);
+tcg_temp_free_i64(fpscr_masked);
 
-/* Merge RN into FPSCR value.  */
-tcg_gen_or_i64(t0, t0, t1);
+return fpscr;
+}
 
-gen_helper_store_fpscr(cpu_env, t0, mask);
+static void store_fpscr_masked(TCGv_i64 fpscr, uint64_t clear_mask,
+   TCGv_i64 set_mask, uint32_t store_mask)
+{
+TCGv_i64 fpscr_masked = tcg_temp_new_i64();
+TCGv_i32 st_mask = tcg_constant_i32(store_mask);
 
-tcg_temp_free_i32(mask);
-tcg_temp_free_i64(t0);
+tcg_gen_andi_i64(fpscr_masked, fpscr, ~clear_mask);
+tcg_gen_or_i64(fpscr_masked, fpscr_masked, set_mask);
+gen_helper_store_fpscr(cpu_env, fpscr_masked, st_mask);
+
+tcg_temp_free_i64(fpscr_masked);
 }
 
-/* mffscrn */
-static void gen_mffscrn(DisasContext *ctx)
+static bool trans_MFFSCRN(DisasContext *ctx, arg_X_tb *a)
 {
-TCGv_i64 t1;
+TCGv_i64 t1, fpscr;
 
-if (unlikely(!(ctx->insns_flags2 & PPC2_ISA300))) {
-return gen_mffs(ctx);
-}
-
-if (unlikely(!ctx->fpu_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_FPU);
-return;
-}
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
 
 t1 = tcg_temp_new_i64();
-get_fpr(t1, rB(ctx->opcode));
-/* Mask FRB to get just RN.  */
+get_fpr(t1, a->rb);
 tcg_gen_andi_i64(t1, t1, FP_RN);
 
-gen_helper_mffscrn(ctx, t1);
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, FP_DRN | FP_ENABLES | FP_NI | FP_RN);
+store_fpscr_masked(fpscr, FP_RN, t1, 0x0001);
 
 tcg_temp_free_i64(t1);
+tcg_temp_free_i64(fpscr);
+
+return true;
 }
 
-/* mffscrni */
-static void gen_mffscrni(DisasContext *ctx)
+static bool trans_MFFSCRNI(DisasContext *ctx, arg_X_imm2 *a)
 {
-TCGv_i64 t1;
-
-if (unlikely(!(ctx->insns_flags2 & PPC2_ISA300))) {
-return gen_mffs(ctx);
-}
+TCGv_i64 t1, fpscr;
 
-if (unlikely(!ctx->fpu_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_FPU);
-return;
-}
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
 
-t1 = tcg_const_i64((uint64_t)RM(ctx->opcode));
+t1 = tcg_temp_new_i64();
+tcg_gen_movi_i64(t1, a->imm);
 
-gen_helper_mffscrn(ctx, t1);
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, FP_DRN | FP_ENABLES | FP_NI | FP_RN);

[PATCH v3 05/11] target/ppc: Move mffs[.] to decodetree

2022-06-29 Thread Víctor Colombo

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 target/ppc/insn32.decode   |  4 
 target/ppc/translate/fp-impl.c.inc | 35 +++---
 target/ppc/translate/fp-ops.c.inc  |  1 -
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6d3b98a127..736a7c6f3f 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -100,6 +100,9 @@
 _tb   rt rb
 @X_tb   .. rt:5 . rb:5 .. . _tb
 
+_t_rc rt rc:bool
+@X_t_rc .. rt:5 . . .. rc:1 _t_rc
+
 _tb_rcrt rb rc:bool
 @X_tb_rc.. rt:5 . rb:5 .. rc:1  _tb_rc
 
@@ -342,6 +345,7 @@ SETNBCR 01 . . - 00 -   
@X_bi
 
 ### Move To/From FPSCR
 
+MFFS11 . 0 - 1001000111 .   @X_t_rc
 MFFSCE  11 . 1 - 1001000111 -   @X_t
 MFFSCRN 11 . 10110 . 1001000111 -   @X_tb
 MFFSCRNI11 . 10111 ---.. 1001000111 -   @X_imm2
diff --git a/target/ppc/translate/fp-impl.c.inc 
b/target/ppc/translate/fp-impl.c.inc
index 4f4d57c611..d6231358f8 100644
--- a/target/ppc/translate/fp-impl.c.inc
+++ b/target/ppc/translate/fp-impl.c.inc
@@ -615,24 +615,6 @@ static void gen_mcrfs(DisasContext *ctx)
 tcg_temp_free_i64(tnew_fpscr);
 }
 
-/* mffs */
-static void gen_mffs(DisasContext *ctx)
-{
-TCGv_i64 t0;
-if (unlikely(!ctx->fpu_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_FPU);
-return;
-}
-t0 = tcg_temp_new_i64();
-gen_reset_fpstatus();
-tcg_gen_extu_tl_i64(t0, cpu_fpscr);
-set_fpr(rD(ctx->opcode), t0);
-if (unlikely(Rc(ctx->opcode))) {
-gen_set_cr1_from_fpscr(ctx);
-}
-tcg_temp_free_i64(t0);
-}
-
 static TCGv_i64 place_from_fpscr(int rt, uint64_t mask)
 {
 TCGv_i64 fpscr = tcg_temp_new_i64();
@@ -660,6 +642,23 @@ static void store_fpscr_masked(TCGv_i64 fpscr, uint64_t 
clear_mask,
 tcg_temp_free_i64(fpscr_masked);
 }
 
+static bool trans_MFFS(DisasContext *ctx, arg_X_t_rc *a)
+{
+TCGv_i64 fpscr;
+
+REQUIRE_FPU(ctx);
+
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, UINT64_MAX);
+if (a->rc) {
+gen_set_cr1_from_fpscr(ctx);
+}
+
+tcg_temp_free_i64(fpscr);
+
+return true;
+}
+
 static bool trans_MFFSCE(DisasContext *ctx, arg_X_t *a)
 {
 TCGv_i64 fpscr;
diff --git a/target/ppc/translate/fp-ops.c.inc 
b/target/ppc/translate/fp-ops.c.inc
index f8c35124ae..1b65f5ab73 100644
--- a/target/ppc/translate/fp-ops.c.inc
+++ b/target/ppc/translate/fp-ops.c.inc
@@ -74,7 +74,6 @@ GEN_HANDLER_E(fcpsgn, 0x3F, 0x08, 0x00, 0x, PPC_NONE, 
PPC2_ISA205),
 GEN_HANDLER_E(fmrgew, 0x3F, 0x06, 0x1E, 0x0001, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER_E(fmrgow, 0x3F, 0x06, 0x1A, 0x0001, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER(mcrfs, 0x3F, 0x00, 0x02, 0x0063F801, PPC_FLOAT),
-GEN_HANDLER_E_2(mffs, 0x3F, 0x07, 0x12, 0x00, 0x, PPC_FLOAT, PPC_NONE),
 GEN_HANDLER(mtfsb0, 0x3F, 0x06, 0x02, 0x001FF800, PPC_FLOAT),
 GEN_HANDLER(mtfsb1, 0x3F, 0x06, 0x01, 0x001FF800, PPC_FLOAT),
 GEN_HANDLER(mtfsf, 0x3F, 0x07, 0x16, 0x, PPC_FLOAT),
-- 
2.25.1

[PATCH v3 03/11] target/ppc: Move mffsce to decodetree

2022-06-29 Thread Víctor Colombo

Signed-off-by: Víctor Colombo 
Reviewed-by: Matheus Ferst 
---
 target/ppc/insn32.decode   |  4 +++
 target/ppc/translate/fp-impl.c.inc | 46 +++---
 target/ppc/translate/fp-ops.c.inc  |  2 --
 3 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 3b61c3a073..b6a7a3a3ff 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -94,6 +94,9 @@
 
 @X_tp_a_bp_rc   .. 0 ra:5 0 .. rc:1 _rc 
rt=%x_frtp rb=%x_frbp
 
+_trt
+@X_t.. rt:5 . . .. ._t
+
 _tb   rt rb
 @X_tb   .. rt:5 . rb:5 .. . _tb
 
@@ -339,6 +342,7 @@ SETNBCR 01 . . - 00 -   
@X_bi
 
 ### Move To/From FPSCR
 
+MFFSCE  11 . 1 - 1001000111 -   @X_t
 MFFSCRN 11 . 10110 . 1001000111 -   @X_tb
 MFFSCRNI11 . 10111 ---.. 1001000111 -   @X_imm2
 
diff --git a/target/ppc/translate/fp-impl.c.inc 
b/target/ppc/translate/fp-impl.c.inc
index bcb7ec2689..64e26b9b42 100644
--- a/target/ppc/translate/fp-impl.c.inc
+++ b/target/ppc/translate/fp-impl.c.inc
@@ -655,36 +655,6 @@ static void gen_mffsl(DisasContext *ctx)
 tcg_temp_free_i64(t0);
 }
 
-/* mffsce */
-static void gen_mffsce(DisasContext *ctx)
-{
-TCGv_i64 t0;
-TCGv_i32 mask;
-
-if (unlikely(!(ctx->insns_flags2 & PPC2_ISA300))) {
-return gen_mffs(ctx);
-}
-
-if (unlikely(!ctx->fpu_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_FPU);
-return;
-}
-
-t0 = tcg_temp_new_i64();
-
-gen_reset_fpstatus();
-tcg_gen_extu_tl_i64(t0, cpu_fpscr);
-set_fpr(rD(ctx->opcode), t0);
-
-/* Clear exception enable bits in the FPSCR.  */
-tcg_gen_andi_i64(t0, t0, ~FP_ENABLES);
-mask = tcg_const_i32(0x0003);
-gen_helper_store_fpscr(cpu_env, t0, mask);
-
-tcg_temp_free_i32(mask);
-tcg_temp_free_i64(t0);
-}
-
 static TCGv_i64 place_from_fpscr(int rt, uint64_t mask)
 {
 TCGv_i64 fpscr = tcg_temp_new_i64();
@@ -712,6 +682,22 @@ static void store_fpscr_masked(TCGv_i64 fpscr, uint64_t 
clear_mask,
 tcg_temp_free_i64(fpscr_masked);
 }
 
+static bool trans_MFFSCE(DisasContext *ctx, arg_X_t *a)
+{
+TCGv_i64 fpscr;
+
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_FPU(ctx);
+
+gen_reset_fpstatus();
+fpscr = place_from_fpscr(a->rt, UINT64_MAX);
+store_fpscr_masked(fpscr, FP_ENABLES, tcg_constant_i64(0), 0x0003);
+
+tcg_temp_free_i64(fpscr);
+
+return true;
+}
+
 static bool trans_MFFSCRN(DisasContext *ctx, arg_X_tb *a)
 {
 TCGv_i64 t1, fpscr;
diff --git a/target/ppc/translate/fp-ops.c.inc 
b/target/ppc/translate/fp-ops.c.inc
index a27a1be9f5..a76943b8bf 100644
--- a/target/ppc/translate/fp-ops.c.inc
+++ b/target/ppc/translate/fp-ops.c.inc
@@ -75,8 +75,6 @@ GEN_HANDLER_E(fmrgew, 0x3F, 0x06, 0x1E, 0x0001, PPC_NONE, 
PPC2_VSX207),
 GEN_HANDLER_E(fmrgow, 0x3F, 0x06, 0x1A, 0x0001, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER(mcrfs, 0x3F, 0x00, 0x02, 0x0063F801, PPC_FLOAT),
 GEN_HANDLER_E_2(mffs, 0x3F, 0x07, 0x12, 0x00, 0x, PPC_FLOAT, PPC_NONE),
-GEN_HANDLER_E_2(mffsce, 0x3F, 0x07, 0x12, 0x01, 0x, PPC_FLOAT,
-PPC2_ISA300),
 GEN_HANDLER_E_2(mffsl, 0x3F, 0x07, 0x12, 0x18, 0x, PPC_FLOAT,
 PPC2_ISA300),
 GEN_HANDLER(mtfsb0, 0x3F, 0x06, 0x02, 0x001FF800, PPC_FLOAT),
-- 
2.25.1

[PATCH v3 01/11] target/ppc: Fix insn32.decode style issues

2022-06-29 Thread Víctor Colombo

Some lines in insn32.decode have inconsistent alignment when compared
to others.
Fix this by changing the alignment of some lines, making it more
consistent throughout the file.

Signed-off-by: Víctor Colombo 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn32.decode | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6ea48d5163..8b723b5433 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -21,11 +21,11 @@
 @A  .. frt:5 fra:5 frb:5 frc:5 . rc:1   
 
   rt ra si:int64_t
-@D  .. rt:5 ra:5 si:s16 
+@D  .. rt:5 ra:5 si:s16 
 
 _bf   bf l:bool ra imm
-@D_bfs  .. bf:3 - l:1 ra:5 imm:s16  _bf
-@D_bfu  .. bf:3 - l:1 ra:5 imm:16   _bf
+@D_bfs  .. bf:3 . l:1 ra:5 imm:s16  _bf
+@D_bfu  .. bf:3 . l:1 ra:5 imm:16   _bf
 
 %dq_si  4:s12  !function=times_16
 %dq_rtp 22:4   !function=times_2
@@ -38,7 +38,7 @@
 @DQ_TSXP.. . ra:5    si=%dq_si 
rt=%rt_tsxp
 
 %ds_si  2:s14  !function=times_4
-@DS .. rt:5 ra:5 .. ..   si=%ds_si
+@DS .. rt:5 ra:5 .. ..   si=%ds_si
 
 %ds_rtp 22:4   !function=times_2
 @DS_rtp .. 0 ra:5 .. ..  rt=%ds_rtp 
si=%ds_si
@@ -49,10 +49,10 @@
 
  rt d
 %dx_d   6:s10 16:5 0:1
-@DX .. rt:5  . .. . .d=%dx_d
+@DX .. rt:5  . .. . .d=%dx_d
 
  vrt vra vrb rc
-@VA .. vrt:5 vra:5 vrb:5 rc:5 ..
+@VA .. vrt:5 vra:5 vrb:5 rc:5 ..
 
  vrt vra vrb rc:bool
 @VC .. vrt:5 vra:5 vrb:5 rc:1 ..
@@ -61,7 +61,7 @@
 @VN .. vrt:5 vra:5 vrb:5 .. sh:3 .. 
 
  vrt vra vrb
-@VX .. vrt:5 vra:5 vrb:5 .. .   
+@VX .. vrt:5 vra:5 vrb:5 .. .   
 
 _bf  bf vra vrb
 @VX_bf  .. bf:3 .. vra:5 vrb:5 ...  _bf
@@ -76,13 +76,13 @@
 @VX_tb_rc   .. vrt:5 . vrb:5 rc:1 .._tb_rc
 
 _uim4vrt uim vrb
-@VX_uim4.. vrt:5 . uim:4 vrb:5 ...  _uim4
+@VX_uim4.. vrt:5 . uim:4 vrb:5 ...  _uim4
 
 _tb  vrt vrb
-@VX_tb  .. vrt:5 . vrb:5 ..._tb
+@VX_tb  .. vrt:5 . vrb:5 ..._tb
 
   rt ra rb
-@X  .. rt:5 ra:5 rb:5 .. .  
+@X  .. rt:5 ra:5 rb:5 .. .  
 
 _rc   rt ra rb rc:bool
 @X_rc   .. rt:5 ra:5 rb:5 .. rc:1   _rc
@@ -107,7 +107,7 @@
 @X_t_bp_rc  .. rt:5 . 0 .. rc:1 _tb_rc 
rb=%x_frbp
 
 _bi   rt bi
-@X_bi   .. rt:5 bi:5 - .. - _bi
+@X_bi   .. rt:5 bi:5 . .. . _bi
 
 _bf   bf ra rb
 @X_bf   .. bf:3 .. ra:5 rb:5 .. .   _bf
@@ -122,7 +122,7 @@
 @X_bf_uim_bp.. bf:3 . uim:6 0 .. .  _bf_uim 
rb=%x_frbp
 
 _bfl  bf l:bool ra rb
-@X_bfl  .. bf:3 - l:1 ra:5 rb:5 ..- _bfl
+@X_bfl  .. bf:3 . l:1 ra:5 rb:5 .. ._bfl
 
 %x_xt   0:1 21:5
 _imm5 xt imm:uint8_t vrb
-- 
2.25.1

[PATCH v3 00/11] target/ppc: BCDA and mffscdrn implementations

2022-06-29 Thread Víctor Colombo

Hello everyone,

Set of patches containing implementations for some instructions that
were missing before. Also, moves some related instructions to
decodetree. Add mffsce test.

v3:
- Rebase on master
- Add r-b

v2:
- Added R-b on patches 1, 8, 10, and 11. Dropped the R-b on some
  of the patches as there were big changes on them.
- Fixed addg6s issues
- Separated do_mffsc in two different, more specialized functions
- Changed mffs* patches order to make it more readable, as suggested
  by Richard
- Added a new patch with a test for the mffsce instruction

Matheus Ferst (4):
  target/ppc: Add flag for ISA v2.06 BCDA instructions
  target/ppc: implement addg6s
  target/ppc: implement cbcdtd
  target/ppc: implement cdtbcd

Víctor Colombo (7):
  target/ppc: Fix insn32.decode style issues
  target/ppc: Move mffscrn[i] to decodetree
  target/ppc: Move mffsce to decodetree
  target/ppc: Move mffsl to decodetree
  target/ppc: Move mffs[.] to decodetree
  target/ppc: Implement mffscdrn[i] instructions
  tests/tcg/ppc64: Add mffsce test

 target/ppc/cpu.h   |   5 +-
 target/ppc/cpu_init.c  |  10 +-
 target/ppc/dfp_helper.c|  65 +++
 target/ppc/helper.h|   2 +
 target/ppc/insn32.decode   |  55 --
 target/ppc/internal.h  |   3 -
 target/ppc/translate/fixedpoint-impl.c.inc |  51 ++
 target/ppc/translate/fp-impl.c.inc | 203 -
 target/ppc/translate/fp-ops.c.inc  |   9 -
 tests/tcg/ppc64/Makefile.target|   1 +
 tests/tcg/ppc64le/Makefile.target  |   1 +
 tests/tcg/ppc64le/mffsce.c |  37 
 12 files changed, 322 insertions(+), 120 deletions(-)
 create mode 100644 tests/tcg/ppc64le/mffsce.c

-- 
2.25.1

Re: Slowness with multi-thread TCG?

2022-06-29 Thread Matheus K. Ferst


On 29/06/2022 12:36, Frederic Barrat wrote:
[E-MAIL EXTERNO] Não clique em links ou abra anexos, a menos que você 
possa confirmar o remetente e saber que o conteúdo é seguro. Em caso de 
e-mail suspeito entre imediatamente em contato com o DTI.


On 29/06/2022 00:17, Alex Bennée wrote:

If you run the sync-profiler (via the HMP "sync-profile on") you can
then get a breakdown of which mutexes are being held and for how long
("info sync-profile").



Alex, a huge thank you!

For the record, the "info sync-profile" showed:
Type   Object  Call site Wait Time (s)
    Count  Average (us)
-- 


BQL mutex  0x55eb89425540  accel/tcg/cpu-exec.c:744   96.31578
     73589937  1.31
BQL mutex  0x55eb89425540  target/ppc/helper_regs.c:207    0.00150
     1178  1.27


And it points to a lock in the interrupt delivery path, in
cpu_handle_interrupt().

I now understand the root cause. The interrupt signal for the
decrementer interrupt remains set because the interrupt is not being
delivered, per the config. I'm not quite sure what the proper fix is yet
(there seem to be several implementations of the decrementer on ppc),
but at least I understand why we are so slow.



To summarize what we talked about elsewhere:
1 - The threads that are not decompressing the kernel have a pending 
PPC_INTERRUPT_DECR, and cs->interrupt_request is CPU_INTERRUPT_HARD;
2 - cpu_handle_interrupt calls ppc_cpu_exec_interrupt, that calls 
ppc_hw_interrupt to handle the interrupt;
3 - ppc_cpu_exec_interrupt decides that the interrupt cannot be 
delivered immediately, so the corresponding bit in 
env->pending_interrupts is not reset;
4 - ppc_cpu_exec_interrupt does not change cs->interrupt_request because 
pending_interrupts != 0, so cpu_handle_interrupt will be called again.


This loop will acquire and release qemu_mutex_lock_iothread, slowing 
down other threads that need this lock.



With a quick hack, I could verify that by moving that signal out of the
way, the decompression time of the kernel is now peanuts, no matter the
number of cpus. Even with one cpu, the 15 seconds measured before was
already a huge waste, so it was not really a multiple-cpus problem.
Multiple cpus were just highlighting it.

Thanks again!

   Fred

--
Matheus K. Ferst
Instituto de Pesquisas ELDORADO 
Analista de Software
Aviso Legal - Disclaimer

Re: [PATCH v2 09/13] hw/i2c/pmbus: Add read-only IC_DEVICE_ID support

2022-06-29 Thread Peter Delevoryas



> On Jun 29, 2022, at 1:40 AM, Cédric Le Goater  wrote:
> 
> On 6/29/22 05:36, Peter Delevoryas wrote:
>> Signed-off-by: Peter Delevoryas 
> 
> This is also adding a "Renesas ISL69259 Digital Multiphase Voltage
> Regulator" device. There should be 2 patches.

Ah yes, definitely, I’ll fix this. One patch to add IC_DEVICE_ID
to pmbus, one to add ISL69259 to isl_pmbus_vr.c

> 
> Thanks,
> 
> C.
> 
> 
> 
>> ---
>>  hw/i2c/pmbus_device.c|  5 +
>>  hw/sensor/isl_pmbus_vr.c | 31 +++
>>  include/hw/i2c/pmbus_device.h|  1 +
>>  include/hw/sensor/isl_pmbus_vr.h |  1 +
>>  4 files changed, 38 insertions(+)
>> diff --git a/hw/i2c/pmbus_device.c b/hw/i2c/pmbus_device.c
>> index efddc36fd9..82131fff85 100644
>> --- a/hw/i2c/pmbus_device.c
>> +++ b/hw/i2c/pmbus_device.c
>> @@ -984,6 +984,11 @@ static uint8_t pmbus_receive_byte(SMBusDevice *smd)
>>  }
>>  break;
>>  +case PMBUS_IC_DEVICE_ID:
>> +pmbus_send(pmdev, pmdev->pages[index].ic_device_id,
>> +   sizeof(pmdev->pages[index].ic_device_id));
>> +break;
>> +
>>  case PMBUS_CLEAR_FAULTS:  /* Send Byte */
>>  case PMBUS_PAGE_PLUS_WRITE:   /* Block Write-only */
>>  case PMBUS_STORE_DEFAULT_ALL: /* Send Byte */
>> diff --git a/hw/sensor/isl_pmbus_vr.c b/hw/sensor/isl_pmbus_vr.c
>> index e11e028884..b12c46ab6d 100644
>> --- a/hw/sensor/isl_pmbus_vr.c
>> +++ b/hw/sensor/isl_pmbus_vr.c
>> @@ -218,6 +218,28 @@ static void isl_pmbus_vr_class_init(ObjectClass *klass, 
>> void *data,
>>  k->device_num_pages = pages;
>>  }
>>  +static void isl69259_init(Object *obj)
>> +{
>> +static const uint8_t ic_device_id[] = {0x04, 0x00, 0x81, 0xD2, 0x49};
>> +PMBusDevice *pmdev = PMBUS_DEVICE(obj);
>> +int i;
>> +
>> +raa22xx_init(obj);
>> +for (i = 0; i < pmdev->num_pages; i++) {
>> +memcpy(pmdev->pages[i].ic_device_id, ic_device_id,
>> +   sizeof(ic_device_id));
>> +}
>> +}
>> +
>> +static void isl69259_class_init(ObjectClass *klass, void *data)
>> +{
>> +ResettableClass *rc = RESETTABLE_CLASS(klass);
>> +DeviceClass *dc = DEVICE_CLASS(klass);
>> +dc->desc = "Renesas ISL69259 Digital Multiphase Voltage Regulator";
>> +rc->phases.exit = isl_pmbus_vr_exit_reset;
>> +isl_pmbus_vr_class_init(klass, data, 2);
>> +}
>> +
>>  static void isl69260_class_init(ObjectClass *klass, void *data)
>>  {
>>  ResettableClass *rc = RESETTABLE_CLASS(klass);
>> @@ -245,6 +267,14 @@ static void raa229004_class_init(ObjectClass *klass, 
>> void *data)
>>  isl_pmbus_vr_class_init(klass, data, 2);
>>  }
>>  +static const TypeInfo isl69259_info = {
>> +.name = TYPE_ISL69259,
>> +.parent = TYPE_PMBUS_DEVICE,
>> +.instance_size = sizeof(ISLState),
>> +.instance_init = isl69259_init,
>> +.class_init = isl69259_class_init,
>> +};
>> +
>>  static const TypeInfo isl69260_info = {
>>  .name = TYPE_ISL69260,
>>  .parent = TYPE_PMBUS_DEVICE,
>> @@ -271,6 +301,7 @@ static const TypeInfo raa228000_info = {
>>static void isl_pmbus_vr_register_types(void)
>>  {
>> +type_register_static(_info);
>>  type_register_static(_info);
>>  type_register_static(_info);
>>  type_register_static(_info);
>> diff --git a/include/hw/i2c/pmbus_device.h b/include/hw/i2c/pmbus_device.h
>> index 0f4d6b3fad..aed7809841 100644
>> --- a/include/hw/i2c/pmbus_device.h
>> +++ b/include/hw/i2c/pmbus_device.h
>> @@ -407,6 +407,7 @@ typedef struct PMBusPage {
>>  uint16_t mfr_max_temp_1;   /* R/W word */
>>  uint16_t mfr_max_temp_2;   /* R/W word */
>>  uint16_t mfr_max_temp_3;   /* R/W word */
>> +uint8_t ic_device_id[16];  /* Read-Only block-read */
>>  } PMBusPage;
>>/* State */
>> diff --git a/include/hw/sensor/isl_pmbus_vr.h 
>> b/include/hw/sensor/isl_pmbus_vr.h
>> index 3e47ff7e48..d501b3bc82 100644
>> --- a/include/hw/sensor/isl_pmbus_vr.h
>> +++ b/include/hw/sensor/isl_pmbus_vr.h
>> @@ -12,6 +12,7 @@
>>  #include "hw/i2c/pmbus_device.h"
>>  #include "qom/object.h"
>>  +#define TYPE_ISL69259   "isl69259"
>>  #define TYPE_ISL69260   "isl69260"
>>  #define TYPE_RAA228000  "raa228000"
>>  #define TYPE_RAA229004  "raa229004"
>

[PATCH] scripts: check if .git exists before checking submodule status

2022-06-29 Thread Daniel P . Berrangé

Currently we check status of each submodule, before actually checking
if we're in a git repo. These status commands will all fail, but we
are hiding their output so we don't see it currently.

Signed-off-by: Daniel P. Berrangé 
---
 scripts/git-submodule.sh | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/git-submodule.sh b/scripts/git-submodule.sh
index e225d3a963..7be41f5948 100755
--- a/scripts/git-submodule.sh
+++ b/scripts/git-submodule.sh
@@ -51,6 +51,12 @@ validate_error() {
 exit 1
 }
 
+if test -n "$maybe_modules" && ! test -e ".git"
+then
+echo "$0: unexpectedly called with submodules but no git checkout exists"
+exit 1
+fi
+
 modules=""
 for m in $maybe_modules
 do
@@ -63,12 +69,6 @@ do
 fi
 done
 
-if test -n "$maybe_modules" && ! test -e ".git"
-then
-echo "$0: unexpectedly called with submodules but no git checkout exists"
-exit 1
-fi
-
 case "$command" in
 status|validate)
 if test -z "$maybe_modules"
-- 
2.36.1

Re: [PATCH v2 10/13] hw/misc/aspeed: Add PECI controller

2022-06-29 Thread Peter Delevoryas



> On Jun 29, 2022, at 2:20 AM, Cédric Le Goater  wrote:
> 
> On 6/29/22 05:36, Peter Delevoryas wrote:
>> This introduces a really basic PECI controller that responds to
>> commands by always setting the response code to success and then raising
>> an interrupt to indicate the command is done. This helps avoid getting
>> hit with constant errors if the driver continuously attempts to send a
>> command and keeps timing out.
>> The AST2400 and AST2500 only included registers up to 0x5C, not 0xFC.
>> They supported PECI 1.1, 2.0, and 3.0. The AST2600 and AST1030 support
>> PECI 4.0, which includes more read/write buffer registers from 0x80 to
>> 0xFC to support 64-byte mode.
>> This patch doesn't attempt to handle that, or to create a different
>> version of the controller for the different generations, since it's only
>> implementing functionality that is common to all generations.
>> The basic sequence of events is that the firmware will read and write to
>> various registers and then trigger a command by setting the FIRE bit in
>> the command register (similar to the I2C controller).
>> Then the firmware waits for an interrupt from the PECI controller,
>> expecting the interrupt status register to be filled in with info on
>> what happened. If the command was transmitted and received successfully,
>> then response codes from the host CPU will be found in the data buffer
>> registers.
>> Signed-off-by: Peter Delevoryas 
> 
> 
> LGTM. A few small comments below.
> 
> 
>> ---
>>  hw/arm/aspeed_ast10x0.c   |  12 +++
>>  hw/arm/aspeed_ast2600.c   |  12 +++
>>  hw/arm/aspeed_soc.c   |  13 
>>  hw/misc/aspeed_peci.c | 136 ++
>>  hw/misc/meson.build   |   3 +-
>>  hw/misc/trace-events  |   4 +
>>  include/hw/arm/aspeed_soc.h   |   3 +
>>  include/hw/misc/aspeed_peci.h |  47 
>>  8 files changed, 229 insertions(+), 1 deletion(-)
>>  create mode 100644 hw/misc/aspeed_peci.c
>>  create mode 100644 include/hw/misc/aspeed_peci.h
>> diff --git a/hw/arm/aspeed_ast10x0.c b/hw/arm/aspeed_ast10x0.c
>> index 5df480a21f..56e8de3d89 100644
>> --- a/hw/arm/aspeed_ast10x0.c
>> +++ b/hw/arm/aspeed_ast10x0.c
>> @@ -47,6 +47,7 @@ static const hwaddr aspeed_soc_ast1030_memmap[] = {
>>  [ASPEED_DEV_UART13]= 0x7E790700,
>>  [ASPEED_DEV_WDT]   = 0x7E785000,
>>  [ASPEED_DEV_LPC]   = 0x7E789000,
>> +[ASPEED_DEV_PECI]  = 0x7E78B000,
>>  [ASPEED_DEV_I2C]   = 0x7E7B,
>>  };
>>  @@ -75,6 +76,7 @@ static const int aspeed_soc_ast1030_irqmap[] = {
>>  [ASPEED_DEV_TIMER8]= 23,
>>  [ASPEED_DEV_WDT]   = 24,
>>  [ASPEED_DEV_LPC]   = 35,
>> +[ASPEED_DEV_PECI]  = 38,
>>  [ASPEED_DEV_FMC]   = 39,
>>  [ASPEED_DEV_PWM]   = 44,
>>  [ASPEED_DEV_ADC]   = 46,
>> @@ -133,6 +135,8 @@ static void aspeed_soc_ast1030_init(Object *obj)
>>object_initialize_child(obj, "lpc", >lpc, TYPE_ASPEED_LPC);
>>  +object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
>> +
>>  object_initialize_child(obj, "sbc", >sbc, TYPE_ASPEED_SBC);
>>for (i = 0; i < sc->wdts_num; i++) {
>> @@ -206,6 +210,14 @@ static void aspeed_soc_ast1030_realize(DeviceState 
>> *dev_soc, Error **errp)
>>  sysbus_connect_irq(SYS_BUS_DEVICE(>i2c.busses[i]), 0, irq);
>>  }
>>  +/* PECI */
>> +if (!sysbus_realize(SYS_BUS_DEVICE(>peci), errp)) {
>> +return;
>> +}
>> +sysbus_mmio_map(SYS_BUS_DEVICE(>peci), 0, 
>> sc->memmap[ASPEED_DEV_PECI]);
>> +sysbus_connect_irq(SYS_BUS_DEVICE(>peci), 0,
>> +   aspeed_soc_get_irq(s, ASPEED_DEV_PECI));
>> +
>>  /* LPC */
>>  if (!sysbus_realize(SYS_BUS_DEVICE(>lpc), errp)) {
>>  return;
>> diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
>> index b0a4199b69..85178fabea 100644
>> --- a/hw/arm/aspeed_ast2600.c
>> +++ b/hw/arm/aspeed_ast2600.c
>> @@ -59,6 +59,7 @@ static const hwaddr aspeed_soc_ast2600_memmap[] = {
>>  [ASPEED_DEV_LPC]   = 0x1E789000,
>>  [ASPEED_DEV_IBT]   = 0x1E789140,
>>  [ASPEED_DEV_I2C]   = 0x1E78A000,
>> +[ASPEED_DEV_PECI]  = 0x1E78B000,
>>  [ASPEED_DEV_UART1] = 0x1E783000,
>>  [ASPEED_DEV_UART2] = 0x1E78D000,
>>  [ASPEED_DEV_UART3] = 0x1E78E000,
>> @@ -122,6 +123,7 @@ static const int aspeed_soc_ast2600_irqmap[] = {
>>  [ASPEED_DEV_LPC]   = 35,
>>  [ASPEED_DEV_IBT]   = 143,
>>  [ASPEED_DEV_I2C]   = 110,   /* 110 -> 125 */
>> +[ASPEED_DEV_PECI]  = 38,
>>  [ASPEED_DEV_ETH1]  = 2,
>>  [ASPEED_DEV_ETH2]  = 3,
>>  [ASPEED_DEV_HACE]  = 4,
>> @@ -180,6 +182,8 @@ static void aspeed_soc_ast2600_init(Object *obj)
>>  snprintf(typename, sizeof(typename), "aspeed.i2c-%s", socname);
>>  object_initialize_child(obj, "i2c", >i2c, typename);
>>  +object_initialize_child(obj, "peci", >peci, TYPE_ASPEED_PECI);
>> +
>>

Re: Slowness with multi-thread TCG?

2022-06-29 Thread Alex Bennée



Frederic Barrat  writes:

> On 29/06/2022 00:17, Alex Bennée wrote:
>> If you run the sync-profiler (via the HMP "sync-profile on") you can
>> then get a breakdown of which mutexes are being held and for how long
>> then get a breakdown of which mutex's are being held and for how long
>> ("info sync-profile").
>
>
> Alex, a huge thank you!
>
> For the record, the "info sync-profile" showed:
> Type   Object  Call site Wait Time (s)
> Count  Average (us)
> --
> BQL mutex  0x55eb89425540  accel/tcg/cpu-exec.c:744   96.31578
> 73589937  1.31
> BQL mutex  0x55eb89425540  target/ppc/helper_regs.c:2070.00150
> 1178  1.27
>
>
> And it points to a lock in the interrupt delivery path, in
> cpu_handle_interrupt().
>
> I now understand the root cause. The interrupt signal for the
> decrementer interrupt remains set because the interrupt is not being
> delivered, per the config. I'm not quite sure what the proper fix is
> yet (there seem to be several implementations of the decrementer on
> ppc), but at least I understand why we are so slow.

That sounds like a bug in the interrupt controller emulation. It should
not even be attempting to cpu_exit() and set cpu->interrupt_request
(which are TCG internals) unless the IRQ is unmasked. Usually when
updates are made to an emulated IRQ controller you re-calculate the
state and decide if an interrupt needs to be asserted to QEMU.

> With a quick hack, I could verify that by moving that signal out of
> the way, the decompression time of the kernel is now peanuts, no
> matter the number of cpus. Even with one cpu, the 15 seconds measured
> before was already a huge waste, so it was not really a multiple-cpus
> problem. Multiple cpus were just highlighting it.
>
> Thanks again!
>
>   Fred


-- 
Alex Bennée

Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates

2022-06-29 Thread Keith Busch

On Wed, Jun 29, 2022 at 05:04:25PM +0800, Jinhao Fan wrote:
> Ping~
> 
> > @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl 
> > *n, uint64_t dma_addr,
> > if (n->dbbuf_enabled) {
> > sq->db_addr = n->dbbuf_dbs + (sqid << 3);
> > sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> > +
> > +if (n->params.ioeventfd && sq->sqid != 0) {
> > +ret = nvme_init_sq_ioeventfd(sq);
> > +sq->ioeventfd_enabled = ret == 0;
> > +}
> > }
> > 
> > assert(n->cq[cqid]);
> 
> Is this “ret == 0” a correct way for error handling?

That looks correct, since the ioeventfd is an optional optimization.

I would just suggest making this easier to read. For example, in
nvme_init_sq_ioeventfd(), instead of assigning within a conditional:

if ((ret = event_notifier_init(&sq->notifier, 0))) {

Do each part separately:

ret = event_notifier_init(&sq->notifier, 0);
if (ret) {
 
> I’ve also been wondering whether using irqfd for sending interrupts can
> bring some benefits. I’m not familiar with how QEMU emulates interrupts.
> What do you think of irqfd’s?

Not sure about this mechanism, I'll need to look into it.

Re: [PATCH] hw/arm/virt: dt: add rng-seed property

2022-06-29 Thread Jason A. Donenfeld

Hi Alex,

On Wed, Jun 29, 2022 at 04:24:20PM +0100, Alex Bennée wrote:
> > The code is exactly the same for kaslr-seed and rng-seed. Every time
> > there's some kaslr-seed thing, there is now the same rng-seed thing.
> 
> The duplication is annoying but specs are specs - where is this written
> by the way?

The same place as all the ordinary specs:
https://github.com/devicetree-org/dt-schema/blob/main/dtschema/schemas/chosen.yaml

> Given the use case for the dtb-kaslr-seed knob I wonder if we should
> have a common property and deprecate the kaslr one? As of this patch
> existing workflows will break until command lines are updated to suppress
> the second source of randomness.
> 
> Maybe it would be better to have a single new property
> (dtb-rng-seeds?) which suppresses both dtb entries and make
> dtb-kaslr-seed an alias and mark it as deprecated.

No, I don't think so. If anything, I'll try to get rid of kaslr-seed
upstream at some point if that makes sense. But until that happens --
that is, until I have the conversations with people who added these and
care about their semantics -- assume that there's granularity for some
good reason. No need to put the cart before the horse.

This is a simple patch doing a simple thing in exactly the way that
things are already being done. I really don't want to do much more than
that here. If you want to bikeshed it further, send a follow up patch.

Jason

Re: [PATCH 12/14] aspeed: Make aspeed_board_init_flashes public

2022-06-29 Thread Cédric Le Goater


On 6/29/22 16:14, Alex Bennée wrote:


Cédric Le Goater  writes:


On 6/24/22 18:50, Cédric Le Goater wrote:

On 6/23/22 20:43, Peter Delevoryas wrote:




On Jun 23, 2022, at 8:09 AM, Cédric Le Goater  wrote:

On 6/23/22 12:26, Peter Delevoryas wrote:

Signed-off-by: Peter Delevoryas 


Let's start simple without flash support. We should be able to
load FW blobs in each CPU address space using loader devices.


Actually, I was unable to do this, perhaps because the fb OpenBMC
boot sequence is a little weird. I specifically _needed_ to have
a flash device which maps the firmware in at 0x2000_0000, because
the fb OpenBMC U-Boot SPL jumps to that address to start executing
from flash? I think this is also why fb OpenBMC machines can be so slow.

$ ./build/qemu-system-arm -machine fby35 \
  -device loader,file=fby35.mtd,addr=0,cpu-num=0 -nographic \
  -d int -drive file=fby35.mtd,format=raw,if=mtd

Ideally we should be booting from the flash device directly using
the machine option '-M ast2600-evb,execute-in-place=true' like HW
does. Instructions are fetched using SPI transfers. But the amount
of code generated is tremendous.


Yeah because there is a potential race when reading from HW so we throw
away TB's after executing them because we have no way of knowing if it
has changed under our feet. See 873d64ac30 (accel/tcg: re-factor non-RAM
execution code) which cleaned up this handling.


See some profiling below for a
run which barely reaches DRAM training in U-Boot.


Some more profiling on both ast2500 and ast2600 machines shows :


* ast2600-evb,execute-in-place=true :

Type   Object  Call siteWait Time (s) Count 
 Average (us)
-
BQL mutex  0x564dc03922e0  accel/tcg/cputlb.c:1365   14.21443
32909927  0.43


This is unavoidable as a HW access needs the BQL held so we will go
through this cycle every executed instruction.

Did I miss why the flash contents are not mapped into the physical
address space? Isn't that how it appears to the processor?



There are two modes :
 
if (ASPEED_MACHINE(machine)->mmio_exec) {

memory_region_init_alias(boot_rom, NULL, "aspeed.boot_rom",
 >mmio, 0, size);
memory_region_add_subregion(get_system_memory(), FIRMWARE_ADDR,
boot_rom);
} else {
memory_region_init_rom(boot_rom, NULL, "aspeed.boot_rom",
   size, _abort);
memory_region_add_subregion(get_system_memory(), FIRMWARE_ADDR,
boot_rom);
write_boot_rom(drive0, FIRMWARE_ADDR, size, _abort);
}

The default boot mode uses the ROM. No issue.

The "execute-in-place=true" option creates an alias on the region of
the flash contents and each instruction is then fetched from the flash
drive with SPI transactions.

With old FW images, using an older U-boot, the machine boots in a couple
of seconds. See the profiling below for a witherspoon-bmc machine using
U-Boot 2016.07.

  qemu-system-arm -M witherspoon-bmc,execute-in-place=true  -drive 
file=./flash-witherspoon-bmc,format=raw,if=mtd -drive 
file=./flash-witherspoon-bmc2,format=raw,if=mtd -nographic -nodefaults 
-snapshot -serial mon:stdio -enable-sync-profile
  ...
  U-Boot 2016.07-00040-g8425e96e2e27-dirty (Jun 24 2022 - 23:21:57 +0200)
  
 Watchdog enabled

  DRAM:  496 MiB
  Flash: 32 MiB
  In:serial
  Out:   serial
  Err:   serial
  Net:
  (qemu) info sync-profile
  Type   Object  Call siteWait Time (s) 
Count  Average (us)
  
-
  BQL mutex  0x56189610b2e0  accel/tcg/cputlb.c:13650.25311  
12346237  0.02
  condvar0x5618970cf220  softmmu/cpus.c:423 0.05506 
2  27530.78
  BQL mutex  0x56189610b2e0  util/rcu.c:269 0.04709 
2  23544.26
  condvar0x561896d0fc78  util/thread-pool.c:90  0.01340
83161.47
  condvar0x56189610b240  softmmu/cpus.c:571 0.5 
1 54.93
  condvar0x56189610b280  softmmu/cpus.c:642 0.3 
1 32.88
  BQL mutex  0x56189610b2e0  util/main-loop.c:318   0.3
34  0.76
  mutex  0x561896eade00  tcg/region.c:204   0.2   
995  0.02
  rec_mutex  [   2]  util/async.c:682   0.2   
493  0.03
  mutex  [   2]  chardev/char.c:118 0.1   
404  0.03
  
-


However, with recent U-boots, it takes quite a while to reach DRAM training.
Close to a

Re: Slowness with multi-thread TCG?

2022-06-29 Thread Frederic Barrat





On 29/06/2022 00:17, Alex Bennée wrote:

If you run the sync-profiler (via the HMP "sync-profile on") you can
then get a breakdown of which mutex's are being held and for how long
("info sync-profile").



Alex, a huge thank you!

For the record, the "info sync-profile" showed:
Type   Object  Call site Wait Time (s) 
   Count  Average (us)

--
BQL mutex  0x55eb89425540  accel/tcg/cpu-exec.c:744   96.31578 
73589937  1.31
BQL mutex  0x55eb89425540  target/ppc/helper_regs.c:2070.00150 
1178  1.27



And it points to a lock in the interrupt delivery path, in 
cpu_handle_interrupt().


I now understand the root cause. The interrupt signal for the 
decrementer interrupt remains set because the interrupt is not being 
delivered, per the config. I'm not quite sure what the proper fix is yet 
(there seems to be several implementations of the decrementer on ppc), 
but at least I understand why we are so slow.


With a quick hack, I could verify that by moving that signal out of the 
way, the decompression time of the kernel is now peanuts, no matter the 
number of cpus. Even with one cpu, the 15 seconds measured before was 
already a huge waste, so it was not really a multiple-cpus problem. 
Multiple cpus were just highlighting it.


Thanks again!

  Fred

Re: [PATCH] hw/arm/virt: dt: add rng-seed property

2022-06-29 Thread Alex Bennée



"Jason A. Donenfeld"  writes:

> On Wed, Jun 29, 2022 at 11:18:23AM +0100, Alex Bennée wrote:
>> 
>> Peter Maydell  writes:
>> 
>> > On Tue, 28 Jun 2022 at 19:45, Jason A. Donenfeld  wrote:
>> >>
>> >> On 6/27/22, Jason A. Donenfeld  wrote:
>> >> > On 6/27/22, Peter Maydell  wrote:
>> >> >> On Mon, 27 Jun 2022 at 17:07, Jason A. Donenfeld  
>> >> >> wrote:
>> >> >>>
>> >> >>> In 60592cfed2 ("hw/arm/virt: dt: add kaslr-seed property"), the
>> >> >>> kaslr-seed property was added, but the equally as important rng-seed
>> >> >>> property was forgotten about, which has identical semantics for a
>> >> >>> similar purpose. This commit implements it in exactly the same way as
>> >> >>> kaslr-seed.
>> >> >>
>> >> >> Not an objection, since if this is what the dtb spec says we need
>> >> >> to provide then I guess we need to provide it, but:
>> >> >> Why do we need to give the kernel two separate random seeds?
>> >> >> Isn't one sufficient for the kernel to seed its RNG and generate
>> >> >> whatever randomness it needs for whatever purposes it wants it?
>> >> >>
>> >> >
>> >> > Seems a bit silly to me too. `rng-seed` alone ought to be sufficient.
>> >> > After the kernel calls add_bootloader_randomness() on it,
>> >> > get_random_long() can be used for kaslr'ing and everything else too.
>> >> > So I'm not sure what's up, but here we are. Maybe down the line I'll
>> >> > look into the details and formulate a plan to remove `kaslr-seed` if
>> >> > my supposition is correct.
>> 
>> Sorry now I've had my coffee and read properly I see you are already
>> aware of kaslr-seed. However my point about suppression would still
>> stand because for the secure boot flow you need checksum-able DTBs.
>
> Please read the patch. Maybe take a sip of coffee first. There's a knob
> for this too.

I was obviously not paying enough attention this morning. Sorry about that.

> The code is exactly the same for kaslr-seed and rng-seed. Every time
> there's some kaslr-seed thing, there is now the same rng-seed thing.

The duplication is annoying but specs are specs - where is this written
by the way?

Given the use case for the dtb-kaslr-seed knob I wonder if we should
have a common property and deprecate the kaslr one? As of this patch
existing workflows will break until command lines are updated to suppress
the second source of randomness.

Maybe it would be better to have a single new property
(dtb-rng-seeds?) which suppresses both dtb entries and make
dtb-kaslr-seed an alias and mark it as deprecated.

-- 
Alex Bennée

Re: qemu-system-s390x hang in tcg

2022-06-29 Thread Alex Bennée

Sven Schnelle  writes:

> Sven Schnelle  writes:
>
>> Alex Bennée  writes:
>>
>>> Sven Schnelle  writes:
>>>
 Hi,

 David Hildenbrand  writes:

> On 04.05.22 09:37, Janosch Frank wrote:
>> I had a short look yesterday and the boot usually hangs in the raid6 
>> code. Disabling vector instructions didn't make a difference but a few 
>> interruptions via GDB solve the problem for some reason.
>> 
>> CCing David and Thomas for TCG
>> 
>
> I somehow recall that KASAN was always disabled under TCG, I might be
> wrong (I thought we'd get a message early during boot that the HW
> doesn't support KASAN).
>
> I recall that raid code is a heavy user of vector instructions.
>
> How can I reproduce? Compile upstream (or -next?) with kasan support and
> run it under TCG?

 I spent some time looking into this. It's usually hanging in
 s390vx8_gen_syndrome(). My first thought was that it is a problem with
 the VX instructions, but turned out that it hangs even if i remove all
 the code from s390vx8_gen_syndrome().

 Tracing the execution of TB's, i see that the generated code is always
 jumping between a few TB's, but never exiting the TB's to check for
 interrupts (i.e. return to cpu_tb_exec()). I only see calls to
 helper_lookup_tb_ptr to lookup the tb pointer for the next TB.

 The raid6 code is waiting for some time to expire by reading jiffies,
 but interrupts are never processed and therefore jiffies doesn't change.
 So the raid6 code hangs forever.

 As a test, i made a quick change to test:

 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index c997c2e8e0..35819fd5a7 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
 @@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
  cpu_get_tb_cpu_state(env, , _base, );

  cflags = curr_cflags(cpu);
 -if (check_for_breakpoints(cpu, pc, )) {
 +if (check_for_breakpoints(cpu, pc, ) ||
 +unlikely(qatomic_read(>interrupt_request))) {
  cpu_loop_exit(cpu);
  }

 And that makes the problem go away. But i'm not familiar with the TCG
 internals, so i can't say whether the generated code is incorrect or
 something else is wrong. I have tcg log files of a failing + working run
 if someone wants to take a look. They are rather large so i would have to
 upload them somewhere.
>>>
>>> Whatever is setting cpu->interrupt_request should be calling
>>> cpu_exit(cpu) which sets the exit flag which is checked at the start of
>>> every TB execution (see gen_tb_start).
>>
>> Thanks, that was very helpful. I added debugging and it turned out
>> that the TB is left because of a pending irq. The code then calls
>> s390_cpu_exec_interrupt:
>>
>> bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
>> {
>> if (interrupt_request & CPU_INTERRUPT_HARD) {
>> S390CPU *cpu = S390_CPU(cs);
>> CPUS390XState *env = >env;
>>
>> if (env->ex_value) {
>> /* Execution of the target insn is indivisible from
>>the parent EXECUTE insn.  */
>> return false;
>> }
>> if (s390_cpu_has_int(cpu)) {
>> s390_cpu_do_interrupt(cs);
>> return true;
>> }
>> if (env->psw.mask & PSW_MASK_WAIT) {
>> /* Woken up because of a floating interrupt but it has already
>>  * been delivered. Go back to sleep. */
>> cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
>> }
>> }
>> return false;
>> }
>>
>> Note the 'if (env->ex_value) { }' check. It looks like this function
>> just returns false in case tcg is executing an EX instruction. After
>> that the information that the TB should be exited because of an
>> interrupt is gone. So the TB's are never exited again, although the
>> interrupt wasn't handled. At least that's my assumption now, if i'm
>> wrong please tell me.
>
> Looking at the code i see CF_NOIRQ to prevent TB's from getting
> interrupted. But i only see that used in the core tcg code. Would
> that be a possibility, or is there something else/better?

Yes, CF_NOIRQ is exactly the compiler flag you would use to prevent a
block from exiting early when you absolutely want to execute the next
block. We currently only use it from core code to deal with icount
related things, but I can see its use here. I would probably still wrap
it in a common function in cpu-exec-common.c. I'm unsure of the exact
semantics for s390, so I will defer to Richard and others, but something
like (untested):

/*
 * Ensure the next N instructions are not interrupted by IRQ checks.
 */
void cpu_loop_exit_unint(CPUState *cpu, uintptr_t pc, int len)
{
if (pc) {
cpu_restore_state(cpu, pc, true);
}
cpu->cflags_next_tb = len | CF_LAST_IO | CF_NOIRQ | curr_cflags(cpu);

Re: [PATCH v8 02/12] s390x/cpu_topology: CPU topology objects and structures

2022-06-29 Thread Pierre Morel





On 6/27/22 15:31, Janosch Frank wrote:

On 6/20/22 16:03, Pierre Morel wrote:

We use new objects to have a dynamic administration of the CPU topology.
The highest level object in this implementation is the s390 book and
in this first implementation of CPU topology for S390 we have a single
book.
The book is built as a SYSBUS bridge during the CPU initialization.
Other objects, sockets and cores, will be built after the parsing
of the QEMU -smp argument.

Every object under this single book will be built dynamically,
if it is needed, immediately after a CPU has been realized.
The CPUs will fill the sockets one after the other, according to the
number of cores per socket defined during the smp parsing.

Each CPU inside a socket will be represented by a bit in a 64bit
unsigned long. Set on plug and clear on unplug of a CPU.

For the S390 CPU topology, threads and cores are merged into
topology cores, and the number of topology cores is the number
of cores multiplied by the number of threads.

Signed-off-by: Pierre Morel 


[...]


diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 7d6d01325b..216adfde26 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -565,6 +565,53 @@ typedef union SysIB {
  } SysIB;
  QEMU_BUILD_BUG_ON(sizeof(SysIB) != 4096);
+/* CPU type Topology List Entry */
+typedef struct SysIBTl_cpu {
+    uint8_t nl;
+    uint8_t reserved0[3];
+    uint8_t reserved1:5;
+    uint8_t dedicated:1;
+    uint8_t polarity:2;
+    uint8_t type;
+    uint16_t origin;
+    uint64_t mask;
+} SysIBTl_cpu;
+QEMU_BUILD_BUG_ON(sizeof(SysIBTl_cpu) != 16);
+
+/* Container type Topology List Entry */
+typedef struct SysIBTl_container {
+    uint8_t nl;
+    uint8_t reserved[6];
+    uint8_t id;
+} QEMU_PACKED SysIBTl_container;
+QEMU_BUILD_BUG_ON(sizeof(SysIBTl_container) != 8);
+
+/* Generic Topology List Entry */
+typedef union SysIBTl_entry {
+    uint8_t nl;


This union member is unused, isn't it?


+    SysIBTl_container container;
+    SysIBTl_cpu cpu;
+} SysIBTl_entry;
+
+#define TOPOLOGY_NR_MAG  6


TOPOLOGY_TOTAL_NR_MAGS ?


+#define TOPOLOGY_NR_MAG6 0


TOPOLOGY_NR_TLES_MAG6 ?

I'm open to other suggestions but we need to differentiate between the 
number of mag array entries and the number of TLEs in the MAGs.



typedef enum {
TOPOLOGY_MAG6 = 0,
TOPOLOGY_MAG5 = 1,
TOPOLOGY_MAG4 = 2,
TOPOLOGY_MAG3 = 3,
TOPOLOGY_MAG2 = 4,
TOPOLOGY_MAG1 = 5,
TOPOLOGY_TOTAL_MAGS = 6,
};


or enum with TOPOLOGY_NR_TLES_MAGx ?




+#define TOPOLOGY_NR_MAG5 1
+#define TOPOLOGY_NR_MAG4 2
+#define TOPOLOGY_NR_MAG3 3
+#define TOPOLOGY_NR_MAG2 4
+#define TOPOLOGY_NR_MAG1 5


I'd appreciate a \n here.


OK




+/* Configuration topology */
+typedef struct SysIB_151x {
+    uint8_t  res0[2];


You're using "reserved" everywhere but now it's "rev"?


OK I will keep reserved




+    uint16_t length;
+    uint8_t  mag[TOPOLOGY_NR_MAG];
+    uint8_t  res1;
+    uint8_t  mnest;
+    uint32_t res2;
+    SysIBTl_entry tle[0];
+} SysIB_151x;
+QEMU_BUILD_BUG_ON(sizeof(SysIB_151x) != 16);
+
  /* MMU defines */
  #define ASCE_ORIGIN   (~0xfffULL) /* segment table 
origin */
  #define ASCE_SUBSPACE 0x200   /* subspace group 
control   */





--
Pierre Morel
IBM Lab Boeblingen

[PATCH v8 17/20] job.c: enable job lock/unlock and remove Aiocontext locks

2022-06-29 Thread Emanuele Giuseppe Esposito

Change the job_{lock/unlock} and macros to use job_mutex.

Now that they are not nops anymore, remove the AioContext lock
to avoid deadlocks.

Therefore:
- when possible, remove completely the aiocontext lock/unlock pair
- if it is used by some other function too, reduce the locking
section as much as possible, leaving the job API outside.

There is only one JobDriver callback, ->free() that assumes that
the aiocontext lock is held (because it calls bdrv_unref), so for
now keep that under aiocontext lock.

Also remove real_job_{lock/unlock}, as they are replaced by the
public functions.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockdev.c   | 74 +---
 include/qemu/job.h   | 22 -
 job-qmp.c| 44 -
 job.c| 82 ++--
 tests/unit/test-bdrv-drain.c |  4 +-
 tests/unit/test-block-iothread.c |  2 +-
 tests/unit/test-blockjob.c   | 13 ++---
 7 files changed, 51 insertions(+), 190 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 5b79093155..2cd84d206c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -155,12 +155,7 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 for (job = block_job_next_locked(NULL); job;
  job = block_job_next_locked(job)) {
 if (block_job_has_bdrv(job, blk_bs(blk))) {
-AioContext *aio_context = job->job.aio_context;
-aio_context_acquire(aio_context);
-
 job_cancel_locked(>job, false);
-
-aio_context_release(aio_context);
 }
 }
 
@@ -1836,14 +1831,7 @@ static void drive_backup_abort(BlkActionState *common)
 DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
 
 if (state->job) {
-AioContext *aio_context;
-
-aio_context = bdrv_get_aio_context(state->bs);
-aio_context_acquire(aio_context);
-
 job_cancel_sync(>job->job, true);
-
-aio_context_release(aio_context);
 }
 }
 
@@ -1937,14 +1925,7 @@ static void blockdev_backup_abort(BlkActionState *common)
 BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, 
common);
 
 if (state->job) {
-AioContext *aio_context;
-
-aio_context = bdrv_get_aio_context(state->bs);
-aio_context_acquire(aio_context);
-
 job_cancel_sync(>job->job, true);
-
-aio_context_release(aio_context);
 }
 }
 
@@ -3306,19 +3287,14 @@ out:
 }
 
 /*
- * Get a block job using its ID and acquire its AioContext.
- * Called with job_mutex held.
+ * Get a block job using its ID. Called with job_mutex held.
  */
-static BlockJob *find_block_job_locked(const char *id,
-   AioContext **aio_context,
-   Error **errp)
+static BlockJob *find_block_job_locked(const char *id, Error **errp)
 {
 BlockJob *job;
 
 assert(id != NULL);
 
-*aio_context = NULL;
-
 job = block_job_get_locked(id);
 
 if (!job) {
@@ -3327,36 +3303,30 @@ static BlockJob *find_block_job_locked(const char *id,
 return NULL;
 }
 
-*aio_context = block_job_get_aio_context(job);
-aio_context_acquire(*aio_context);
-
 return job;
 }
 
 void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
 {
-AioContext *aio_context;
 BlockJob *job;
 
 JOB_LOCK_GUARD();
-job = find_block_job_locked(device, _context, errp);
+job = find_block_job_locked(device, errp);
 
 if (!job) {
 return;
 }
 
 block_job_set_speed_locked(job, speed, errp);
-aio_context_release(aio_context);
 }
 
 void qmp_block_job_cancel(const char *device,
   bool has_force, bool force, Error **errp)
 {
-AioContext *aio_context;
 BlockJob *job;
 
 JOB_LOCK_GUARD();
-job = find_block_job_locked(device, _context, errp);
+job = find_block_job_locked(device, errp);
 
 if (!job) {
 return;
@@ -3369,22 +3339,19 @@ void qmp_block_job_cancel(const char *device,
 if (job_user_paused_locked(>job) && !force) {
 error_setg(errp, "The block job for device '%s' is currently paused",
device);
-goto out;
+return;
 }
 
 trace_qmp_block_job_cancel(job);
 job_user_cancel_locked(>job, force, errp);
-out:
-aio_context_release(aio_context);
 }
 
 void qmp_block_job_pause(const char *device, Error **errp)
 {
-AioContext *aio_context;
 BlockJob *job;
 
 JOB_LOCK_GUARD();
-job = find_block_job_locked(device, _context, errp);
+job = find_block_job_locked(device, errp);
 
 if (!job) {
 return;
@@ -3392,16 +3359,14 @@ void qmp_block_job_pause(const char *device, Error 
**errp)
 
 trace_qmp_block_job_pause(job);
 job_user_pause_locked(>job, errp);
-aio_context_release(aio_context);
 }
 
 void qmp_block_job_resume(const char *device, Error **errp)
 {
-AioContext *aio_context;

[PATCH v8 15/20] job: detect change of aiocontext within job coroutine

2022-06-29 Thread Emanuele Giuseppe Esposito

From: Paolo Bonzini 

We want to make sure access of job->aio_context is always done
under either BQL or job_mutex. The problem is that using
aio_co_enter(job->aiocontext, job->co) in job_start and job_enter_cond
makes the coroutine immediately resume, so we can't hold the job lock.
And caching it is not safe either, as it might change.

job_start is under BQL, so it can freely read job->aiocontext, but
job_enter_cond is not. In order to fix this, use aio_co_wake():
the advantage is that it won't use job->aiocontext, but the
main disadvantage is that it won't be able to detect a change of
job AioContext.

Calling bdrv_try_set_aio_context() will issue the following calls
(simplified):
* in terms of  bdrv callbacks:
  .drained_begin -> .set_aio_context -> .drained_end
* in terms of child_job functions:
  child_job_drained_begin -> child_job_set_aio_context -> child_job_drained_end
* in terms of job functions:
  job_pause_locked -> job_set_aio_context -> job_resume_locked

We can see that after setting the new aio_context, job_resume_locked
calls again job_enter_cond, which then invokes aio_co_wake(). But
while job->aiocontext has been set in job_set_aio_context,
job->co->ctx has not changed, so the coroutine would be entering in
the wrong aiocontext.

Using aio_co_schedule in job_resume_locked() might seem like a valid
alternative, but the problem is that the bh resuming the coroutine
is not scheduled immediately, and if in the meanwhile another
bdrv_try_set_aio_context() is run (see test_propagate_mirror() in
test-block-iothread.c), we would have the first schedule in the
wrong aiocontext, and the second set of drains won't even manage
to schedule the coroutine, as job->busy would still be true from
the previous job_resume_locked().

The solution is to stick with aio_co_wake(), but then detect every time
the coroutine resumes back from yielding if job->aio_context
has changed. If so, we can reschedule it to the new context.

Check for the aiocontext change in job_do_yield_locked because:
1) aio_co_reschedule_self must be called from the running coroutine
2) since child_job_set_aio_context allows changing the aiocontext only
   while the job is paused, this is the exact place where the coroutine
   resumes, before running JobDriver's code.

Signed-off-by: Paolo Bonzini 
Reviewed-by: Stefan Hajnoczi 
---
 job.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/job.c b/job.c
index 19d711dc73..8db80b8086 100644
--- a/job.c
+++ b/job.c
@@ -581,11 +581,12 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
 return;
 }
 
-assert(!job->deferred_to_main_loop);
 timer_del(>sleep_timer);
 job->busy = true;
 real_job_unlock();
-aio_co_enter(job->aio_context, job->co);
+job_unlock();
+aio_co_wake(job->co);
+job_lock();
 }
 
 void job_enter_cond(Job *job, bool(*fn)(Job *job))
@@ -611,6 +612,8 @@ void job_enter(Job *job)
  */
 static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
 {
+AioContext *next_aio_context;
+
 real_job_lock();
 if (ns != -1) {
 timer_mod(>sleep_timer, ns);
@@ -622,7 +625,20 @@ static void coroutine_fn job_do_yield_locked(Job *job, 
uint64_t ns)
 qemu_coroutine_yield();
 job_lock();
 
-/* Set by job_enter_cond() before re-entering the coroutine.  */
+next_aio_context = job->aio_context;
+/*
+ * Coroutine has resumed, but in the meanwhile the job AioContext
+ * might have changed via bdrv_try_set_aio_context(), so we need to move
+ * the coroutine too in the new aiocontext.
+ */
+while (qemu_get_current_aio_context() != next_aio_context) {
+job_unlock();
+aio_co_reschedule_self(next_aio_context);
+job_lock();
+next_aio_context = job->aio_context;
+}
+
+/* Set by job_enter_cond_locked() before re-entering the coroutine.  */
 assert(job->busy);
 }
 
-- 
2.31.1

[PATCH v8 12/20] block/mirror.c: use of job helpers in drivers to avoid TOC/TOU

2022-06-29 Thread Emanuele Giuseppe Esposito

Once job lock is used and aiocontext is removed, mirror has
to perform job operations under the same critical section,
using the helpers prepared in previous commit.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block/mirror.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index d8ecb9efa2..b38676e19d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -654,9 +654,13 @@ static int mirror_exit_common(Job *job)
 BlockDriverState *target_bs;
 BlockDriverState *mirror_top_bs;
 Error *local_err = NULL;
-bool abort = job->ret < 0;
+bool abort;
 int ret = 0;
 
+WITH_JOB_LOCK_GUARD() {
+abort = job->ret < 0;
+}
+
 if (s->prepared) {
 return 0;
 }
@@ -1152,8 +1156,10 @@ static void mirror_complete(Job *job, Error **errp)
 s->should_complete = true;
 
 /* If the job is paused, it will be re-entered when it is resumed */
-if (!job->paused) {
-job_enter(job);
+WITH_JOB_LOCK_GUARD() {
+if (!job->paused) {
+job_enter_cond_locked(job, NULL);
+}
 }
 }
 
@@ -1173,8 +1179,11 @@ static bool mirror_drained_poll(BlockJob *job)
  * from one of our own drain sections, to avoid a deadlock waiting for
  * ourselves.
  */
-if (!s->common.job.paused && !job_is_cancelled(>job) && !s->in_drain) 
{
-return true;
+WITH_JOB_LOCK_GUARD() {
+if (!s->common.job.paused && !job_is_cancelled_locked(>job)
+&& !s->in_drain) {
+return true;
+}
 }
 
 return !!s->in_flight;
-- 
2.31.1

[PATCH v8 13/20] jobs: group together API calls under the same job lock

2022-06-29 Thread Emanuele Giuseppe Esposito

Now that the API also offers _locked() functions, take advantage
of them and also give the caller control to take the lock and call
the _locked functions.

This makes sense especially when we have for loops, because it
makes no sense to have:

for(job = job_next(); ...)

where each job_next() takes the lock internally.
Instead we want

JOB_LOCK_GUARD();
for(job = job_next_locked(); ...)

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c| 20 +++---
 blockdev.c | 12 ---
 blockjob.c | 52 +++---
 job-qmp.c  |  4 +++-
 job.c  | 13 +++-
 monitor/qmp-cmds.c |  7 +--
 qemu-img.c | 41 +---
 7 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/block.c b/block.c
index 2c0080..d0db104d71 100644
--- a/block.c
+++ b/block.c
@@ -4978,9 +4978,12 @@ static void bdrv_close(BlockDriverState *bs)
 
 void bdrv_close_all(void)
 {
-assert(job_next(NULL) == NULL);
 GLOBAL_STATE_CODE();
 
+WITH_JOB_LOCK_GUARD() {
+assert(job_next_locked(NULL) == NULL);
+}
+
 /* Drop references from requests still in flight, such as canceled block
  * jobs whose AIO context has not been polled yet */
 bdrv_drain_all();
@@ -6165,13 +6168,16 @@ XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
 }
 }
 
-for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-GSList *el;
+WITH_JOB_LOCK_GUARD() {
+for (job = block_job_next_locked(NULL); job;
+ job = block_job_next_locked(job)) {
+GSList *el;
 
-xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
-   job->job.id);
-for (el = job->nodes; el; el = el->next) {
-xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
+job->job.id);
+for (el = job->nodes; el; el = el->next) {
+xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+}
 }
 }
 
diff --git a/blockdev.c b/blockdev.c
index 71f793c4ab..5b79093155 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -150,12 +150,15 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 return;
 }
 
-for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+JOB_LOCK_GUARD();
+
+for (job = block_job_next_locked(NULL); job;
+ job = block_job_next_locked(job)) {
 if (block_job_has_bdrv(job, blk_bs(blk))) {
 AioContext *aio_context = job->job.aio_context;
 aio_context_acquire(aio_context);
 
-job_cancel(>job, false);
+job_cancel_locked(>job, false);
 
 aio_context_release(aio_context);
 }
@@ -3745,7 +3748,10 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 BlockJobInfoList *head = NULL, **tail = 
 BlockJob *job;
 
-for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+JOB_LOCK_GUARD();
+
+for (job = block_job_next_locked(NULL); job;
+ job = block_job_next_locked(job)) {
 BlockJobInfo *value;
 AioContext *aio_context;
 
diff --git a/blockjob.c b/blockjob.c
index 70952879d8..1075def475 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -99,7 +99,9 @@ static char *child_job_get_parent_desc(BdrvChild *c)
 static void child_job_drained_begin(BdrvChild *c)
 {
 BlockJob *job = c->opaque;
-job_pause(>job);
+WITH_JOB_LOCK_GUARD() {
+job_pause_locked(>job);
+}
 }
 
 static bool child_job_drained_poll(BdrvChild *c)
@@ -111,8 +113,10 @@ static bool child_job_drained_poll(BdrvChild *c)
 /* An inactive or completed job doesn't have any pending requests. Jobs
  * with !job->busy are either already paused or have a pause point after
  * being reentered, so no job driver code will run before they pause. */
-if (!job->busy || job_is_completed(job)) {
-return false;
+WITH_JOB_LOCK_GUARD() {
+if (!job->busy || job_is_completed_locked(job)) {
+return false;
+}
 }
 
 /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
@@ -127,7 +131,9 @@ static bool child_job_drained_poll(BdrvChild *c)
 static void child_job_drained_end(BdrvChild *c, int *drained_end_counter)
 {
 BlockJob *job = c->opaque;
-job_resume(>job);
+WITH_JOB_LOCK_GUARD() {
+job_resume_locked(>job);
+}
 }
 
 static bool child_job_can_set_aio_ctx(BdrvChild *c, AioContext *ctx,
@@ -480,13 +486,15 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 job->ready_notifier.notify = block_job_event_ready_locked;
 job->idle_notifier.notify = block_job_on_idle_locked;
 
-notifier_list_add(>job.on_finalize_cancelled,
-

[PATCH v8 03/20] job.c: API functions not used outside should be static

2022-06-29 Thread Emanuele Giuseppe Esposito

job_event_* functions can all be static, as they are not used
outside job.c.

The same applies to job_txn_add_job().

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/qemu/job.h | 18 --
 job.c  | 22 +++---
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 876e13d549..4b64eb15f7 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -358,18 +358,6 @@ JobTxn *job_txn_new(void);
  */
 void job_txn_unref(JobTxn *txn);
 
-/**
- * @txn: The transaction (may be NULL)
- * @job: Job to add to the transaction
- *
- * Add @job to the transaction.  The @job must not already be in a transaction.
- * The caller must call either job_txn_unref() or job_completed() to release
- * the reference that is automatically grabbed here.
- *
- * If @txn is NULL, the function does nothing.
- */
-void job_txn_add_job(JobTxn *txn, Job *job);
-
 /**
  * Create a new long-running job and return it.
  *
@@ -431,12 +419,6 @@ void job_progress_set_remaining(Job *job, uint64_t 
remaining);
  */
 void job_progress_increase_remaining(Job *job, uint64_t delta);
 
-/** To be called when a cancelled job is finalised. */
-void job_event_cancelled(Job *job);
-
-/** To be called when a successfully completed job is finalised. */
-void job_event_completed(Job *job);
-
 /**
  * Conditionally enter the job coroutine if the job is ready to run, not
  * already busy and fn() returns true. fn() is called while under the job_lock
diff --git a/job.c b/job.c
index 2b4ffca9d4..cafd597ba4 100644
--- a/job.c
+++ b/job.c
@@ -125,7 +125,17 @@ void job_txn_unref(JobTxn *txn)
 }
 }
 
-void job_txn_add_job(JobTxn *txn, Job *job)
+/**
+ * @txn: The transaction (may be NULL)
+ * @job: Job to add to the transaction
+ *
+ * Add @job to the transaction.  The @job must not already be in a transaction.
+ * The caller must call either job_txn_unref() or job_completed() to release
+ * the reference that is automatically grabbed here.
+ *
+ * If @txn is NULL, the function does nothing.
+ */
+static void job_txn_add_job(JobTxn *txn, Job *job)
 {
 if (!txn) {
 return;
@@ -427,12 +437,18 @@ void job_progress_increase_remaining(Job *job, uint64_t 
delta)
 progress_increase_remaining(>progress, delta);
 }
 
-void job_event_cancelled(Job *job)
+/**
+ * To be called when a cancelled job is finalised.
+ */
+static void job_event_cancelled(Job *job)
 {
 notifier_list_notify(>on_finalize_cancelled, job);
 }
 
-void job_event_completed(Job *job)
+/**
+ * To be called when a successfully completed job is finalised.
+ */
+static void job_event_completed(Job *job)
 {
 notifier_list_notify(>on_finalize_completed, job);
 }
-- 
2.31.1

[PATCH v8 00/20] job: replace AioContext lock with job_mutex

2022-06-29 Thread Emanuele Giuseppe Esposito

In this series, we want to remove the AioContext lock and instead
use the already existent job_mutex to protect the job structures
and list. This is part of the work to get rid of AioContext lock
usage in favour of smaller granularity locks.

In order to simplify the reviewer's job, the job lock/unlock functions and
macros are added as empty prototypes (nop) in patch 1.
They are converted to use the actual job mutex only in the last
patch. In this way we can freely create locking sections
without worrying about deadlocks with the aiocontext lock.

Patch 2 defines what fields in the job structure need protection.
Patches 3-6 are in preparation to the job locks, moving functions
from global to static and introducing helpers.

Patch 7-9 introduce the (nop) job lock into the job API and
its users, and patches 10-13 categorize respectively locked and
unlocked functions in the job API.

Patches 14-17 take care of protecting job->aio_context, and
finally patch 18 makes the prototypes in patch 1 use the
job_mutex and removes all aiocontext lock at the same time.

Tested this series by running unit tests, qemu-iotests and qtests
(x86_64).

---
v8:
* reorganize patch ordering according to Vladimir's proposal
* minor nitpicks

v7:
* s/temporary/temporarily
* double identical locking comment to the same function
* patch 2: add "Protected by AioContext lock" to better categorize fields in
  job.h
* use same comment style in all function headers ("Just like {funct}, but
  called between job_lock and job_unlock")

v6:
* patch 4 and 6 squashed with patch 19 (enable job lock and
  reduce/remove AioContext lock)
* patch 19: job_unref_locked read the aiocontext inside the
  job lock.

v5:
* just restructured patches a little bit better, as there were
  functions used before they were defined.
* rebased on kwolf/block branch and API split series

v4:
* move "protected by job_mutex" from patch 2 to 15, where the job_mutex is
  actually added.
* s/aio_co_enter/aio_co_schedule in job.c, and adjust tests accordingly.
* remove job_get_aio_context, add job_set_aio_context. Use "fake rwlock"
  to protect job->aiocontext.
* get rid of useless getters method, namely:
  job_get_status
  job_get_pause_count
  job_get_paused
  job_get_busy
  They are all used only by tests, and such getters are pretty useless.
  Replace with job_lock(); assert(); job_unlock();
* use job lock macros instead of job lock/unlock in unit tests.
* convert also blockjob functions to have _locked
* put the job_lock/unlock patches before the _locked ones
* replace aio_co_enter in job.c and detect change of context

v3:
* add "_locked" suffix to the functions called under job_mutex lock
* rename _job_lock in real_job_lock
* job_mutex is now public, and drivers like monitor use it directly
* introduce and protect job_get_aio_context
* remove mirror-specific APIs and just use WITH_JOB_GUARD
* more extensive use of WITH_JOB_GUARD and JOB_LOCK_GUARD

RFC v2:
* use JOB_LOCK_GUARD and WITH_JOB_LOCK_GUARD
* mu(u)ltiple typos in commit messages
* job API split patches are sent separately in another series
* use of empty job_{lock/unlock} and JOB_LOCK_GUARD/WITH_JOB_LOCK_GUARD
  to avoid deadlocks and simplify the reviewer job
* move patch 11 (block_job_query: remove atomic read) as last

Emanuele Giuseppe Esposito (19):
  job.c: make job_mutex and job_lock/unlock() public
  job.h: categorize fields in struct Job
  job.c: API functions not used outside should be static
  aio-wait.h: introduce AIO_WAIT_WHILE_UNLOCKED
  job.c: add job_lock/unlock while keeping job.h intact
  job.h: define functions called without job lock held
  job.h: add _locked public functions
  blockjob.h: introduce block_job  _locked() APIs
  blockjob: rename notifier callbacks as _locked
  jobs: add job lock in find_* functions
  jobs: use job locks also in the unit tests
  block/mirror.c: use of job helpers in drivers to avoid TOC/TOU
  jobs: group together API calls under the same job lock
  commit and mirror: create new nodes using bdrv_get_aio_context, and
not the job aiocontext
  jobs: protect job.aio_context with BQL and job_mutex
  job.c: enable job lock/unlock and remove Aiocontext locks
  block_job_query: remove atomic read
  blockjob: remove unused functions
  job: remove unused functions

Paolo Bonzini (1):
  job: detect change of aiocontext within job coroutine

 block.c  |  20 +-
 block/commit.c   |   4 +-
 block/mirror.c   |  21 +-
 block/replication.c  |   6 +-
 blockdev.c   | 129 +++---
 blockjob.c   | 126 +++---
 include/block/aio-wait.h |  17 +-
 include/block/blockjob.h |  28 +-
 include/qemu/job.h   | 268 +
 job-qmp.c|  87 ++--
 job.c| 654 +++
 monitor/qmp-cmds.c   |   7 +-
 qemu-img.c   |  41 +-
 tests/unit/test-bdrv-drain.c |

[PATCH v8 14/20] commit and mirror: create new nodes using bdrv_get_aio_context, and not the job aiocontext

2022-06-29 Thread Emanuele Giuseppe Esposito

We are always using the given bs AioContext, so there is no need
to take the job's one (which is identical anyway).
This also reduces the number of places we need to check when
protecting the job.aio_context field.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block/commit.c | 4 ++--
 block/mirror.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index 851d1c557a..336f799172 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -370,7 +370,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 goto fail;
 }
 
-s->base = blk_new(s->common.job.aio_context,
+s->base = blk_new(bdrv_get_aio_context(bs),
   base_perms,
   BLK_PERM_CONSISTENT_READ
   | BLK_PERM_WRITE_UNCHANGED);
@@ -382,7 +382,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 s->base_bs = base;
 
 /* Required permissions are already taken with block_job_add_bdrv() */
-s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
+s->top = blk_new(bdrv_get_aio_context(bs), 0, BLK_PERM_ALL);
 ret = blk_insert_bs(s->top, top, errp);
 if (ret < 0) {
 goto fail;
diff --git a/block/mirror.c b/block/mirror.c
index b38676e19d..1977e25171 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1728,7 +1728,7 @@ static BlockJob *mirror_start_job(
 goto fail;
 }
 
-s->target = blk_new(s->common.job.aio_context,
+s->target = blk_new(bdrv_get_aio_context(bs),
 target_perms, target_shared_perms);
 ret = blk_insert_bs(s->target, target, errp);
 if (ret < 0) {
-- 
2.31.1

Re: [PATCH 12/14] aspeed: Make aspeed_board_init_flashes public

2022-06-29 Thread Alex Bennée



Cédric Le Goater  writes:

> On 6/24/22 18:50, Cédric Le Goater wrote:
>> On 6/23/22 20:43, Peter Delevoryas wrote:
>>>
>>>
 On Jun 23, 2022, at 8:09 AM, Cédric Le Goater  wrote:

 On 6/23/22 12:26, Peter Delevoryas wrote:
> Signed-off-by: Peter Delevoryas 

 Let's start simple without flash support. We should be able to
 load FW blobs in each CPU address space using loader devices.
>>>
>>> Actually, I was unable to do this, perhaps because the fb OpenBMC
>>> boot sequence is a little weird. I specifically _needed_ to have
>>> a flash device which maps the firmware in at 0x2000_, because
>>> the fb OpenBMC U-Boot SPL jumps to that address to start executing
>>> from flash? I think this is also why fb OpenBMC machines can be so slow.
>>>
>>> $ ./build/qemu-system-arm -machine fby35 \
>>>  -device loader,file=fby35.mtd,addr=0,cpu-num=0 -nographic \
>>>  -d int -drive file=fby35.mtd,format=raw,if=mtd
>> Ideally we should be booting from the flash device directly using
>> the machine option '-M ast2600-evb,execute-in-place=true' like HW
>> does. Instructions are fetched using SPI transfers. But the amount
>> of code generated is tremendous.

Yeah, because there is a potential race when reading from HW, we throw
away TBs after executing them: we have no way of knowing if the code
has changed under our feet. See 873d64ac30 (accel/tcg: re-factor non-RAM
execution code) which cleaned up this handling.

>> See some profiling below for a
>> run which barely reaches DRAM training in U-Boot.
>
> Some more profiling on both ast2500 and ast2600 machines shows :
>
>
> * ast2600-evb,execute-in-place=true :
>
> Type   Object  Call siteWait Time (s) 
> Count  Average (us)
> -
> BQL mutex  0x564dc03922e0  accel/tcg/cputlb.c:1365   14.21443
> 32909927  0.43

This is unavoidable as a HW access needs the BQL held, so we will go
through this cycle for every executed instruction.

Did I miss why the flash contents are not mapped into the physical
address space? Isn't that how it appears to the processor?

> condvar0x564dc0f02988  util/thread-pool.c:90 10.02312
> 56 178984.32
> condvar[   2]  softmmu/cpus.c:423 0.10051 
> 6  16752.04
> BQL mutex  0x564dc03922e0  util/rcu.c:269 0.04372 
> 4  10930.60
> BQL mutex  0x564dc03922e0  cpus-common.c:341  0.00151 
> 8189.16
> condvar0x564dc0390360  cpus-common.c:176  0.00092 
> 8115.04
> condvar0x564dc0392280  softmmu/cpus.c:642 0.00013 
> 2 65.04
> condvar0x564dc0392240  softmmu/cpus.c:571 0.00010 
> 2 49.54
> BQL mutex  0x564dc03922e0  accel/tcg/cputlb.c:14260.6   
> 467  0.14
> condvar0x564dc03903a0  cpus-common.c:206  0.4 
> 8  5.28
> -
>
>
> * ast2500-evb,execute-in-place=true :
>
> Type   Object  Call siteWait Time (s) 
> Count  Average (us)
> -
> condvar0x55a581137f88  util/thread-pool.c:90 10.01158
> 28 357556.50
> BQL mutex  0x55a57f0e02e0  accel/tcg/cputlb.c:13650.29886  
> 14394475  0.02
> condvar0x55a5814cb5a0  softmmu/cpus.c:423 0.02182 
> 2  10912.44
> BQL mutex  0x55a57f0e02e0  util/rcu.c:269 0.01420 
> 4   3549.56
> mutex  0x55a5813381c0  tcg/region.c:204   0.7  
> 3052  0.02
> condvar0x55a57f0e0280  softmmu/cpus.c:642 0.6 
> 1 59.79
> mutex  [   2]  chardev/char.c:118 0.3  
> 1492  0.02
> BQL mutex  0x55a57f0e02e0  util/main-loop.c:318   0.2
> 34  0.72
> BQL mutex  0x55a57f0e02e0  accel/tcg/cputlb.c:14260.2   
> 973  0.02
> condvar0x55a57f0e0240  softmmu/cpus.c:571 0.2 
> 1 15.16
> -
>
> C.
>
>
>
>> * execute-in-place=true
>> Each sample counts as 0.01 seconds.
>>    %   cumulative   self  self total
>>   time   seconds   seconds    calls  ns/call  ns/call  name
>> 100.00  0.02 0.02   164276   121.75   121.75  
>> memory_region_init_rom_device
>>    0.00  0.02 0.00 1610346008 0.00 0.00  tcg_code_capacity
>>    0.00  0.02 0.00 567612621 0.00 0.00  
>> type_register_static_array
>>    0.00  0.02 0.00

[PATCH v8 09/20] blockjob: rename notifier callbacks as _locked

2022-06-29 Thread Emanuele Giuseppe Esposito

They are all called with job_lock held, in job_event_*_locked().

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 0d59aba439..70952879d8 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -247,7 +247,8 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 return 0;
 }
 
-static void block_job_on_idle(Notifier *n, void *opaque)
+/* Called with job_mutex lock held. */
+static void block_job_on_idle_locked(Notifier *n, void *opaque)
 {
 aio_wait_kick();
 }
@@ -367,7 +368,8 @@ static void block_job_iostatus_set_err(BlockJob *job, int 
error)
 }
 }
 
-static void block_job_event_cancelled(Notifier *n, void *opaque)
+/* Called with job_mutex lock held. */
+static void block_job_event_cancelled_locked(Notifier *n, void *opaque)
 {
 BlockJob *job = opaque;
 uint64_t progress_current, progress_total;
@@ -386,7 +388,8 @@ static void block_job_event_cancelled(Notifier *n, void 
*opaque)
 job->speed);
 }
 
-static void block_job_event_completed(Notifier *n, void *opaque)
+/* Called with job_mutex lock held. */
+static void block_job_event_completed_locked(Notifier *n, void *opaque)
 {
 BlockJob *job = opaque;
 const char *msg = NULL;
@@ -412,7 +415,8 @@ static void block_job_event_completed(Notifier *n, void 
*opaque)
 msg);
 }
 
-static void block_job_event_pending(Notifier *n, void *opaque)
+/* Called with job_mutex lock held. */
+static void block_job_event_pending_locked(Notifier *n, void *opaque)
 {
 BlockJob *job = opaque;
 
@@ -424,7 +428,8 @@ static void block_job_event_pending(Notifier *n, void 
*opaque)
   job->job.id);
 }
 
-static void block_job_event_ready(Notifier *n, void *opaque)
+/* Called with job_mutex lock held. */
+static void block_job_event_ready_locked(Notifier *n, void *opaque)
 {
 BlockJob *job = opaque;
 uint64_t progress_current, progress_total;
@@ -469,11 +474,11 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 
 ratelimit_init(>limit);
 
-job->finalize_cancelled_notifier.notify = block_job_event_cancelled;
-job->finalize_completed_notifier.notify = block_job_event_completed;
-job->pending_notifier.notify = block_job_event_pending;
-job->ready_notifier.notify = block_job_event_ready;
-job->idle_notifier.notify = block_job_on_idle;
+job->finalize_cancelled_notifier.notify = block_job_event_cancelled_locked;
+job->finalize_completed_notifier.notify = block_job_event_completed_locked;
+job->pending_notifier.notify = block_job_event_pending_locked;
+job->ready_notifier.notify = block_job_event_ready_locked;
+job->idle_notifier.notify = block_job_on_idle_locked;
 
 notifier_list_add(>job.on_finalize_cancelled,
   >finalize_cancelled_notifier);
-- 
2.31.1

[PATCH v8 06/20] job.h: define functions called without job lock held

2022-06-29 Thread Emanuele Giuseppe Esposito

These functions don't need a _locked() counterpart, since
they are all called outside job.c and take the lock only
internally.

Also update the comments in blockjob.c (and move them to job.c).

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

No functional change intended.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 20 
 include/qemu/job.h | 37 ++---
 job.c  | 15 +++
 3 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 4868453d74..7da59a1f1c 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -36,21 +36,6 @@
 #include "qemu/main-loop.h"
 #include "qemu/timer.h"
 
-/*
- * The block job API is composed of two categories of functions.
- *
- * The first includes functions used by the monitor.  The monitor is
- * peculiar in that it accesses the block job list with block_job_get, and
- * therefore needs consistency across block_job_get and the actual operation
- * (e.g. block_job_set_speed).  The consistency is achieved with
- * aio_context_acquire/release.  These functions are declared in blockjob.h.
- *
- * The second includes functions used by the block job drivers and sometimes
- * by the core block layer.  These do not care about locking, because the
- * whole coroutine runs under the AioContext lock, and are declared in
- * blockjob_int.h.
- */
-
 static bool is_block_job(Job *job)
 {
 return job_type(job) == JOB_TYPE_BACKUP ||
@@ -433,11 +418,6 @@ static void block_job_event_ready(Notifier *n, void 
*opaque)
 }
 
 
-/*
- * API for block job drivers and the block layer.  These functions are
- * declared in blockjob_int.h.
- */
-
 void *block_job_create(const char *job_id, const BlockJobDriver *driver,
JobTxn *txn, BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, int64_t speed, int flags,
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 99960cc9a3..b714236c1a 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -363,6 +363,7 @@ void job_txn_unref_locked(JobTxn *txn);
 
 /**
  * Create a new long-running job and return it.
+ * Called with job_mutex *not* held.
  *
  * @job_id: The id of the newly-created job, or %NULL for internal jobs
  * @driver: The class object for the newly-created job.
@@ -400,6 +401,8 @@ void job_unref_locked(Job *job);
  * @done: How much progress the job made since the last call
  *
  * Updates the progress counter of the job.
+ *
+ * Progress API is thread safe.
  */
 void job_progress_update(Job *job, uint64_t done);
 
@@ -410,6 +413,8 @@ void job_progress_update(Job *job, uint64_t done);
  *
  * Sets the expected end value of the progress counter of a job so that a
  * completion percentage can be calculated when the progress is updated.
+ *
+ * Progress API is thread safe.
  */
 void job_progress_set_remaining(Job *job, uint64_t remaining);
 
@@ -425,6 +430,8 @@ void job_progress_set_remaining(Job *job, uint64_t 
remaining);
  * length before, and job_progress_update() afterwards.
  * (So the operation acts as a parenthesis in regards to the main job
  * operation running in background.)
+ *
+ * Progress API is thread safe.
  */
 void job_progress_increase_remaining(Job *job, uint64_t delta);
 
@@ -443,6 +450,8 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job));
  *
  * Begins execution of a job.
  * Takes ownership of one reference to the job object.
+ *
+ * Called with job_mutex *not* held.
  */
 void job_start(Job *job);
 
@@ -450,6 +459,7 @@ void job_start(Job *job);
  * @job: The job to enter.
  *
  * Continue the specified job by entering the coroutine.
+ * Called with job_mutex *not* held.
  */
 void job_enter(Job *job);
 
@@ -458,6 +468,9 @@ void job_enter(Job *job);
  *
  * Pause now if job_pause() has been called. Jobs that perform lots of I/O
  * must call this between requests so that the job can be paused.
+ *
+ * Called with job_mutex *not* held (we don't want the coroutine
+ * to yield with the lock held!).
  */
 void coroutine_fn job_pause_point(Job *job);
 
@@ -465,6 +478,8 @@ void coroutine_fn job_pause_point(Job *job);
  * @job: The job that calls the function.
  *
  * Yield the job coroutine.
+ * Called with job_mutex *not* held (we don't want the coroutine
+ * to yield with the lock held!).
  */
 void job_yield(Job *job);
 
@@ -475,6 +490,9 @@ void job_yield(Job *job);
  * Put the job to sleep (assuming that it wasn't canceled) for @ns
  * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will immediately
  * interrupt the wait.
+ *
+ * Called with job_mutex *not* held (we don't want the coroutine
+ * to yield with the lock held!).
  */
 void coroutine_fn job_sleep_ns(Job *job, int64_t ns);
 
@@ -496,6 +514,7 @@ bool job_is_cancelled_locked(Job *job);
 /**
  * Returns whether the job is scheduled for cancellation (at an
  * indefinite point).
+ * Called with job_mutex *not* held.
  */
 bool

[PATCH v8 16/20] jobs: protect job.aio_context with BQL and job_mutex

2022-06-29 Thread Emanuele Giuseppe Esposito

In order to make it thread safe, implement a "fake rwlock",
where we allow reads with either the BQL *or* job_mutex held, but
writes only with both the BQL *and* job_mutex held.

The only write we have is in child_job_set_aio_ctx, which always
happens under drain (so the job is paused).
For this reason, introduce job_set_aio_context and make sure that
the context is set under BQL, job_mutex and drain.
Also make sure all other places where the aiocontext is read
are protected.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Suggested-by: Paolo Bonzini 
Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/replication.c |  6 --
 blockjob.c  |  3 ++-
 include/qemu/job.h  | 19 ++-
 job.c   | 12 
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 55c8f894aa..2189863df1 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -148,8 +148,10 @@ static void replication_close(BlockDriverState *bs)
 }
 if (s->stage == BLOCK_REPLICATION_FAILOVER) {
 commit_job = >commit_job->job;
-assert(commit_job->aio_context == qemu_get_current_aio_context());
-job_cancel_sync(commit_job, false);
+WITH_JOB_LOCK_GUARD() {
+assert(commit_job->aio_context == qemu_get_current_aio_context());
+job_cancel_sync_locked(commit_job, false);
+}
 }
 
 if (s->mode == REPLICATION_MODE_SECONDARY) {
diff --git a/blockjob.c b/blockjob.c
index 1075def475..2293a00b4a 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -166,12 +166,13 @@ static void child_job_set_aio_ctx(BdrvChild *c, 
AioContext *ctx,
 bdrv_set_aio_context_ignore(sibling->bs, ctx, ignore);
 }
 
-job->job.aio_context = ctx;
+job_set_aio_context(>job, ctx);
 }
 
 static AioContext *child_job_get_parent_aio_context(BdrvChild *c)
 {
 BlockJob *job = c->opaque;
+assert(qemu_in_main_thread());
 
 return job->job.aio_context;
 }
diff --git a/include/qemu/job.h b/include/qemu/job.h
index e887f88cb2..8f13c3de61 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -77,7 +77,12 @@ typedef struct Job {
 
 /** Protected by AioContext lock */
 
-/** AioContext to run the job coroutine in */
+/**
+ * AioContext to run the job coroutine in.
+ * This field can be read when holding either the BQL (so we are in
+ * the main loop) or the job_mutex.
+ * It can be only written when we hold *both* BQL and job_mutex.
+ */
 AioContext *aio_context;
 
 /** Reference count of the block job */
@@ -728,4 +733,16 @@ int job_finish_sync(Job *job, void (*finish)(Job *, Error 
**errp),
 int job_finish_sync_locked(Job *job, void (*finish)(Job *, Error **errp),
Error **errp);
 
+/**
+ * Sets the @job->aio_context.
+ * Called with job_mutex *not* held.
+ *
+ * This function must run in the main thread to protect against
+ * concurrent read in job_finish_sync_locked(),
+ * takes the job_mutex lock to protect against the read in
+ * job_do_yield_locked(), and must be called when the coroutine
+ * is quiescent.
+ */
+void job_set_aio_context(Job *job, AioContext *ctx);
+
 #endif
diff --git a/job.c b/job.c
index 8db80b8086..ad2badd107 100644
--- a/job.c
+++ b/job.c
@@ -394,6 +394,17 @@ Job *job_get(const char *id)
 return job_get_locked(id);
 }
 
+void job_set_aio_context(Job *job, AioContext *ctx)
+{
+/* protect against read in job_finish_sync_locked and job_start */
+assert(qemu_in_main_thread());
+/* protect against read in job_do_yield_locked */
+JOB_LOCK_GUARD();
+/* ensure the coroutine is quiescent while the AioContext is changed */
+assert(job->pause_count > 0);
+job->aio_context = ctx;
+}
+
 /* Called with job_mutex *not* held. */
 static void job_sleep_timer_cb(void *opaque)
 {
@@ -1379,6 +1390,7 @@ int job_finish_sync_locked(Job *job,
 {
 Error *local_err = NULL;
 int ret;
+assert(qemu_in_main_thread());
 
 job_ref_locked(job);
 
-- 
2.31.1

[PATCH v8 20/20] job: remove unused functions

2022-06-29 Thread Emanuele Giuseppe Esposito

These public functions are not used anywhere, thus can be dropped.
Also, since this is the final job API that doesn't use AioContext
lock and replaces it with job_lock, adjust all remaining function
documentation to clearly specify if the job lock is taken or not.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/job.h |  97 +--
 job.c  | 122 ++---
 2 files changed, 40 insertions(+), 179 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 5db35f765c..f3d0694512 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -361,6 +361,8 @@ JobTxn *job_txn_new(void);
 /**
  * Release a reference that was previously acquired with job_txn_add_job or
  * job_txn_new. If it's the last reference to the object, it will be freed.
+ *
+ * Called with job lock *not* held.
  */
 void job_txn_unref(JobTxn *txn);
 
@@ -387,19 +389,17 @@ void *job_create(const char *job_id, const JobDriver 
*driver, JobTxn *txn,
 /**
  * Add a reference to Job refcnt, it will be decreased with job_unref, and then
  * be freed if it comes to be the last reference.
+ *
+ * Called with job lock held.
  */
-void job_ref(Job *job);
-
-/* Same as job_ref(), but called with job lock held. */
 void job_ref_locked(Job *job);
 
 /**
  * Release a reference that was previously acquired with job_ref() or
  * job_create(). If it's the last reference to the object, it will be freed.
+ *
+ * Called with job lock held.
  */
-void job_unref(Job *job);
-
-/* Same as job_unref(), but called with job lock held. */
 void job_unref_locked(Job *job);
 
 /**
@@ -445,10 +445,9 @@ void job_progress_increase_remaining(Job *job, uint64_t 
delta);
  * Conditionally enter the job coroutine if the job is ready to run, not
  * already busy and fn() returns true. fn() is called while under the job_lock
  * critical section.
+ *
+ * Called with job lock held, but might release it temporarily.
  */
-void job_enter_cond(Job *job, bool(*fn)(Job *job));
-
-/* Same as job_enter_cond(), but called with job lock held. */
 void job_enter_cond_locked(Job *job, bool(*fn)(Job *job));
 
 /**
@@ -529,11 +528,8 @@ bool job_cancel_requested(Job *job);
 
 /**
  * Returns whether the job is in a completed state.
- * Called with job_mutex *not* held.
+ * Called with job lock held.
  */
-bool job_is_completed(Job *job);
-
-/* Same as job_is_completed(), but called with job lock held. */
 bool job_is_completed_locked(Job *job);
 
 /**
@@ -549,40 +545,35 @@ bool job_is_ready_locked(Job *job);
  * Request @job to pause at the next pause point. Must be paired with
  * job_resume(). If the job is supposed to be resumed by user action, call
  * job_user_pause() instead.
+ *
+ * Called with job lock held.
  */
-void job_pause(Job *job);
-
-/* Same as job_pause(), but called with job lock held. */
 void job_pause_locked(Job *job);
 
-/** Resumes a @job paused with job_pause. */
-void job_resume(Job *job);
-
-/* Same as job_resume(), but called with job lock held. */
+/**
+ * Resumes a @job paused with job_pause.
+ * Called with job lock held, but might release it temporarily.
+ */
 void job_resume_locked(Job *job);
 
 /**
  * Asynchronously pause the specified @job.
  * Do not allow a resume until a matching call to job_user_resume.
+ * Called with job lock held.
  */
-void job_user_pause(Job *job, Error **errp);
-
-/* Same as job_user_pause(), but called with job lock held. */
 void job_user_pause_locked(Job *job, Error **errp);
 
-/** Returns true if the job is user-paused. */
-bool job_user_paused(Job *job);
-
-/* Same as job_user_paused(), but called with job lock held. */
+/**
+ * Returns true if the job is user-paused.
+ * Called with job lock held.
+ */
 bool job_user_paused_locked(Job *job);
 
 /**
  * Resume the specified @job.
  * Must be paired with a preceding job_user_pause.
+ * Called with job lock held.
  */
-void job_user_resume(Job *job, Error **errp);
-
-/* Same as job_user_resume(), but called with job lock held. */
 void job_user_resume_locked(Job *job, Error **errp);
 
 /**
@@ -590,30 +581,25 @@ void job_user_resume_locked(Job *job, Error **errp);
  * first one if @job is %NULL.
  *
  * Returns the requested job, or %NULL if there are no more jobs left.
+ * Called with job lock held.
  */
-Job *job_next(Job *job);
-
-/* Same as job_next(), but called with job lock held. */
 Job *job_next_locked(Job *job);
 
 /**
  * Get the job identified by @id (which must not be %NULL).
  *
  * Returns the requested job, or %NULL if it doesn't exist.
+ * Called with job lock held.
  */
-Job *job_get(const char *id);
-
-/* Same as job_get(), but called with job lock held. */
 Job *job_get_locked(const char *id);
 
 /**
  * Check whether the verb @verb can be applied to @job in its current state.
  * Returns 0 if the verb can be applied; otherwise errp is set and -EPERM
  * returned.
+ *
+ * Called with job lock held.
  */
-int job_apply_verb(Job *job, JobVerb verb, Error

[PATCH v8 02/20] job.h: categorize fields in struct Job

2022-06-29 Thread Emanuele Giuseppe Esposito

Categorize the fields in struct Job to understand which ones
need to be protected by the job mutex and which don't.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/qemu/job.h | 61 +++---
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index d1192ffd61..876e13d549 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -40,27 +40,52 @@ typedef struct JobTxn JobTxn;
  * Long-running operation.
  */
 typedef struct Job {
+
+/* Fields set at initialization (job_create), and never modified */
+
 /** The ID of the job. May be NULL for internal jobs. */
 char *id;
 
-/** The type of this job. */
+/**
+ * The type of this job.
+ * All callbacks are called with job_mutex *not* held.
+ */
 const JobDriver *driver;
 
-/** Reference count of the block job */
-int refcnt;
-
-/** Current state; See @JobStatus for details. */
-JobStatus status;
-
-/** AioContext to run the job coroutine in */
-AioContext *aio_context;
-
 /**
  * The coroutine that executes the job.  If not NULL, it is reentered when
  * busy is false and the job is cancelled.
+ * Initialized in job_start()
  */
 Coroutine *co;
 
+/** True if this job should automatically finalize itself */
+bool auto_finalize;
+
+/** True if this job should automatically dismiss itself */
+bool auto_dismiss;
+
+/** The completion function that will be called when the job completes.  */
+BlockCompletionFunc *cb;
+
+/** The opaque value that is passed to the completion function.  */
+void *opaque;
+
+/* ProgressMeter API is thread-safe */
+ProgressMeter progress;
+
+
+/** Protected by AioContext lock */
+
+/** AioContext to run the job coroutine in */
+AioContext *aio_context;
+
+/** Reference count of the block job */
+int refcnt;
+
+/** Current state; See @JobStatus for details. */
+JobStatus status;
+
 /**
  * Timer that is used by @job_sleep_ns. Accessed under job_mutex (in
  * job.c).
@@ -112,14 +137,6 @@ typedef struct Job {
 /** Set to true when the job has deferred work to the main loop. */
 bool deferred_to_main_loop;
 
-/** True if this job should automatically finalize itself */
-bool auto_finalize;
-
-/** True if this job should automatically dismiss itself */
-bool auto_dismiss;
-
-ProgressMeter progress;
-
 /**
  * Return code from @run and/or @prepare callback(s).
  * Not final until the job has reached the CONCLUDED status.
@@ -134,12 +151,6 @@ typedef struct Job {
  */
 Error *err;
 
-/** The completion function that will be called when the job completes.  */
-BlockCompletionFunc *cb;
-
-/** The opaque value that is passed to the completion function.  */
-void *opaque;
-
 /** Notifiers called when a cancelled job is finalised */
 NotifierList on_finalize_cancelled;
 
@@ -167,6 +178,7 @@ typedef struct Job {
 
 /**
  * Callbacks and other information about a Job driver.
+ * All callbacks are invoked with job_mutex *not* held.
  */
 struct JobDriver {
 
@@ -472,7 +484,6 @@ void job_yield(Job *job);
  */
 void coroutine_fn job_sleep_ns(Job *job, int64_t ns);
 
-
 /** Returns the JobType of a given Job. */
 JobType job_type(const Job *job);
 
-- 
2.31.1

[PATCH v8 08/20] blockjob.h: introduce block_job _locked() APIs

2022-06-29 Thread Emanuele Giuseppe Esposito

Just as done with job.h, create _locked() functions in blockjob.h.

These functions will be useful later, when the caller has already taken
the lock. All blockjob _locked() functions call job _locked() functions.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c   | 52 
 include/block/blockjob.h | 15 
 2 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 7da59a1f1c..0d59aba439 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -44,21 +44,27 @@ static bool is_block_job(Job *job)
job_type(job) == JOB_TYPE_STREAM;
 }
 
-BlockJob *block_job_next(BlockJob *bjob)
+BlockJob *block_job_next_locked(BlockJob *bjob)
 {
 Job *job = bjob ? &bjob->job : NULL;
 GLOBAL_STATE_CODE();
 
 do {
-job = job_next(job);
+job = job_next_locked(job);
 } while (job && !is_block_job(job));
 
 return job ? container_of(job, BlockJob, job) : NULL;
 }
 
-BlockJob *block_job_get(const char *id)
+BlockJob *block_job_next(BlockJob *bjob)
 {
-Job *job = job_get(id);
+JOB_LOCK_GUARD();
+return block_job_next_locked(bjob);
+}
+
+BlockJob *block_job_get_locked(const char *id)
+{
+Job *job = job_get_locked(id);
 GLOBAL_STATE_CODE();
 
 if (job && is_block_job(job)) {
@@ -68,6 +74,12 @@ BlockJob *block_job_get(const char *id)
 }
 }
 
+BlockJob *block_job_get(const char *id)
+{
+JOB_LOCK_GUARD();
+return block_job_get_locked(id);
+}
+
 void block_job_free(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
@@ -256,14 +268,14 @@ static bool job_timer_pending(Job *job)
 return timer_pending(&job->sleep_timer);
 }
 
-bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
+bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp)
 {
 const BlockJobDriver *drv = block_job_driver(job);
 int64_t old_speed = job->speed;
 
 GLOBAL_STATE_CODE();
 
-if (job_apply_verb(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
+if (job_apply_verb_locked(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
 return false;
 }
 if (speed < 0) {
@@ -277,7 +289,9 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 job->speed = speed;
 
 if (drv->set_speed) {
+job_unlock();
 drv->set_speed(job, speed);
+job_lock();
 }
 
 if (speed && speed <= old_speed) {
@@ -285,18 +299,24 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 }
 
 /* kick only if a timer is pending */
-job_enter_cond(&job->job, job_timer_pending);
+job_enter_cond_locked(&job->job, job_timer_pending);
 
 return true;
 }
 
+bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+JOB_LOCK_GUARD();
+return block_job_set_speed_locked(job, speed, errp);
+}
+
 int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
 {
 IO_CODE();
 return ratelimit_calculate_delay(&job->limit, n);
 }
 
-BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
+BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
 {
 BlockJobInfo *info;
 uint64_t progress_current, progress_total;
@@ -320,7 +340,7 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 info->len   = progress_total;
 info->speed = job->speed;
 info->io_status = job->iostatus;
-info->ready = job_is_ready(&job->job),
+info->ready = job_is_ready_locked(&job->job),
 info->status= job->job.status;
 info->auto_finalize = job->job.auto_finalize;
 info->auto_dismiss  = job->job.auto_dismiss;
@@ -333,6 +353,12 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 return info;
 }
 
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
+{
+JOB_LOCK_GUARD();
+return block_job_query_locked(job, errp);
+}
+
 static void block_job_iostatus_set_err(BlockJob *job, int error)
 {
 if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
@@ -478,7 +504,7 @@ fail:
 return NULL;
 }
 
-void block_job_iostatus_reset(BlockJob *job)
+void block_job_iostatus_reset_locked(BlockJob *job)
 {
 GLOBAL_STATE_CODE();
 if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
@@ -488,6 +514,12 @@ void block_job_iostatus_reset(BlockJob *job)
 job->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
 }
 
+void block_job_iostatus_reset(BlockJob *job)
+{
+JOB_LOCK_GUARD();
+block_job_iostatus_reset_locked(job);
+}
+
 void block_job_user_resume(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 6525e16fd5..3959a98612 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -92,6 +92,9 @@ typedef struct BlockJob {
  */
 BlockJob *block_job_next(BlockJob *job);
 
+/* Same as block_job_next(), but called with job lock held. */
+BlockJob *block_job_next_locked(BlockJob *job);
+

1 2 3 >

1 - 100 of 230 matches

Mail list logo