[PATCH] backends/cryptodev-builtin: Fix local_error leaks

2024-04-22 Thread Li Zhijian via
It seems that this error does not need to be propagated to the upper
layer, so report the error directly to avoid the leaks.

Closes: https://gitlab.com/qemu-project/qemu/-/issues/2283
Signed-off-by: Li Zhijian 
---
 backends/cryptodev-builtin.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index a514bbb310..940104ee55 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -23,6 +23,7 @@
 
 #include "qemu/osdep.h"
 #include "sysemu/cryptodev.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "standard-headers/linux/virtio_crypto.h"
 #include "crypto/cipher.h"
@@ -396,8 +397,8 @@ static int cryptodev_builtin_create_session(
 case VIRTIO_CRYPTO_HASH_CREATE_SESSION:
 case VIRTIO_CRYPTO_MAC_CREATE_SESSION:
 default:
-error_setg(&local_error, "Unsupported opcode :%" PRIu32 "",
-   sess_info->op_code);
+error_report("Unsupported opcode :%" PRIu32 "",
+ sess_info->op_code);
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
 
@@ -554,8 +555,8 @@ static int cryptodev_builtin_operation(
 
 if (op_info->session_id >= MAX_NUM_SESSIONS ||
   builtin->sessions[op_info->session_id] == NULL) {
-error_setg(&local_error, "Cannot find a valid session id: %" PRIu64 "",
-   op_info->session_id);
+error_report("Cannot find a valid session id: %" PRIu64 "",
+ op_info->session_id);
 return -VIRTIO_CRYPTO_INVSESS;
 }
 
-- 
2.31.1




[PATCH v2] migration/colo: Fix bdrv_graph_rdlock_main_loop: Assertion `!qemu_in_coroutine()' failed.

2024-04-16 Thread Li Zhijian via
bdrv_activate_all() should not be called from coroutine context; move
it to the QEMU thread colo_process_incoming_thread(), protected by
bql_lock().

The backtrace is as follows:
 #4  0x561af7948362 in bdrv_graph_rdlock_main_loop () at 
../block/graph-lock.c:260
 #5  0x561af7907a68 in graph_lockable_auto_lock_mainloop (x=0x7fd29810be7b) 
at /patch/to/qemu/include/block/graph-lock.h:259
 #6  0x561af79167d1 in bdrv_activate_all (errp=0x7fd29810bed0) at 
../block.c:6906
 #7  0x561af762b4af in colo_incoming_co () at ../migration/colo.c:935
 #8  0x561af7607e57 in process_incoming_migration_co (opaque=0x0) at 
../migration/migration.c:793
 #9  0x561af7adbeeb in coroutine_trampoline (i0=-106876144, i1=22042) at 
../util/coroutine-ucontext.c:175
 #10 0x7fd2a5cf21c0 in  () at /lib64/libc.so.6

CC: Fabiano Rosas 
Closes: https://gitlab.com/qemu-project/qemu/-/issues/2277
Fixes: 2b3912f135 ("block: Mark bdrv_first_blk() and bdrv_is_root_node() 
GRAPH_RDLOCK")
Signed-off-by: Li Zhijian 
---
V2: fix missing bql_unlock() in error path.
---
 migration/colo.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index 84632a603e..5600a43d78 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -835,6 +835,16 @@ static void *colo_process_incoming_thread(void *opaque)
 return NULL;
 }
 
+/* Make sure all file formats throw away their mutable metadata */
+bql_lock();
+bdrv_activate_all(&local_err);
+if (local_err) {
+bql_unlock();
+error_report_err(local_err);
+return NULL;
+}
+bql_unlock();
+
 failover_init_state();
 
 mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
@@ -922,7 +932,6 @@ out:
 int coroutine_fn colo_incoming_co(void)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
-Error *local_err = NULL;
 QemuThread th;
 
 assert(bql_locked());
@@ -931,13 +940,6 @@ int coroutine_fn colo_incoming_co(void)
 return 0;
 }
 
-/* Make sure all file formats throw away their mutable metadata */
-bdrv_activate_all(&local_err);
-if (local_err) {
-error_report_err(local_err);
-return -EINVAL;
-}
-
 qemu_thread_create(, "COLO incoming", colo_process_incoming_thread,
mis, QEMU_THREAD_JOINABLE);
 
-- 
2.31.1




[PATCH] migration/colo: Fix bdrv_graph_rdlock_main_loop: Assertion `!qemu_in_coroutine()' failed.

2024-04-16 Thread Li Zhijian via
bdrv_activate_all() should not be called from coroutine context; move
it to the QEMU thread colo_process_incoming_thread(), protected by
bql_lock().

The backtrace is as follows:
 #4  0x561af7948362 in bdrv_graph_rdlock_main_loop () at 
../block/graph-lock.c:260
 #5  0x561af7907a68 in graph_lockable_auto_lock_mainloop (x=0x7fd29810be7b) 
at /patch/to/qemu/include/block/graph-lock.h:259
 #6  0x561af79167d1 in bdrv_activate_all (errp=0x7fd29810bed0) at 
../block.c:6906
 #7  0x561af762b4af in colo_incoming_co () at ../migration/colo.c:935
 #8  0x561af7607e57 in process_incoming_migration_co (opaque=0x0) at 
../migration/migration.c:793
 #9  0x561af7adbeeb in coroutine_trampoline (i0=-106876144, i1=22042) at 
../util/coroutine-ucontext.c:175
 #10 0x7fd2a5cf21c0 in  () at /lib64/libc.so.6

CC: Fabiano Rosas 
Closes: https://gitlab.com/qemu-project/qemu/-/issues/2277
Fixes: 2b3912f135 ("block: Mark bdrv_first_blk() and bdrv_is_root_node() 
GRAPH_RDLOCK")
Signed-off-by: Li Zhijian 
---
 migration/colo.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index 84632a603e..94942fba32 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -835,6 +835,15 @@ static void *colo_process_incoming_thread(void *opaque)
 return NULL;
 }
 
+/* Make sure all file formats throw away their mutable metadata */
+bql_lock();
+bdrv_activate_all(&local_err);
+if (local_err) {
+error_report_err(local_err);
+return NULL;
+}
+bql_unlock();
+
 failover_init_state();
 
 mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
@@ -922,7 +931,6 @@ out:
 int coroutine_fn colo_incoming_co(void)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
-Error *local_err = NULL;
 QemuThread th;
 
 assert(bql_locked());
@@ -931,13 +939,6 @@ int coroutine_fn colo_incoming_co(void)
 return 0;
 }
 
-/* Make sure all file formats throw away their mutable metadata */
-bdrv_activate_all(&local_err);
-if (local_err) {
-error_report_err(local_err);
-return -EINVAL;
-}
-
 qemu_thread_create(, "COLO incoming", colo_process_incoming_thread,
mis, QEMU_THREAD_JOINABLE);
 
-- 
2.31.1




[PATCH v2] hw/mem/cxl_type3: reset dvsecs in ct3d_reset()

2024-04-09 Thread Li Zhijian via
After the kernel commit
0cab68720598 ("cxl/pci: Fix disabling memory if DVSEC CXL Range does not match 
a CFMWS window")
CXL type3 devices cannot be enabled again after a reboot because the
control register (see 8.1.3.2 in the CXL specification 2.0 for more details)
was not reset.

These registers can be changed by the firmware or OS, so restore their
initial values on reboot so that the OS can read their clean status.

Fixes: e1706ea83da0 ("hw/cxl/device: Add a memory device (8.2.8.5)")
Signed-off-by: Li Zhijian 
---
root_port, usp and dsp have the same issue. If this patch gets approved,
I will send another patch to fix them later.

V2:
   Add fixes tag.
   Reset all dvsecs registers instead of CTRL only
---
 hw/mem/cxl_type3.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index b0a7e9f11b64..4f09d0b8fedc 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -30,6 +30,7 @@
 #include "hw/pci/msix.h"
 
 #define DWORD_BYTE 4
+#define CT3D_CAP_SN_OFFSET PCI_CONFIG_SPACE_SIZE
 
 /* Default CDAT entries for a memory region */
 enum {
@@ -284,6 +285,10 @@ static void build_dvsecs(CXLType3Dev *ct3d)
  range2_size_hi = 0, range2_size_lo = 0,
  range2_base_hi = 0, range2_base_lo = 0;
 
+cxl_cstate->dvsec_offset = CT3D_CAP_SN_OFFSET;
+if (ct3d->sn != UI64_NULL) {
+cxl_cstate->dvsec_offset += PCI_EXT_CAP_DSN_SIZEOF;
+}
 /*
  * Volatile memory is mapped as (0x0)
  * Persistent memory is mapped at (volatile->size)
@@ -664,10 +669,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 
 pcie_endpoint_cap_init(pci_dev, 0x80);
 if (ct3d->sn != UI64_NULL) {
-pcie_dev_ser_num_init(pci_dev, 0x100, ct3d->sn);
-cxl_cstate->dvsec_offset = 0x100 + 0x0c;
-} else {
-cxl_cstate->dvsec_offset = 0x100;
+pcie_dev_ser_num_init(pci_dev, CT3D_CAP_SN_OFFSET, ct3d->sn);
 }
 
 ct3d->cxl_cstate.pdev = pci_dev;
@@ -907,6 +909,7 @@ static void ct3d_reset(DeviceState *dev)
 
 cxl_component_register_init_common(reg_state, write_msk, 
CXL2_TYPE3_DEVICE);
 cxl_device_register_init_t3(ct3d);
+build_dvsecs(ct3d);
 
 /*
  * Bring up an endpoint to target with MCTP over VDM.
-- 
2.29.2




[PATCH 1/2] CXL/cxl_type3: add first_dvsec_offset() helper

2024-04-01 Thread Li Zhijian via
It helps to figure out where the first DVSEC register is located. In
addition, replace the hard-coded offset and size with existing macros.

Signed-off-by: Li Zhijian 
---
 hw/mem/cxl_type3.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index b0a7e9f11b64..ad2fe7d463fb 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -643,6 +643,16 @@ static DOEProtocol doe_cdat_prot[] = {
 { }
 };
 
+static uint16_t first_dvsec_offset(CXLType3Dev *ct3d)
+{
+uint16_t offset = PCI_CONFIG_SPACE_SIZE;
+
+if (ct3d->sn != UI64_NULL)
+offset += PCI_EXT_CAP_DSN_SIZEOF;
+
+return offset;
+}
+
 static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 {
 ERRP_GUARD();
@@ -663,13 +673,10 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 pci_config_set_prog_interface(pci_conf, 0x10);
 
 pcie_endpoint_cap_init(pci_dev, 0x80);
-if (ct3d->sn != UI64_NULL) {
-pcie_dev_ser_num_init(pci_dev, 0x100, ct3d->sn);
-cxl_cstate->dvsec_offset = 0x100 + 0x0c;
-} else {
-cxl_cstate->dvsec_offset = 0x100;
-}
+if (ct3d->sn != UI64_NULL)
+pcie_dev_ser_num_init(pci_dev, PCI_CONFIG_SPACE_SIZE, ct3d->sn);
 
+cxl_cstate->dvsec_offset = first_dvsec_offset(ct3d);
 ct3d->cxl_cstate.pdev = pci_dev;
 build_dvsecs(ct3d);
 
-- 
2.29.2




[PATCH 2/2] CXL/cxl_type3: reset DVSEC CXL Control in ct3d_reset

2024-04-01 Thread Li Zhijian via
After the kernel commit
0cab68720598 ("cxl/pci: Fix disabling memory if DVSEC CXL Range does not match 
a CFMWS window")
CXL type3 devices cannot be enabled again after the reboot because this
flag was not reset.

This flag can be changed by the firmware or OS, so give it its
reset (default) value on reboot so that the OS can read its clean status.

Signed-off-by: Li Zhijian 
---
 hw/mem/cxl_type3.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index ad2fe7d463fb..3fe136053390 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -305,7 +305,8 @@ static void build_dvsecs(CXLType3Dev *ct3d)
 
 dvsec = (uint8_t *)&(CXLDVSECDevice){
 .cap = 0x1e,
-.ctrl = 0x2,
+#define CT3D_DEVSEC_CXL_CTRL 0x2
+.ctrl = CT3D_DEVSEC_CXL_CTRL,
 .status2 = 0x2,
 .range1_size_hi = range1_size_hi,
 .range1_size_lo = range1_size_lo,
@@ -906,6 +907,16 @@ MemTxResult cxl_type3_write(PCIDevice *d, hwaddr 
host_addr, uint64_t data,
 return address_space_write(as, dpa_offset, attrs, , size);
 }
 
+/* Reset DVSEC CXL Control */
+static void ct3d_dvsec_cxl_ctrl_reset(CXLType3Dev *ct3d)
+{
+uint16_t offset = first_dvsec_offset(ct3d);
+CXLDVSECDevice *dvsec;
+
+dvsec = (CXLDVSECDevice *)(ct3d->cxl_cstate.pdev->config + offset);
+dvsec->ctrl = CT3D_DEVSEC_CXL_CTRL;
+}
+
 static void ct3d_reset(DeviceState *dev)
 {
 CXLType3Dev *ct3d = CXL_TYPE3(dev);
@@ -914,6 +925,7 @@ static void ct3d_reset(DeviceState *dev)
 
 cxl_component_register_init_common(reg_state, write_msk, 
CXL2_TYPE3_DEVICE);
 cxl_device_register_init_t3(ct3d);
+ct3d_dvsec_cxl_ctrl_reset(ct3d);
 
 /*
  * Bring up an endpoint to target with MCTP over VDM.
-- 
2.29.2




[PATCH] migration, docs: mark RDMA migration as deprecated

2024-03-31 Thread Li Zhijian via
Except for RDMA migration, other parts of the RDMA subsystem have been
removed since 9.1.

Due to the lack of unit tests and CI tests for RDMA migration, a few
fatal errors were introduced in past development cycles and broke
RDMA migration, and these issues[1][2] were not fixed until some time later.

Modern network cards (TCP/IP) can also provide high bandwidth
(similar to RDMA) to handle the large amount of data generated during
migration.

Issue a warning to inform the end users of the RDMA migration status.

[1] https://lore.kernel.org/r/20230920090412.726725-1-lizhij...@fujitsu.com
[2] 
https://lore.kernel.org/r/cahecvy7hxswn4ow_kog+q+tn6f_kmeichevz1qgm-fbxbpp...@mail.gmail.com

CC: Peter Xu 
CC: Philippe Mathieu-Daudé 
CC: Fabiano Rosas 
CC: Thomas Huth 
CC: Daniel P. Berrangé 
CC: Yu Zhang 
Signed-off-by: Li Zhijian 
---
 docs/about/deprecated.rst | 15 +++
 migration/migration.c |  1 +
 2 files changed, 16 insertions(+)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 7b548519b5..fe70a7009e 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -529,3 +529,18 @@ Compression method fails too much.  Too many races.  We 
are going to
 remove it if nobody fixes it.  For starters, migration-test
 compression tests are disabled because they fail randomly.  If you need
 compression, use multifd compression methods.
+
+RDMA migration (since 9.1)
+''
+
+The QEMU project intends to remove the whole RDMA subsystem from the
+code base in a future release without replacement unless somebody steps
+up and improves the situation. So far, except for RDMA migration, other
+parts of the RDMA subsystem have been removed since 9.1.
+
+Due to the lack of unit tests and CI tests for RDMA migration, in the past
+developing cycles, a few fatal errors were introduced and broke the RDMA
+migration, and these issues were not fixed until some time later.
+
+Modern network cards (TCP/IP) can also provide high bandwidth (similar to RDMA)
+to handle the large amount of data generated during migration.
diff --git a/migration/migration.c b/migration/migration.c
index 9fe8fd2afd..807d66bbba 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -563,6 +563,7 @@ bool migrate_uri_parse(const char *uri, MigrationChannel 
**channel,
 qapi_free_InetSocketAddress(isock);
 return false;
 }
+warn_report("RDMA migration is deprecated and will be removed in a 
future release");
 addr->transport = MIGRATION_ADDRESS_TYPE_RDMA;
 } else if (strstart(uri, "tcp:", NULL) ||
 strstart(uri, "unix:", NULL) ||
-- 
2.41.0




[PATCH v2 1/2] hw/cxl: Pass CXLComponentState to cache_mem_ops

2023-10-18 Thread Li Zhijian
cache_mem_ops.{read,write}() interprets opaque as
CXLComponentState(cxl_cstate) instead of ComponentRegisters(cregs).

Fortunately, cregs is the first member of cxl_cstate, so their values are
the same.

Fixes: 9e58f52d3f8 ("hw/cxl/component: Introduce CXL components (8.1.x, 8.2.5)")
Signed-off-by: Li Zhijian 
---
V2: change the source side since cache_mem_ops.{read,write}() will use
cxl_cstate.
---
 hw/cxl/cxl-component-utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index f3bbf0fd131..6214dcdcc12 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -179,7 +179,7 @@ void cxl_component_register_block_init(Object *obj,
 /* io registers controls link which we don't care about in QEMU */
 memory_region_init_io(>io, obj, NULL, cregs, ".io",
   CXL2_COMPONENT_IO_REGION_SIZE);
-memory_region_init_io(>cache_mem, obj, _mem_ops, cregs,
+memory_region_init_io(>cache_mem, obj, _mem_ops, cxl_cstate,
   ".cache_mem", CXL2_COMPONENT_CM_REGION_SIZE);
 
 memory_region_add_subregion(>component_registers, 0, >io);
-- 
2.41.0




[PATCH v2 2/2] hw/cxl: Pass NULL for a NULL MemoryRegionOps

2023-10-18 Thread Li Zhijian
a NULL parameter is enough for a NULL MemoryRegionOps

Signed-off-by: Li Zhijian 
---
 hw/cxl/cxl-component-utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index 6214dcdcc12..010ed82edab 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -177,7 +177,7 @@ void cxl_component_register_block_init(Object *obj,
CXL2_COMPONENT_BLOCK_SIZE);
 
 /* io registers controls link which we don't care about in QEMU */
-memory_region_init_io(>io, obj, NULL, cregs, ".io",
+memory_region_init_io(>io, obj, NULL, NULL, ".io",
   CXL2_COMPONENT_IO_REGION_SIZE);
 memory_region_init_io(>cache_mem, obj, _mem_ops, cxl_cstate,
   ".cache_mem", CXL2_COMPONENT_CM_REGION_SIZE);
-- 
2.41.0




[PATCH] hw/cxl: Fix opaque type interpret wrongly

2023-10-12 Thread Li Zhijian
void cxl_component_register_block_init(Object *obj,
   CXLComponentState *cxl_cstate,
   const char *type)
{
ComponentRegisters *cregs = _cstate->crb;
...
memory_region_init_io(>cache_mem, obj, _mem_ops, cregs,
  ".cache_mem", CXL2_COMPONENT_CM_REGION_SIZE);

Obviously, opaque should be pointer to ComponentRegisters.
Fortunately, cregs is the first member of cxl_state, so their values are
the same.

Fixes: 9e58f52d3f8 ("hw/cxl/component: Introduce CXL components (8.1.x, 8.2.5)")
Signed-off-by: Li Zhijian 
---
 hw/cxl/cxl-component-utils.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index f3bbf0fd131..f27a9d3cf60 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -64,8 +64,7 @@ hwaddr cxl_decode_ig(int ig)
 static uint64_t cxl_cache_mem_read_reg(void *opaque, hwaddr offset,
unsigned size)
 {
-CXLComponentState *cxl_cstate = opaque;
-ComponentRegisters *cregs = _cstate->crb;
+ComponentRegisters *cregs = opaque;
 
 if (size == 8) {
 qemu_log_mask(LOG_UNIMP,
@@ -113,8 +112,7 @@ static void dumb_hdm_handler(CXLComponentState *cxl_cstate, 
hwaddr offset,
 static void cxl_cache_mem_write_reg(void *opaque, hwaddr offset, uint64_t 
value,
 unsigned size)
 {
-CXLComponentState *cxl_cstate = opaque;
-ComponentRegisters *cregs = _cstate->crb;
+ComponentRegisters *cregs = opaque;
 uint32_t mask;
 
 if (size == 8) {
-- 
2.41.0




[PATCH v2 1/2] migration: Fix rdma migration failed

2023-09-26 Thread Li Zhijian
Migration over RDMA has failed since
commit 294e5a4034 ("multifd: Only flush once each full round of memory")
with the error:
qemu-system-x86_64: rdma: Too many requests in this message 
(3638950032).Bailing.

Migration with RDMA is different from TCP. RDMA has its own control
messages, and the traffic between RDMA_CONTROL_REGISTER_REQUEST and
RDMA_CONTROL_REGISTER_FINISHED must not be disturbed.

find_dirty_block() is called between RDMA_CONTROL_REGISTER_REQUEST
and RDMA_CONTROL_REGISTER_FINISHED; it sends extra traffic
(RAM_SAVE_FLAG_MULTIFD_FLUSH) to the destination and causes migration to
fail even though multifd is disabled.

This change makes migrate_multifd_flush_after_each_section() return true
when multifd is disabled, which also means RAM_SAVE_FLAG_MULTIFD_FLUSH
will no longer be sent to the destination when multifd is disabled.

Fixes: 294e5a4034 ("multifd: Only flush once each full round of memory")
CC: Fabiano Rosas 
Signed-off-by: Li Zhijian 
---

V2: put that check at the entry of migrate_multifd_flush_after_each_section() # 
Peter
---
 migration/options.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/options.c b/migration/options.c
index 1d1e1321b0..327bcf2fbe 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -368,7 +368,7 @@ bool migrate_multifd_flush_after_each_section(void)
 {
 MigrationState *s = migrate_get_current();
 
-return s->multifd_flush_after_each_section;
+return !migrate_multifd() || s->multifd_flush_after_each_section;
 }
 
 bool migrate_postcopy(void)
-- 
2.31.1




[PATCH v2 2/2] migration/rdma: zero out head.repeat to make the error clearer

2023-09-26 Thread Li Zhijian
Previously, we got a confusing error that complains about
RDMAControlHeader.repeat:
qemu-system-x86_64: rdma: Too many requests in this message 
(3638950032).Bailing.

Actually, it's caused by an unexpected RDMAControlHeader.type.
After this patch, error will become:
qemu-system-x86_64: Unknown control message QEMU FILE

Reviewed-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
Signed-off-by: Li Zhijian 

---
V2: add reviewed-by tags
---
 migration/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index a2a3db35b1..3073d9953c 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2812,7 +2812,7 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
 size_t remaining = iov[i].iov_len;
 uint8_t * data = (void *)iov[i].iov_base;
 while (remaining) {
-RDMAControlHeader head;
+RDMAControlHeader head = {};
 
 len = MIN(remaining, RDMA_SEND_INCREMENT);
 remaining -= len;
-- 
2.31.1




[PATCH 2/2] migration/rdma: zero out head.repeat to make the error clearer

2023-09-20 Thread Li Zhijian
From: Li Zhijian 

Previously, we got a confusing error that complains about
RDMAControlHeader.repeat:
qemu-system-x86_64: rdma: Too many requests in this message 
(3638950032).Bailing.

Actually, it's caused by an unexpected RDMAControlHeader.type.
After this patch, error will become:
qemu-system-x86_64: Unknown control message QEMU FILE

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index a2a3db35b1..3073d9953c 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2812,7 +2812,7 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
 size_t remaining = iov[i].iov_len;
 uint8_t * data = (void *)iov[i].iov_base;
 while (remaining) {
-RDMAControlHeader head;
+RDMAControlHeader head = {};
 
 len = MIN(remaining, RDMA_SEND_INCREMENT);
 remaining -= len;
-- 
2.31.1




[PATCH 1/2] migration: Fix rdma migration failed

2023-09-20 Thread Li Zhijian
From: Li Zhijian 

Destination will fail with:
qemu-system-x86_64: rdma: Too many requests in this message 
(3638950032).Bailing.

Migration with RDMA is different from TCP. RDMA has its own control
messages, and the traffic between RDMA_CONTROL_REGISTER_REQUEST and
RDMA_CONTROL_REGISTER_FINISHED must not be disturbed.

find_dirty_block() is called between RDMA_CONTROL_REGISTER_REQUEST
and RDMA_CONTROL_REGISTER_FINISHED; it sends extra traffic to the
destination and causes migration to fail.

Since there's no existing subroutine to indicate whether it's migrated
by RDMA or not, and RDMA is not compatible with multifd, we use
migrate_multifd() here.

Fixes: 294e5a4034 ("multifd: Only flush once each full round of memory")
Signed-off-by: Li Zhijian 
---
 migration/ram.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 9040d66e61..89ae28e21a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1399,7 +1399,8 @@ static int find_dirty_block(RAMState *rs, 
PageSearchStatus *pss)
 pss->page = 0;
 pss->block = QLIST_NEXT_RCU(pss->block, next);
 if (!pss->block) {
-if (!migrate_multifd_flush_after_each_section()) {
+if (migrate_multifd() &&
+!migrate_multifd_flush_after_each_section()) {
 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
 int ret = multifd_send_sync_main(f);
 if (ret < 0) {
-- 
2.31.1




[PATCH v3] hw/cxl: Fix CFMW config memory leak

2023-05-31 Thread Li Zhijian
Allocate targets and targets[n] resources when all sanity checks are
passed to avoid memory leaks.

Suggested-by: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 
---
V3: allocate further resources when we can't fail # Philippe
V2: Delete unnecessary check
---
 hw/cxl/cxl-host.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 034c7805b3e..f0920da956d 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -39,12 +39,6 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 return;
 }
 
-fw->targets = g_malloc0_n(fw->num_targets, sizeof(*fw->targets));
-for (i = 0, target = object->targets; target; i++, target = target->next) {
-/* This link cannot be resolved yet, so stash the name for now */
-fw->targets[i] = g_strdup(target->value);
-}
-
 if (object->size % (256 * MiB)) {
 error_setg(errp,
"Size of a CXL fixed memory window must be a multiple of 
256MiB");
@@ -64,6 +58,12 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 fw->enc_int_gran = 0;
 }
 
+fw->targets = g_malloc0_n(fw->num_targets, sizeof(*fw->targets));
+for (i = 0, target = object->targets; target; i++, target = target->next) {
+/* This link cannot be resolved yet, so stash the name for now */
+fw->targets[i] = g_strdup(target->value);
+}
+
 cxl_state->fixed_windows = g_list_append(cxl_state->fixed_windows,
  g_steal_pointer());
 
-- 
2.31.1






[PATCH v2] hw/cxl: Fix CFMW config memory leak

2023-05-30 Thread Li Zhijian
Only the 'fw' pointer is marked as g_autofree, so we should free the other
resources manually in the error path.

Signed-off-by: Li Zhijian 
---
V2: Delete unnecessary check
---
 hw/cxl/cxl-host.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 034c7805b3e..787a2e779d2 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -48,7 +48,7 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 if (object->size % (256 * MiB)) {
 error_setg(errp,
"Size of a CXL fixed memory window must be a multiple of 
256MiB");
-return;
+goto err_free;
 }
 fw->size = object->size;
 
@@ -57,7 +57,7 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 cxl_interleave_granularity_enc(object->interleave_granularity,
errp);
 if (*errp) {
-return;
+goto err_free;
 }
 } else {
 /* Default to 256 byte interleave */
@@ -68,6 +68,12 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
  g_steal_pointer());
 
 return;
+
+err_free:
+for (i = 0; i < fw->num_targets; i++) {
+g_free(fw->targets[i]);
+}
+g_free(fw->targets);
 }
 
 void cxl_fmws_link_targets(CXLState *cxl_state, Error **errp)
-- 
2.31.1






[PATCH] hw/cxl: Fix CFMW config memory leak

2023-05-29 Thread Li Zhijian
Only the 'fw' pointer is marked as g_autofree, so we should free the other
resources manually in the error path.

Signed-off-by: Li Zhijian 
---
 hw/cxl/cxl-host.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 034c7805b3e..dd1a7c83f71 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -48,7 +48,7 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 if (object->size % (256 * MiB)) {
 error_setg(errp,
"Size of a CXL fixed memory window must be a multiple of 
256MiB");
-return;
+goto err_free;
 }
 fw->size = object->size;
 
@@ -57,7 +57,7 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
 cxl_interleave_granularity_enc(object->interleave_granularity,
errp);
 if (*errp) {
-return;
+goto err_free;
 }
 } else {
 /* Default to 256 byte interleave */
@@ -68,6 +68,12 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
  g_steal_pointer());
 
 return;
+
+err_free:
+for (i = 0; i < fw->num_targets && fw->targets[i]; i++) {
+g_free(fw->targets[i]);
+}
+g_free(fw->targets);
 }
 
 void cxl_fmws_link_targets(CXLState *cxl_state, Error **errp)
-- 
2.31.1






[PATCH 1/2] docs/cxl: Correct CFMW number

2023-05-24 Thread Li Zhijian
The 'Notes:' in this document mentioned CFMW{0-2}, but the figure missed
CFMW2.

Signed-off-by: Li Zhijian 
---
I'm totally new to CXL, so I have little confidence in this change :)
---
 docs/system/devices/cxl.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
index dce43476129..d3577a4d6da 100644
--- a/docs/system/devices/cxl.rst
+++ b/docs/system/devices/cxl.rst
@@ -162,7 +162,7 @@ Example system Topology. x marks the match in each decoder 
level::
   |<--SYSTEM PHYSICAL ADDRESS MAP (1)->|
   |__   __   __|
   |   |  | |  | |  |   |
-  |   | CFMW 0   | |  CXL Fixed Memory Window 1   | | CFMW 1   |   |
+  |   | CFMW 0   | |  CXL Fixed Memory Window 1   | | CFMW 2   |   |
   |   | HB0 only | |  Configured to interleave memory | | HB1 only |   |
   |   |  | |  memory accesses across HB0/HB1  | |  |   |
   |   |__| |_x| |__|   |
@@ -247,7 +247,7 @@ Example topology involving a switch::
   |<--SYSTEM PHYSICAL ADDRESS MAP (1)->|
   |__   __   __|
   |   |  | |  | |  |   |
-  |   | CFMW 0   | |  CXL Fixed Memory Window 1   | | CFMW 1   |   |
+  |   | CFMW 0   | |  CXL Fixed Memory Window 1   | | CFMW 2   |   |
   |   | HB0 only | |  Configured to interleave memory | | HB1 only |   |
   |   |  | |  memory accesses across HB0/HB1  | |  |   |
   |   |x_| |__| |__|   |
-- 
2.31.1






[PATCH 2/2] docs/cxl: Change to lowercase as others

2023-05-24 Thread Li Zhijian
Using the same style except the 'Topo' abbreviation.

Signed-off-by: Li Zhijian 
---
I'm not a native speaker, feel free to correct me.
---
 docs/system/devices/cxl.rst | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
index d3577a4d6da..56414d25871 100644
--- a/docs/system/devices/cxl.rst
+++ b/docs/system/devices/cxl.rst
@@ -157,7 +157,7 @@ responsible for allocating appropriate ranges from within 
the CFMWs
 and exposing those via normal memory configurations as would be done
 for system RAM.
 
-Example system Topology. x marks the match in each decoder level::
+Example system topology. x marks the match in each decoder level::
 
   |<--SYSTEM PHYSICAL ADDRESS MAP (1)->|
   |__   __   __|
@@ -187,8 +187,8 @@ Example system Topology. x marks the match in each decoder 
level::
___|___   __|__   __|_   ___|_
(3)|  Root Port 0  | | Root Port 1 | | Root Port 2| | Root Port 3 |
   |  Appears in   | | Appears in  | | Appears in | | Appear in   |
-  |  PCI topology | | PCI Topology| | PCI Topo   | | PCI Topo|
-  |  As 0c:00.0   | | as 0c:01.0  | | as de:00.0 | | as de:01.0  |
+  |  PCI topology | | PCI topology| | PCI Topo   | | PCI Topo|
+  |  as 0c:00.0   | | as 0c:01.0  | | as de:00.0 | | as de:01.0  |
   |___| |_| || |_|
 |  |   |  |
 |  |   |  |
@@ -272,7 +272,7 @@ Example topology involving a switch::
   |  Root Port 0  |
   |  Appears in   |
   |  PCI topology |
-  |  As 0c:00.0   |
+  |  as 0c:00.0   |
   |___x___|
   |
   |
-- 
2.31.1






[PATCH] MAINTAINERS: email address change

2021-12-30 Thread Li Zhijian
Fujitsu's mail service migrated to O365 months ago, and the
lizhij...@cn.fujitsu.com address will stop working on 2022-06-01.
Change it to my new email address lizhij...@fujitsu.com.

Signed-off-by: Li Zhijian 
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 50435b8d2f5..e5cda5886d4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2943,7 +2943,7 @@ F: docs/COLO-FT.txt
 
 COLO Proxy
 M: Zhang Chen 
-M: Li Zhijian 
+M: Li Zhijian 
 S: Supported
 F: docs/colo-proxy.txt
 F: net/colo*
-- 
2.33.0






[PATCH v4] migration/rdma: Fix out of order wrid

2021-10-28 Thread Li Zhijian
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.22.23:
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL 
RECV (4000)

source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got 
CONTROL RECV (4000)

NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0

This migration cannot complete when an out-of-order (OOO) CQ event occurs.
The send queue and receive queue share the same completion queue, and
qemu_rdma_block_for_wrid() drops the CQ events it's not interested in. But
the CQ events dropped by qemu_rdma_block_for_wrid() could be ones it wants
later. So in this case, qemu_rdma_block_for_wrid() will block forever.

OOO cases can occur on both the source side and the destination side. The
forever blocking happens only when SEND and RECV are out of order; OOO between
'WRITE RDMA' and 'RECV' doesn't matter.

Below is the OOO sequence:
       source                             destination
      rdma_write_one()                   qemu_rdma_registration_handle()
1.    S1: post_recv X                    D1: post_recv Y
2.    wait for recv CQ event X
3.                                       D2: post_send X     ---------------+
4.                                       wait for send CQ send event X (D2) |
5.    recv CQ event X reaches (D2)                                          |
6.  +-S2: post_send Y                                                       |
7.  | wait for send CQ event Y                                              |
8.  |                                    recv CQ event Y (S2) (drop it)     |
9.  +-send CQ event Y reaches (S2)                                          |
10.                                      send CQ event X reaches (D2)  -----+
11.                                      wait recv CQ event Y (dropped by (8))

Although hardware IB worked fine in about a hundred of my runs, the IB
specification doesn't guarantee the CQ order in such a case.

Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.

Signed-off-by: Li Zhijian 
---
V4: amend log messages # Dave
V3: rebase code, and combine 2/2 to 1/2
V2: Introduce send completion queue
---
 migration/rdma.c | 138 ++-
 1 file changed, 101 insertions(+), 37 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 2a3c7889b9f..f5d3bbe7e9c 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -358,9 +358,11 @@ typedef struct RDMAContext {
 struct ibv_context  *verbs;
 struct rdma_event_channel   *channel;
 struct ibv_qp *qp;  /* queue pair */
-struct ibv_comp_channel *comp_channel;  /* completion channel */
+struct ibv_comp_channel

[PATCH v3] migration/rdma: Fix out of order wrid

2021-09-27 Thread Li Zhijian
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.22.23:
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL 
RECV (4000)

source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got 
CONTROL RECV (4000)

NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0

This migration cannot complete when an out-of-order (OOO) CQ event occurs.
The send queue and receive queue share the same completion queue, and
qemu_rdma_block_for_wrid() drops the CQ events it's not interested in. But
the CQ events dropped by qemu_rdma_block_for_wrid() could be ones it wants
later. So in this case, qemu_rdma_block_for_wrid() will block forever.

OOO cases can occur on both the source side and the destination side. The
forever blocking happens only when SEND and RECV are out of order; OOO between
'WRITE RDMA' and 'RECV' doesn't matter.

Below is the OOO sequence:
       source                             destination
      rdma_write_one()                   qemu_rdma_registration_handle()
1.    S1: post_recv X                    D1: post_recv Y
2.    wait for recv CQ event X
3.                                       D2: post_send X     ---------------+
4.                                       wait for send CQ send event X (D2) |
5.    recv CQ event X reaches (D2)                                          |
6.  +-S2: post_send Y                                                       |
7.  | wait for send CQ event Y                                              |
8.  |                                    recv CQ event Y (S2) (drop it)     |
9.  +-send CQ event Y reaches (S2)                                          |
10.                                      send CQ event X reaches (D2)  -----+
11.                                      wait recv CQ event Y (dropped by (8))

Although hardware IB worked fine in about a hundred of my runs, the IB
specification doesn't guarantee the CQ order in such a case.

Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.

Signed-off-by: Li Zhijian 
---
V3: rebase code, and combine 2/2 to 1/2
V2: Introduce send completion queue
---
 migration/rdma.c | 132 +++
 1 file changed, 98 insertions(+), 34 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..bb19a5afe73 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -358,9 +358,11 @@ typedef struct RDMAContext {
 struct ibv_context  *verbs;
 struct rdma_event_channel   *channel;
 struct ibv_qp *qp;  /* queue pair */
-struct ibv_comp_channel *comp_channel;  /* completion channel */
+struct ibv_comp_channel *recv_comp_channel;  /* recv

Re: [PULL 0/5] Migration.next patches

2021-09-15 Thread Li, Zhijian



on 2021/9/14 21:19, Peter Maydell wrote:

On Tue, 14 Sept 2021 at 12:56, Juan Quintela  wrote:

The following changes since commit c6f5e042d89e79206cd1ce5525d3df219f13c3cc:

   Merge remote-tracking branch 
'remotes/pmaydell/tags/pull-target-arm-20210913-3' into staging (2021-09-13 
21:06:15 +0100)

are available in the Git repository at:

   https://github.com/juanquintela/qemu.git tags/migration.next-pull-request

for you to fetch changes up to d634d0e7b0225f97f45cecb72ca90bd0e7bdb211:

   migration/ram: Don't passs RAMState to 
migration_clear_memory_region_dirty_bitmap_*() (2021-09-14 13:45:06 +0200)


Migration Pull request (take 2)

This pull request includes:
- Remove RAMState unused parameter for several prototypes (dropped)
- RDMA fix
- give an error when using RDMA and multifd
- Implement yank for multifd send side

Please, Apply.


Hi; this fails to build on FreeBSD:

../src/migration/rdma.c:1146:23: error: use of undeclared identifier
'IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE'
 int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
   ^
../src/migration/rdma.c:1147:18: error: use of undeclared identifier
'IBV_ADVISE_MR_ADVICE_PREFETCH'
  IBV_ADVISE_MR_ADVICE_PREFETCH;
  ^
../src/migration/rdma.c:1150:11: warning: implicit declaration of
function 'ibv_advise_mr' is invalid in C99
[-Wimplicit-function-declaration]
 ret = ibv_advise_mr(pd, advice,
   ^
../src/migration/rdma.c:1151:25: error: use of undeclared identifier
'IBV_ADVISE_MR_FLAG_FLUSH'
 IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
 ^
1 warning and 3 errors generated.

Looking at the code, none of the proposed ways to detect
whether the host has this function seem to have been implemented:
did you push the wrong branch ?



Yes, these errors should be fixed in my V3 patches: [PATCH v3 0/2] enable fsdax
rdma migration
Also, I recall that the 1st pull request included 7 patches; were 2 dropped
this time?

Thanks
Zhijian




thanks
-- PMM







Re: [PULL 0/7] Migration.next patches

2021-09-10 Thread Li, Zhijian



on 2021/9/10 20:55, Philippe Mathieu-Daudé wrote:

BTW: Does QEMU provide any means to set http(s)_proxy for the build VM?
Currently, I have to
hack the code like:

-self.ssh_root_check("pkg install -y %s\n" % " ".join(self.pkgs))
+self.ssh_root_check("setenv HTTP_PROXY http://myproxy; setenv HTTPS_PROXY 
http://myproxy; pkg install -y %s\n" % " ".join(self.pkgs))

This is supported since commit b08ba163aaa ("tests/vm: send proxy
environment variables over ssh"). Maybe we only pass lower case
variables and should consider upper case too?


Great, I'm glad to know this. Thank you.
Lower case variables also work well on FreeBSD, so it's sufficient, I think.


Thanks
Zhijian






[PATCH v3 1/2] migration/rdma: Try to register On-Demand Paging memory region

2021-09-10 Thread Li Zhijian
Previously, for an fsdax mem-backend-file, registration would fail with
"Operation not supported". In this case, we can try to register it with
On-Demand Paging[1], as rpma_mr_reg() does in rpma[2].

[1]: 
https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3

CC: Marcel Apfelbaum 
Signed-off-by: Li Zhijian 
Reviewed-by: Marcel Apfelbaum 

---
V2: add ODP sanity check and remove goto
---
 migration/rdma.c   | 73 ++
 migration/trace-events |  1 +
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..eb80431aae2 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
 return 0;
 }
 
+/* Check whether On-Demand Paging is supported by RDAM device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+struct ibv_device_attr_ex attr = {0};
+int ret = ibv_query_device_ex(dev, NULL, );
+if (ret) {
+return false;
+}
+
+if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+return true;
+}
+
+return false;
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
 int i;
 RDMALocalBlocks *local = >local_ram_blocks;
 
 for (i = 0; i < local->nb_blocks; i++) {
+int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
 local->block[i].mr =
 ibv_reg_mr(rdma->pd,
 local->block[i].local_host_addr,
-local->block[i].length,
-IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE
+local->block[i].length, access
 );
+
+if (!local->block[i].mr &&
+errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+access |= IBV_ACCESS_ON_DEMAND;
+/* register ODP mr */
+local->block[i].mr =
+ibv_reg_mr(rdma->pd,
+   local->block[i].local_host_addr,
+   local->block[i].length, access);
+trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+}
+
 if (!local->block[i].mr) {
 perror("Failed to register local dest ram block!");
 break;
@@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
  */
 if (!block->pmr[chunk]) {
 uint64_t len = chunk_end - chunk_start;
+int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+ 0;
 
 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
-block->pmr[chunk] = ibv_reg_mr(rdma->pd,
-chunk_start, len,
-(rkey ? (IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE) : 0));
-
-if (!block->pmr[chunk]) {
-perror("Failed to register chunk!");
-fprintf(stderr, "Chunk details: block: %d chunk index %d"
-" start %" PRIuPTR " end %" PRIuPTR
-" host %" PRIuPTR
-" local %" PRIuPTR " registrations: %d\n",
-block->index, chunk, (uintptr_t)chunk_start,
-(uintptr_t)chunk_end, host_addr,
-(uintptr_t)block->local_host_addr,
-rdma->total_registrations);
-return -1;
+block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+if (!block->pmr[chunk] &&
+errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+access |= IBV_ACCESS_ON_DEMAND;
+/* register ODP mr */
+block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+trace_qemu_rdma_register_odp_mr(block->block_name);
 }
-rdma->total_registrations++;
 }
+if (!block->pmr[chunk]) {
+perror("Failed to register chunk!");
+fprintf(stderr, "Chunk details: block: %d chunk index %d"
+" start %" PRIuPTR " end %" PRIuPTR
+" host %" PRIuPTR
+" local %" PRIuPTR " registrations: %d\n",
+block->index, chunk, (uintptr_t)chunk_start,
+(uintptr_t)chunk_end, host_addr,
+(uintptr_t)block->local_host_addr,
+rdma->total_registrations);
+return -1;
+}
+rdma->total_registrations++;
 
 if (lkey) {
  

[PATCH v3 0/2] enable fsdax rdma migration

2021-09-10 Thread Li Zhijian
QEMU currently faces 2 problems when migrating an fsdax memory backend with
the RDMA protocol.
(1) ibv_reg_mr failed with Operation not supported
(2) requester(source) side could receive RNR NAK.

For (1), we can try to register the memory region with the ODP feature, which
has already been implemented in some modern HCA hardware/drivers.
For (2), IB provides an advise API to prefetch pages in a specific memory
region. It can help the driver reduce page faults on the responder
(destination) side during RDMA_WRITE.

V3: fix FreeBSD compiling error


Li Zhijian (2):
  migration/rdma: Try to register On-Demand Paging memory region
  migration/rdma: advise prefetch write for ODP region

 meson.build|   6 +++
 migration/rdma.c   | 115 ++---
 migration/trace-events |   2 +
 3 files changed, 103 insertions(+), 20 deletions(-)

-- 
2.31.1






[PATCH v3 2/2] migration/rdma: advise prefetch write for ODP region

2021-09-10 Thread Li Zhijian
A responder MR registered with ODP will send RNR NAK back to
the requester when it hits a page fault.
-
ibv_poll_cq wc.status=13 RNR retry counter exceeded!
ibv_poll_cq wrid=WRITE RDMA!
-
ibv_advise_mr(3) helps to make pages present before the actual IO is
conducted so that the responder does page fault as little as possible.

Signed-off-by: Li Zhijian 
Reviewed-by: Marcel Apfelbaum 

---
V3: Fix FreeBSD compiling errors
V2: use IBV_ADVISE_MR_FLAG_FLUSH instead of IB_UVERBS_ADVISE_MR_FLAG_FLUSH
and add Reviewed-by tag. # Marcel
---
 meson.build|  6 ++
 migration/rdma.c   | 42 ++
 migration/trace-events |  1 +
 3 files changed, 49 insertions(+)

diff --git a/meson.build b/meson.build
index 6e4d2d80343..97406d1b79b 100644
--- a/meson.build
+++ b/meson.build
@@ -1328,6 +1328,12 @@ config_host_data.set('HAVE_COPY_FILE_RANGE', 
cc.has_function('copy_file_range'))
 config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: 
util))
 config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
 config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: 
'#include '))
+if rdma.found()
+  config_host_data.set('HAVE_IBV_ADVISE_MR',
+   cc.has_function('ibv_advise_mr',
+   args: config_host['RDMA_LIBS'].split(),
+   prefix: '#include 
'))
+endif
 
 # has_header_symbol
 config_host_data.set('CONFIG_BYTESWAP_H',
diff --git a/migration/rdma.c b/migration/rdma.c
index eb80431aae2..2a3c7889b9f 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1133,6 +1133,32 @@ static bool rdma_support_odp(struct ibv_context *dev)
 return false;
 }
 
+/*
+ * ibv_advise_mr to avoid RNR NAK error as far as possible.
+ * The responder mr registering with ODP will sent RNR NAK back to
+ * the requester in the face of the page fault.
+ */
+static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
+ uint32_t len,  uint32_t lkey,
+ const char *name, bool wr)
+{
+#ifdef HAVE_IBV_ADVISE_MR
+int ret;
+int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
+ IBV_ADVISE_MR_ADVICE_PREFETCH;
+struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
+
+ret = ibv_advise_mr(pd, advice,
+IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
+/* ignore the error */
+if (ret) {
+trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
+} else {
+trace_qemu_rdma_advise_mr(name, len, addr, "successed");
+}
+#endif
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
 int i;
@@ -1156,6 +1182,15 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
local->block[i].local_host_addr,
local->block[i].length, access);
 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+
+if (local->block[i].mr) {
+qemu_rdma_advise_prefetch_mr(rdma->pd,
+(uintptr_t)local->block[i].local_host_addr,
+local->block[i].length,
+local->block[i].mr->lkey,
+local->block[i].block_name,
+true);
+}
 }
 
 if (!local->block[i].mr) {
@@ -1255,6 +1290,13 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
 /* register ODP mr */
 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
 trace_qemu_rdma_register_odp_mr(block->block_name);
+
+if (block->pmr[chunk]) {
+qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
+len, block->pmr[chunk]->lkey,
+block->block_name, rkey);
+
+}
 }
 }
 if (!block->pmr[chunk]) {
diff --git a/migration/trace-events b/migration/trace-events
index 5f6aa580def..a8ae163707c 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -213,6 +213,7 @@ qemu_rdma_poll_other(const char *compstr, int64_t comp, int 
left) "other complet
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" 
PRIu64 " bytes @ %p"
 qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging 
memory region: %s"
+qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char 
*res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 &quo

Re: [PULL 0/7] Migration.next patches

2021-09-09 Thread Li, Zhijian

on 2021/9/9 21:42, Peter Maydell wrote:

On Thu, 9 Sept 2021 at 11:36, Juan Quintela  wrote:
Fails to build, FreeBSD:

../src/migration/rdma.c:1146:23: error: use of undeclared identifier
'IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE'
 int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
   ^
../src/migration/rdma.c:1147:18: error: use of undeclared identifier
'IBV_ADVISE_MR_ADVICE_PREFETCH'
  IBV_ADVISE_MR_ADVICE_PREFETCH;
  ^
../src/migration/rdma.c:1150:11: warning: implicit declaration of
function 'ibv_advise_mr' is invalid in C99
[-Wimplicit-function-declaration]
 ret = ibv_advise_mr(pd, advice,
   ^
../src/migration/rdma.c:1151:25: error: use of undeclared identifier
'IBV_ADVISE_MR_FLAG_FLUSH'
 IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
 ^


It's introduced by [PULL 4/7] migration/rdma: advise prefetch write for ODP
region,
which calls ibv_advise_mr(). I have checked the latest FreeBSD, and it doesn't
ship this API.
May I know if only FreeBSD reports this failure? If so, I just need to filter
out FreeBSD.

Thanks
zhijian



-- PMM







[PATCH v2 1/2] migration/rdma: Try to register On-Demand Paging memory region

2021-08-22 Thread Li Zhijian
Previously, for an fsdax mem-backend-file, registration would fail with
"Operation not supported". In this case, we can try to register it with
On-Demand Paging[1], as rpma_mr_reg() does in rpma[2].

[1]: 
https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3

CC: Marcel Apfelbaum 
Signed-off-by: Li Zhijian 

---
V2: add ODP sanity check and remove goto
---
 migration/rdma.c   | 73 ++
 migration/trace-events |  1 +
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..eb80431aae2 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
 return 0;
 }
 
+/* Check whether On-Demand Paging is supported by RDAM device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+struct ibv_device_attr_ex attr = {0};
+int ret = ibv_query_device_ex(dev, NULL, );
+if (ret) {
+return false;
+}
+
+if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+return true;
+}
+
+return false;
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
 int i;
 RDMALocalBlocks *local = >local_ram_blocks;
 
 for (i = 0; i < local->nb_blocks; i++) {
+int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
 local->block[i].mr =
 ibv_reg_mr(rdma->pd,
 local->block[i].local_host_addr,
-local->block[i].length,
-IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE
+local->block[i].length, access
 );
+
+if (!local->block[i].mr &&
+errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+access |= IBV_ACCESS_ON_DEMAND;
+/* register ODP mr */
+local->block[i].mr =
+ibv_reg_mr(rdma->pd,
+   local->block[i].local_host_addr,
+   local->block[i].length, access);
+trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+}
+
 if (!local->block[i].mr) {
 perror("Failed to register local dest ram block!");
 break;
@@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
  */
 if (!block->pmr[chunk]) {
 uint64_t len = chunk_end - chunk_start;
+int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+ 0;
 
 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
-block->pmr[chunk] = ibv_reg_mr(rdma->pd,
-chunk_start, len,
-(rkey ? (IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE) : 0));
-
-if (!block->pmr[chunk]) {
-perror("Failed to register chunk!");
-fprintf(stderr, "Chunk details: block: %d chunk index %d"
-" start %" PRIuPTR " end %" PRIuPTR
-" host %" PRIuPTR
-" local %" PRIuPTR " registrations: %d\n",
-block->index, chunk, (uintptr_t)chunk_start,
-(uintptr_t)chunk_end, host_addr,
-(uintptr_t)block->local_host_addr,
-rdma->total_registrations);
-return -1;
+block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+if (!block->pmr[chunk] &&
+errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+access |= IBV_ACCESS_ON_DEMAND;
+/* register ODP mr */
+block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+trace_qemu_rdma_register_odp_mr(block->block_name);
 }
-rdma->total_registrations++;
 }
+if (!block->pmr[chunk]) {
+perror("Failed to register chunk!");
+fprintf(stderr, "Chunk details: block: %d chunk index %d"
+" start %" PRIuPTR " end %" PRIuPTR
+" host %" PRIuPTR
+" local %" PRIuPTR " registrations: %d\n",
+block->index, chunk, (uintptr_t)chunk_start,
+(uintptr_t)chunk_end, host_addr,
+(uintptr_t)block->local_host_addr,
+rdma->total_registrations);
+return -1;
+}
+rdma->total_registrations++;
 
 if (lkey) {
 *lkey = block->pmr[chunk]-&

[PATCH v2 2/2] migration/rdma: advise prefetch write for ODP region

2021-08-22 Thread Li Zhijian
A responder MR registered with ODP will send RNR NAK back to
the requester when it hits a page fault.
-
ibv_poll_cq wc.status=13 RNR retry counter exceeded!
ibv_poll_cq wrid=WRITE RDMA!
-
ibv_advise_mr(3) helps to make pages present before the actual IO is
conducted so that the responder does page fault as little as possible.

Signed-off-by: Li Zhijian 
Reviewed-by: Marcel Apfelbaum 

---
V2: use IBV_ADVISE_MR_FLAG_FLUSH instead of IB_UVERBS_ADVISE_MR_FLAG_FLUSH
and add Reviewed-by tag. # Marcel
---
 migration/rdma.c   | 40 
 migration/trace-events |  1 +
 2 files changed, 41 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index eb80431aae2..6c2cc3f617c 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1133,6 +1133,30 @@ static bool rdma_support_odp(struct ibv_context *dev)
 return false;
 }
 
+/*
+ * ibv_advise_mr to avoid RNR NAK error as far as possible.
+ * The responder mr registering with ODP will sent RNR NAK back to
+ * the requester in the face of the page fault.
+ */
+static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
+ uint32_t len,  uint32_t lkey,
+ const char *name, bool wr)
+{
+int ret;
+int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
+ IBV_ADVISE_MR_ADVICE_PREFETCH;
+struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
+
+ret = ibv_advise_mr(pd, advice,
+IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
+/* ignore the error */
+if (ret) {
+trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
+} else {
+trace_qemu_rdma_advise_mr(name, len, addr, "successed");
+}
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
 int i;
@@ -1156,6 +1180,15 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
local->block[i].local_host_addr,
local->block[i].length, access);
 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+
+if (local->block[i].mr) {
+qemu_rdma_advise_prefetch_mr(rdma->pd,
+(uintptr_t)local->block[i].local_host_addr,
+local->block[i].length,
+local->block[i].mr->lkey,
+local->block[i].block_name,
+true);
+}
 }
 
 if (!local->block[i].mr) {
@@ -1255,6 +1288,13 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
 /* register ODP mr */
 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
 trace_qemu_rdma_register_odp_mr(block->block_name);
+
+if (block->pmr[chunk]) {
+qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
+len, block->pmr[chunk]->lkey,
+block->block_name, rkey);
+
+}
 }
 }
 if (!block->pmr[chunk]) {
diff --git a/migration/trace-events b/migration/trace-events
index 5f6aa580def..a8ae163707c 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -213,6 +213,7 @@ qemu_rdma_poll_other(const char *compstr, int64_t comp, int 
left) "other complet
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" 
PRIu64 " bytes @ %p"
 qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging 
memory region: %s"
+qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char 
*res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s"
 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t 
offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
 qemu_rdma_registration_handle_finished(void) ""
 qemu_rdma_registration_handle_ram_blocks(void) ""
-- 
2.31.1






[PATCH v2 0/2] enable fsdax rdma migration

2021-08-22 Thread Li Zhijian
QEMU currently faces 2 problems when migrating an fsdax memory backend with
the RDMA protocol.
(1) ibv_reg_mr failed with Operation not supported
(2) requester(source) side could receive RNR NAK.

For (1), we can try to register the memory region with the ODP feature, which
has already been implemented in some modern HCA hardware/drivers.
For (2), IB provides an advise API to prefetch pages in a specific memory
region. It can help the driver reduce page faults on the responder
(destination) side during RDMA_WRITE.

CC: marcel.apfelb...@gmail.com

Li Zhijian (2):
  migration/rdma: Try to register On-Demand Paging memory region
  migration/rdma: advise prefetch write for ODP region

 migration/rdma.c   | 117 +
 migration/trace-events |   2 +
 2 files changed, 98 insertions(+), 21 deletions(-)

-- 
2.31.1






Re: [PATCH] nvdimm: release the correct device list

2021-08-02 Thread Li , Zhijian/李 智坚

ping

Could anybody help to review/queue this patch?



On 2021/6/29 22:05, Igor Mammedov wrote:

On Thu, 24 Jun 2021 19:04:15 +0800
Li Zhijian  wrote:


Signed-off-by: Li Zhijian 

Reviewed-by: Igor Mammedov 


---
  hw/acpi/nvdimm.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index e3d5fe19392..ff317263e85 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -355,10 +355,10 @@ nvdimm_build_structure_caps(GArray *structures, uint32_t 
capabilities)
  
  static GArray *nvdimm_build_device_structure(NVDIMMState *state)

  {
-GSList *device_list = nvdimm_get_device_list();
+GSList *device_list, *list = nvdimm_get_device_list();
  GArray *structures = g_array_new(false, true /* clear */, 1);
  
-for (; device_list; device_list = device_list->next) {

+for (device_list = list; device_list; device_list = device_list->next) {
  DeviceState *dev = device_list->data;
  
  /* build System Physical Address Range Structure. */

@@ -373,7 +373,7 @@ static GArray *nvdimm_build_device_structure(NVDIMMState 
*state)
  /* build NVDIMM Control Region Structure. */
  nvdimm_build_structure_dcr(structures, dev);
  }
-g_slist_free(device_list);
+g_slist_free(list);
  
  if (state->persistence) {

  nvdimm_build_structure_caps(structures, state->persistence);
@@ -1339,9 +1339,9 @@ static void nvdimm_build_ssdt(GArray *table_offsets, 
GArray *table_data,
  
  void nvdimm_build_srat(GArray *table_data)

  {
-GSList *device_list = nvdimm_get_device_list();
+GSList *device_list, *list = nvdimm_get_device_list();
  
-for (; device_list; device_list = device_list->next) {

+for (device_list = list; device_list; device_list = device_list->next) {
  AcpiSratMemoryAffinity *numamem = NULL;
  DeviceState *dev = device_list->data;
  Object *obj = OBJECT(dev);
@@ -1356,7 +1356,7 @@ void nvdimm_build_srat(GArray *table_data)
  build_srat_memory(numamem, addr, size, node,
MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE);
  }
-g_slist_free(device_list);
+g_slist_free(list);
  }
  
  void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,









[PATCH v2 1/2] migration: allow multifd for socket protocol only

2021-07-31 Thread Li Zhijian
Using multifd with an unsupported protocol will cause a segmentation fault.
(gdb) bt
 #0  0x563b4a93faf8 in socket_connect (addr=0x0, errp=0x7f7f02675410) at 
../util/qemu-sockets.c:1190
 #1  0x563b4a797a03 in qio_channel_socket_connect_sync (ioc=0x563b4d16e8c0, 
addr=0x0, errp=0x7f7f02675410) at ../io/channel-socket.c:145
 #2  0x563b4a797abf in qio_channel_socket_connect_worker 
(task=0x563b4cd86c30, opaque=0x0) at ../io/channel-socket.c:168
 #3  0x563b4a792631 in qio_task_thread_worker (opaque=0x563b4cd86c30) at 
../io/task.c:124
 #4  0x563b4a91da69 in qemu_thread_start (args=0x563b4c44bb80) at 
../util/qemu-thread-posix.c:541
 #5  0x7f7fe9b5b3f9 in ?? ()
 #6  0x in ?? ()

It's enough to check migrate_multifd_is_allowed() in multifd cleanup() and
multifd setup() though there are so many other places using 
migrate_use_multifd().

Signed-off-by: Li Zhijian 
---
 migration/migration.c |  4 
 migration/multifd.c   | 24 ++--
 migration/multifd.h   |  2 ++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 2d306582ebf..212314541f1 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -456,10 +456,12 @@ static void qemu_start_incoming_migration(const char 
*uri, Error **errp)
 {
 const char *p = NULL;
 
+migrate_protocol_allow_multifd(false); /* reset it anyway */
 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
 if (strstart(uri, "tcp:", ) ||
 strstart(uri, "unix:", NULL) ||
 strstart(uri, "vsock:", NULL)) {
+migrate_protocol_allow_multifd(true);
 socket_start_incoming_migration(p ? p : uri, errp);
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", )) {
@@ -2289,9 +2291,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 }
 }
 
+migrate_protocol_allow_multifd(false);
 if (strstart(uri, "tcp:", ) ||
 strstart(uri, "unix:", NULL) ||
 strstart(uri, "vsock:", NULL)) {
+migrate_protocol_allow_multifd(true);
 socket_start_outgoing_migration(s, p ? p : uri, _err);
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", )) {
diff --git a/migration/multifd.c b/migration/multifd.c
index ab41590e714..4a4d16d3888 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -531,7 +531,7 @@ void multifd_save_cleanup(void)
 {
 int i;
 
-if (!migrate_use_multifd()) {
+if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
 return;
 }
 multifd_send_terminate_threads(NULL);
@@ -864,6 +864,17 @@ cleanup:
 multifd_new_send_channel_cleanup(p, sioc, local_err);
 }
 
+static bool migrate_allow_multifd;
+void migrate_protocol_allow_multifd(bool allow)
+{
+migrate_allow_multifd = allow;
+}
+
+bool migrate_multifd_is_allowed(void)
+{
+return migrate_allow_multifd;
+}
+
 int multifd_save_setup(Error **errp)
 {
 int thread_count;
@@ -874,6 +885,11 @@ int multifd_save_setup(Error **errp)
 if (!migrate_use_multifd()) {
 return 0;
 }
+if (!migrate_multifd_is_allowed()) {
+error_setg(errp, "multifd is not supported by current protocol");
+return -1;
+}
+
 s = migrate_get_current();
 thread_count = migrate_multifd_channels();
 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
@@ -967,7 +983,7 @@ int multifd_load_cleanup(Error **errp)
 {
 int i;
 
-if (!migrate_use_multifd()) {
+if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
 return 0;
 }
 multifd_recv_terminate_threads(NULL);
@@ -1123,6 +1139,10 @@ int multifd_load_setup(Error **errp)
 if (!migrate_use_multifd()) {
 return 0;
 }
+if (!migrate_multifd_is_allowed()) {
+error_setg(errp, "multifd is not supported by current protocol");
+return -1;
+}
 thread_count = migrate_multifd_channels();
 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
diff --git a/migration/multifd.h b/migration/multifd.h
index 8d6751f5ed8..f62a1becd0b 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,6 +13,8 @@
 #ifndef QEMU_MIGRATION_MULTIFD_H
 #define QEMU_MIGRATION_MULTIFD_H
 
+bool migrate_multifd_is_allowed(void);
+void migrate_protocol_allow_multifd(bool allow);
 int multifd_save_setup(Error **errp);
 void multifd_save_cleanup(void);
 int multifd_load_setup(Error **errp);
-- 
2.31.1






[PATCH v2 2/2] migration: allow enabling multifd for specific protocol only

2021-07-31 Thread Li Zhijian
Also change the default to true so that, in the '-incoming defer' case, the
user is able to change the multifd capability.

Signed-off-by: Li Zhijian 
---
 migration/migration.c | 8 
 migration/multifd.c   | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/migration/migration.c b/migration/migration.c
index 212314541f1..b4d0e66cf7b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1249,6 +1249,14 @@ static bool migrate_caps_check(bool *cap_list,
 }
 }
 
+/* incoming side only */
+if (runstate_check(RUN_STATE_INMIGRATE) &&
+!migrate_multifd_is_allowed() &&
+cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+error_setg(errp, "multifd is not supported by current protocol");
+return false;
+}
+
 return true;
 }
 
diff --git a/migration/multifd.c b/migration/multifd.c
index 4a4d16d3888..4643b25c9db 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -864,7 +864,7 @@ cleanup:
 multifd_new_send_channel_cleanup(p, sioc, local_err);
 }
 
-static bool migrate_allow_multifd;
+static bool migrate_allow_multifd = true;
 void migrate_protocol_allow_multifd(bool allow)
 {
 migrate_allow_multifd = allow;
-- 
2.31.1






[PATCH 2/2] migration/rdma: advise prefetch write for ODP region

2021-07-31 Thread Li Zhijian
A responder MR registered with ODP will send an RNR NAK back to
the requester when it encounters a page fault.
-
ibv_poll_cq wc.status=13 RNR retry counter exceeded!
ibv_poll_cq wrid=WRITE RDMA!
-
ibv_advise_mr(3) helps to make pages present before the actual IO is
conducted so that the responder does page fault as little as possible.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c   | 40 
 migration/trace-events |  1 +
 2 files changed, 41 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index 8784b5f22a6..a2ad00d665f 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,6 +1117,30 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
 return 0;
 }
 
+/*
+ * ibv_advise_mr to avoid RNR NAK error as far as possible.
+ * The responder mr registering with ODP will sent RNR NAK back to
+ * the requester in the face of the page fault.
+ */
+static void qemu_rdma_advise_prefetch_write_mr(struct ibv_pd *pd, uint64_t 
addr,
+   uint32_t len,  uint32_t lkey,
+   const char *name, bool wr)
+{
+int ret;
+int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
+ IBV_ADVISE_MR_ADVICE_PREFETCH;
+struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
+
+ret = ibv_advise_mr(pd, advice,
+IB_UVERBS_ADVISE_MR_FLAG_FLUSH, _list, 1);
+/* ignore the error */
+if (ret) {
+trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
+} else {
+trace_qemu_rdma_advise_mr(name, len, addr, "successed");
+}
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
 int i;
@@ -1140,6 +1164,17 @@ on_demand:
 perror("Failed to register local dest ram block!\n");
 break;
 }
+
+if (access & IBV_ACCESS_ON_DEMAND) {
+qemu_rdma_advise_prefetch_write_mr(rdma->pd,
+   (uintptr_t)
+   local->block[i].local_host_addr,
+   local->block[i].length,
+   local->block[i].mr->lkey,
+   local->block[i].block_name,
+   true);
+}
+
 rdma->total_registrations++;
 }
 
@@ -1244,6 +1279,11 @@ on_demand:
 rdma->total_registrations);
 return -1;
 }
+if (access & IBV_ACCESS_ON_DEMAND) {
+qemu_rdma_advise_prefetch_write_mr(rdma->pd, 
(uintptr_t)chunk_start,
+   len, block->pmr[chunk]->lkey,
+   block->block_name, rkey);
+}
 rdma->total_registrations++;
 }
 
diff --git a/migration/trace-events b/migration/trace-events
index 5f6aa580def..901c1d54c12 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -213,6 +213,7 @@ qemu_rdma_poll_other(const char *compstr, int64_t comp, int 
left) "other complet
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" 
PRIu64 " bytes @ %p"
 qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging 
memory region: %s"
+qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char 
*res) "Try to advise block %s prefetch write at %" PRIu32 "@0x%" PRIx64 ": %s"
 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t 
offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
 qemu_rdma_registration_handle_finished(void) ""
 qemu_rdma_registration_handle_ram_blocks(void) ""
-- 
2.31.1






[PATCH 0/2] enable fsdax rdma migration

2021-07-31 Thread Li Zhijian
Previously, QEMU faced two problems when migrating an fsdax memory backend
with the RDMA protocol.
(1) ibv_reg_mr failed with Operation not supported
(2) requester(source) side could receive RNR NAK.

For (1), we can try to register the memory region with the ODP feature, which
has already been implemented in some modern HCA hardware/drivers.
For (2), IB provides an advise API to prefetch pages in a specific memory
region. It can help the driver reduce page faults on the responder
(destination) side during RDMA_WRITE.

Li Zhijian (2):
  migration/rdma: Try to register On-Demand Paging memory region
  migration/rdma: advise prefetch write for ODP region

 migration/rdma.c   | 67 --
 migration/trace-events |  2 ++
 2 files changed, 60 insertions(+), 9 deletions(-)

-- 
2.31.1






[PATCH 1/2] migration/rdma: Try to register On-Demand Paging memory region

2021-07-31 Thread Li Zhijian
Previously, registering an fsdax mem-backend-file failed with
"Operation not supported". In this case, we can try to register it with
On-Demand Paging[1], like what rpma_mr_reg() does in rpma[2].

[1]: 
https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
Signed-off-by: Li Zhijian 
---
 migration/rdma.c   | 27 ++-
 migration/trace-events |  1 +
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..8784b5f22a6 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1123,15 +1123,21 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
 RDMALocalBlocks *local = >local_ram_blocks;
 
 for (i = 0; i < local->nb_blocks; i++) {
+int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
+on_demand:
 local->block[i].mr =
 ibv_reg_mr(rdma->pd,
 local->block[i].local_host_addr,
-local->block[i].length,
-IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE
+local->block[i].length, access
 );
 if (!local->block[i].mr) {
-perror("Failed to register local dest ram block!");
+if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
+access |= IBV_ACCESS_ON_DEMAND;
+trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+goto on_demand;
+}
+perror("Failed to register local dest ram block!\n");
 break;
 }
 rdma->total_registrations++;
@@ -1215,15 +1221,18 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
  */
 if (!block->pmr[chunk]) {
 uint64_t len = chunk_end - chunk_start;
+int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 
0;
 
 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
-block->pmr[chunk] = ibv_reg_mr(rdma->pd,
-chunk_start, len,
-(rkey ? (IBV_ACCESS_LOCAL_WRITE |
-IBV_ACCESS_REMOTE_WRITE) : 0));
-
+on_demand:
+block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
 if (!block->pmr[chunk]) {
+if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
+access |= IBV_ACCESS_ON_DEMAND;
+trace_qemu_rdma_register_odp_mr(block->block_name);
+goto on_demand;
+}
 perror("Failed to register chunk!");
 fprintf(stderr, "Chunk details: block: %d chunk index %d"
 " start %" PRIuPTR " end %" PRIuPTR
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f034ab8..5f6aa580def 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int 
left, uint64_t block
 qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other 
completion %s (%" PRId64 ") received left %d"
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" 
PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging 
memory region: %s"
 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t 
offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
 qemu_rdma_registration_handle_finished(void) ""
 qemu_rdma_registration_handle_ram_blocks(void) ""
-- 
2.31.1






[PATCH 1/2] migration: allow multifd for socket protocol only

2021-07-16 Thread Li Zhijian
Using multifd with an unsupported protocol will cause a segmentation fault.
(gdb) bt
 #0  0x563b4a93faf8 in socket_connect (addr=0x0, errp=0x7f7f02675410) at 
../util/qemu-sockets.c:1190
 #1  0x563b4a797a03 in qio_channel_socket_connect_sync (ioc=0x563b4d16e8c0, 
addr=0x0, errp=0x7f7f02675410) at ../io/channel-socket.c:145
 #2  0x563b4a797abf in qio_channel_socket_connect_worker 
(task=0x563b4cd86c30, opaque=0x0) at ../io/channel-socket.c:168
 #3  0x563b4a792631 in qio_task_thread_worker (opaque=0x563b4cd86c30) at 
../io/task.c:124
 #4  0x563b4a91da69 in qemu_thread_start (args=0x563b4c44bb80) at 
../util/qemu-thread-posix.c:541
 #5  0x7f7fe9b5b3f9 in ?? ()
 #6  0x in ?? ()

It's enough to check migrate_multifd_is_allowed() in multifd cleanup() and
multifd setup() though there are so many other places using 
migrate_use_multifd().

Signed-off-by: Li Zhijian 
---
 migration/migration.c |  4 
 migration/multifd.c   | 24 ++--
 migration/multifd.h   |  2 ++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 2d306582ebf..d6d48f6999b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -456,10 +456,12 @@ static void qemu_start_incoming_migration(const char 
*uri, Error **errp)
 {
 const char *p = NULL;
 
+migrate_protocal_allow_multifd(false); /* reset it anyway */
 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
 if (strstart(uri, "tcp:", ) ||
 strstart(uri, "unix:", NULL) ||
 strstart(uri, "vsock:", NULL)) {
+migrate_protocal_allow_multifd(true);
 socket_start_incoming_migration(p ? p : uri, errp);
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", )) {
@@ -2289,9 +2291,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 }
 }
 
+migrate_protocal_allow_multifd(false);
 if (strstart(uri, "tcp:", ) ||
 strstart(uri, "unix:", NULL) ||
 strstart(uri, "vsock:", NULL)) {
+migrate_protocal_allow_multifd(true);
 socket_start_outgoing_migration(s, p ? p : uri, _err);
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", )) {
diff --git a/migration/multifd.c b/migration/multifd.c
index ab41590e714..b3d99c79d83 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -531,7 +531,7 @@ void multifd_save_cleanup(void)
 {
 int i;
 
-if (!migrate_use_multifd()) {
+if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
 return;
 }
 multifd_send_terminate_threads(NULL);
@@ -864,6 +864,17 @@ cleanup:
 multifd_new_send_channel_cleanup(p, sioc, local_err);
 }
 
+static bool migrate_allow_multifd;
+void migrate_protocal_allow_multifd(bool allow)
+{
+migrate_allow_multifd = allow;
+}
+
+bool migrate_multifd_is_allowed(void)
+{
+return migrate_allow_multifd;
+}
+
 int multifd_save_setup(Error **errp)
 {
 int thread_count;
@@ -874,6 +885,11 @@ int multifd_save_setup(Error **errp)
 if (!migrate_use_multifd()) {
 return 0;
 }
+if (!migrate_multifd_is_allowed()) {
+error_setg(errp, "multifd is not supported by current protocol");
+return -1;
+}
+
 s = migrate_get_current();
 thread_count = migrate_multifd_channels();
 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
@@ -967,7 +983,7 @@ int multifd_load_cleanup(Error **errp)
 {
 int i;
 
-if (!migrate_use_multifd()) {
+if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
 return 0;
 }
 multifd_recv_terminate_threads(NULL);
@@ -1123,6 +1139,10 @@ int multifd_load_setup(Error **errp)
 if (!migrate_use_multifd()) {
 return 0;
 }
+if (!migrate_multifd_is_allowed()) {
+error_setg(errp, "multifd is not supported by current protocol");
+return -1;
+}
 thread_count = migrate_multifd_channels();
 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
diff --git a/migration/multifd.h b/migration/multifd.h
index 8d6751f5ed8..9d968d63831 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,6 +13,8 @@
 #ifndef QEMU_MIGRATION_MULTIFD_H
 #define QEMU_MIGRATION_MULTIFD_H
 
+bool migrate_multifd_is_allowed(void);
+void migrate_protocal_allow_multifd(bool allow);
 int multifd_save_setup(Error **errp);
 void multifd_save_cleanup(void);
 int multifd_load_setup(Error **errp);
-- 
2.31.1






[PATCH 2/2] migration: allow enabling multifd for specific protocol only

2021-07-16 Thread Li Zhijian
And change the default to true so that '-incoming defer' can enable
multifd first.

Signed-off-by: Li Zhijian 
---
 migration/migration.c | 8 
 migration/multifd.c   | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/migration/migration.c b/migration/migration.c
index d6d48f6999b..bcc8b3bcb92 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1249,6 +1249,14 @@ static bool migrate_caps_check(bool *cap_list,
 }
 }
 
+/* incoming side only */
+if (runstate_check(RUN_STATE_INMIGRATE) &&
+!migrate_multifd_is_allowed() &&
+cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+error_setg(errp, "multifd is not supported by current protocol");
+return false;
+}
+
 return true;
 }
 
diff --git a/migration/multifd.c b/migration/multifd.c
index b3d99c79d83..372f3633eda 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -864,7 +864,7 @@ cleanup:
 multifd_new_send_channel_cleanup(p, sioc, local_err);
 }
 
-static bool migrate_allow_multifd;
+static bool migrate_allow_multifd = true;
 void migrate_protocal_allow_multifd(bool allow)
 {
 migrate_allow_multifd = allow;
-- 
2.31.1






[PATCH] migration/rdma: prevent from double free the same mr

2021-07-08 Thread Li Zhijian
backtrace:
'0x75f44ec2 in __ibv_dereg_mr_1_1 (mr=0x7fff1007d390) at 
/home/lizhijian/rdma-core/libibverbs/verbs.c:478
478 void *addr  = mr->addr;
(gdb) bt
 #0  0x75f44ec2 in __ibv_dereg_mr_1_1 (mr=0x7fff1007d390) at 
/home/lizhijian/rdma-core/libibverbs/verbs.c:478
 #1  0x55891fcc in rdma_delete_block (block=, 
rdma=0x7fff38176010) at ../migration/rdma.c:691
 #2  qemu_rdma_cleanup (rdma=0x7fff38176010) at ../migration/rdma.c:2365
 #3  0x558925b0 in qio_channel_rdma_close_rcu (rcu=0x56b8b6c0) at 
../migration/rdma.c:3073
 #4  0x55d652a3 in call_rcu_thread (opaque=opaque@entry=0x0) at 
../util/rcu.c:281
 #5  0x55d5edf9 in qemu_thread_start (args=0x7fffe88bb4d0) at 
../util/qemu-thread-posix.c:541
 #6  0x754c73f9 in start_thread () at /lib64/libpthread.so.0
 #7  0x753f3b03 in clone () at /lib64/libc.so.6 '

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index b6cc4bef4a8..0f22b8227c0 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1143,6 +1143,7 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
 
 for (i--; i >= 0; i--) {
 ibv_dereg_mr(local->block[i].mr);
+local->block[i].mr = NULL;
 rdma->total_registrations--;
 }
 
-- 
2.30.2






Re: [PATCH] Fix libpmem configuration option

2021-07-08 Thread Li , Zhijian/李 智坚


I have to apply the extra changes below:

From a8d027d3dfe70fb33363ad5934e163999fc29eec Mon Sep 17 00:00:00 2001
From: Li Zhijian 
Date: Thu, 8 Jul 2021 17:52:40 +0800
Subject: [PATCH] fix libpmem configuration

Signed-off-by: Li Zhijian 
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index a172c83e150..0a3a4610f27 100755
--- a/configure
+++ b/configure
@@ -4818,7 +4818,7 @@ elif test "$pthread_setname_np_wo_tid" = "yes" ; then
   echo "CONFIG_PTHREAD_SETNAME_NP_WO_TID=y" >> $config_host_mak
 fi
 
-if test "$libpmem" = "yes" ; then

+if test "$libpmem" = "enabled" ; then
   echo "CONFIG_LIBPMEM=y" >> $config_host_mak
   echo "LIBPMEM_LIBS=$libpmem_libs" >> $config_host_mak
   echo "LIBPMEM_CFLAGS=$libpmem_cflags" >> $config_host_mak
--
2.31.1

On 2021/7/7 15:51, Miroslav Rezanina wrote:

For some reason, libpmem option setting was set to work in an opposite
way (--enable-libpmem disabled it and vice versa). Fixing this so
configuration works properly.

Signed-off-by: Miroslav Rezanina 
---
  configure | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 7994bdee92..ffa93cc5fd 100755
--- a/configure
+++ b/configure
@@ -1501,9 +1501,9 @@ for opt do
;;
--disable-debug-mutex) debug_mutex=no
;;
-  --enable-libpmem) libpmem=disabled
+  --enable-libpmem) libpmem="enabled"
;;
-  --disable-libpmem) libpmem=enabled
+  --disable-libpmem) libpmem="disabled"
;;
--enable-xkbcommon) xkbcommon="enabled"
;;





[PATCH] misc: Remove redundant new line in perror()

2021-07-06 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 2 +-
 softmmu/cpus.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7fb9646f6ef..e99e2e16a73 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1133,7 +1133,7 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
 IBV_ACCESS_REMOTE_WRITE
 );
 if (!local->block[i].mr) {
-perror("Failed to register local dest ram block!\n");
+perror("Failed to register local dest ram block!");
 break;
 }
 rdma->total_registrations++;
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index c3caaeb26ec..071085f840b 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -325,7 +325,7 @@ static void sigbus_reraise(void)
 sigaddset(, SIGBUS);
 pthread_sigmask(SIG_UNBLOCK, , NULL);
 }
-perror("Failed to re-raise SIGBUS!\n");
+perror("Failed to re-raise SIGBUS!");
 abort();
 }
 
-- 
2.30.2






[PATCH] migration/rdma: Use error_report to suppress errno message

2021-06-28 Thread Li Zhijian
Since the prior calls were successful, errno does not indicate a real
error in this case; printing it would only be confusing.

before:
(qemu) migrate -d rdma:192.168.22.23:
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect: No space left on 
device

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index d90b29a4b51..b6cc4bef4a8 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1006,7 +1006,7 @@ route:
 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 ERROR(errp, "result not equal to event_addr_resolved %s",
 rdma_event_str(cm_event->event));
-perror("rdma_resolve_addr");
+error_report("rdma_resolve_addr");
 rdma_ack_cm_event(cm_event);
 ret = -EINVAL;
 goto err_resolve_get_addr;
@@ -2544,7 +2544,7 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp, bool return_path)
 }
 
 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
-perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
+error_report("rdma_get_cm_event != EVENT_ESTABLISHED after 
rdma_connect");
 ERROR(errp, "connecting to destination!");
 rdma_ack_cm_event(cm_event);
 goto err_rdma_source_connect;
-- 
2.31.1






[PATCH] nvdimm: release the correct device list

2021-06-24 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 hw/acpi/nvdimm.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index e3d5fe19392..ff317263e85 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -355,10 +355,10 @@ nvdimm_build_structure_caps(GArray *structures, uint32_t 
capabilities)
 
 static GArray *nvdimm_build_device_structure(NVDIMMState *state)
 {
-GSList *device_list = nvdimm_get_device_list();
+GSList *device_list, *list = nvdimm_get_device_list();
 GArray *structures = g_array_new(false, true /* clear */, 1);
 
-for (; device_list; device_list = device_list->next) {
+for (device_list = list; device_list; device_list = device_list->next) {
 DeviceState *dev = device_list->data;
 
 /* build System Physical Address Range Structure. */
@@ -373,7 +373,7 @@ static GArray *nvdimm_build_device_structure(NVDIMMState 
*state)
 /* build NVDIMM Control Region Structure. */
 nvdimm_build_structure_dcr(structures, dev);
 }
-g_slist_free(device_list);
+g_slist_free(list);
 
 if (state->persistence) {
 nvdimm_build_structure_caps(structures, state->persistence);
@@ -1339,9 +1339,9 @@ static void nvdimm_build_ssdt(GArray *table_offsets, 
GArray *table_data,
 
 void nvdimm_build_srat(GArray *table_data)
 {
-GSList *device_list = nvdimm_get_device_list();
+GSList *device_list, *list = nvdimm_get_device_list();
 
-for (; device_list; device_list = device_list->next) {
+for (device_list = list; device_list; device_list = device_list->next) {
 AcpiSratMemoryAffinity *numamem = NULL;
 DeviceState *dev = device_list->data;
 Object *obj = OBJECT(dev);
@@ -1356,7 +1356,7 @@ void nvdimm_build_srat(GArray *table_data)
 build_srat_memory(numamem, addr, size, node,
   MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE);
 }
-g_slist_free(device_list);
+g_slist_free(list);
 }
 
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
-- 
2.31.1






[PATCH v2 2/2] migration/rdma: rename cq and comp_channel with recv prefix

2021-06-18 Thread Li Zhijian
Make the code clearer.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 50 
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 16fe0688858..527972d4970 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -358,10 +358,10 @@ typedef struct RDMAContext {
 struct ibv_context  *verbs;
 struct rdma_event_channel   *channel;
 struct ibv_qp *qp;  /* queue pair */
-struct ibv_comp_channel *comp_channel;  /* completion channel */
+struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
 struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
 struct ibv_pd *pd;  /* protection domain */
-struct ibv_cq *cq;  /* completion queue */
+struct ibv_cq *recv_cq; /* recv completion queue */
 struct ibv_cq *send_cq; /* send completion queue */
 
 /*
@@ -1062,8 +1062,8 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
 }
 
 /* create completion channel */
-rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
-if (!rdma->comp_channel) {
+rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
+if (!rdma->recv_comp_channel) {
 error_report("failed to allocate completion channel");
 goto err_alloc_pd_cq;
 }
@@ -1071,9 +1071,9 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
 /*
  * Completion queue can be filled by read work requests.
  */
-rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
-NULL, rdma->comp_channel, 0);
-if (!rdma->cq) {
+rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+  NULL, rdma->recv_comp_channel, 0);
+if (!rdma->recv_cq) {
 error_report("failed to allocate completion queue");
 goto err_alloc_pd_cq;
 }
@@ -1098,18 +1098,18 @@ err_alloc_pd_cq:
 if (rdma->pd) {
 ibv_dealloc_pd(rdma->pd);
 }
-if (rdma->comp_channel) {
-ibv_destroy_comp_channel(rdma->comp_channel);
+if (rdma->recv_comp_channel) {
+ibv_destroy_comp_channel(rdma->recv_comp_channel);
 }
 if (rdma->send_comp_channel) {
 ibv_destroy_comp_channel(rdma->send_comp_channel);
 }
-if (rdma->cq) {
-ibv_destroy_cq(rdma->cq);
-rdma->cq = NULL;
+if (rdma->recv_cq) {
+ibv_destroy_cq(rdma->recv_cq);
+rdma->recv_cq = NULL;
 }
 rdma->pd = NULL;
-rdma->comp_channel = NULL;
+rdma->recv_comp_channel = NULL;
 rdma->send_comp_channel = NULL;
 return -1;
 
@@ -1128,7 +1128,7 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
 attr.cap.max_send_sge = 1;
 attr.cap.max_recv_sge = 1;
 attr.send_cq = rdma->send_cq;
-attr.recv_cq = rdma->cq;
+attr.recv_cq = rdma->recv_cq;
 attr.qp_type = IBV_QPT_RC;
 
 ret = rdma_create_qp(rdma->cm_id, rdma->pd, );
@@ -1606,12 +1606,12 @@ static int qemu_rdma_wait_comp_channel(RDMAContext 
*rdma,
 static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
 {
 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
-   rdma->comp_channel;
+   rdma->recv_comp_channel;
 }
 
 static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
 {
-return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->cq;
+return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
 }
 
 /*
@@ -2398,17 +2398,17 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
 rdma_destroy_qp(rdma->cm_id);
 rdma->qp = NULL;
 }
-if (rdma->cq) {
-ibv_destroy_cq(rdma->cq);
-rdma->cq = NULL;
+if (rdma->recv_cq) {
+ibv_destroy_cq(rdma->recv_cq);
+rdma->recv_cq = NULL;
 }
 if (rdma->send_cq) {
 ibv_destroy_cq(rdma->send_cq);
 rdma->send_cq = NULL;
 }
-if (rdma->comp_channel) {
-ibv_destroy_comp_channel(rdma->comp_channel);
-rdma->comp_channel = NULL;
+if (rdma->recv_comp_channel) {
+ibv_destroy_comp_channel(rdma->recv_comp_channel);
+rdma->recv_comp_channel = NULL;
 }
 if (rdma->send_comp_channel) {
 ibv_destroy_comp_channel(rdma->send_comp_channel);
@@ -3084,12 +3084,12 @@ static void 
qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
 {
 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
 if (io_read) {
-aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
+aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
false, io_read, io_wri

[PATCH v2 1/2] migration/rdma: Fix out of order wrid

2021-06-18 Thread Li Zhijian
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.22.23:
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL 
RECV (4000)

source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got 
CONTROL RECV (4000)

NOTE: soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0

This migration cannot be completed because out-of-order (OOO) CQ events occur.
OOO cases occur on both the source side and the destination side, and they
only matter when SEND and RECV are out of order. OOO between 'WRITE RDMA' and
'RECV' doesn't matter.

below the OOO sequence:
  source destination
  qemu_rdma_write_one()  qemu_rdma_registration_handle()
1.  post_recv X post_recv Y
2.  post_send X
3.  wait X CQ event
4.  X CQ event
5.  post_send Y
6.  wait Y CQ event
7.  Y CQ event (dropped)
8.  Y CQ event(send Y done)
9.  X CQ event(send X done)
10. wait Y CQ event(dropped at (7), blocks 
forever)

It looks like this only happens on a soft RoCE rdma device in my hundred or so
runs; a hardware IB device works fine.

Here we introduce an independent send completion queue to separate
ibv_post_send completions from the original mixed completion queue.
It helps us poll the specific CQE we are really interested in.

Signed-off-by: Li Zhijian 
---
V2 Introduce send completion queue
---
 migration/rdma.c | 94 
 1 file changed, 79 insertions(+), 15 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index d90b29a4b51..16fe0688858 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -359,8 +359,10 @@ typedef struct RDMAContext {
 struct rdma_event_channel   *channel;
 struct ibv_qp *qp;  /* queue pair */
 struct ibv_comp_channel *comp_channel;  /* completion channel */
+struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
 struct ibv_pd *pd;  /* protection domain */
 struct ibv_cq *cq;  /* completion queue */
+struct ibv_cq *send_cq; /* send completion queue */
 
 /*
  * If a previous write failed (perhaps because of a failed
@@ -1067,8 +1069,7 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
 }
 
 /*
- * Completion queue can be filled by both read and write work requests,
- * so must reflect the sum of both possible queue sizes.
+ * Completion queue can be filled by read work requests.
  */
 rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
 NULL, rdma->comp_channel, 0);
@@ -1077,6 +1078,20 @@ s

[PATCH] docs/nvdimm: update doc

2021-06-10 Thread Li Zhijian
The error message changed in def835f0da ('hostmem: Don't report pmem attribute 
if unsupported'), so update the documentation to mention the new message.

Signed-off-by: Li Zhijian 
---
 docs/nvdimm.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
index 0aae682be3e..71cdbdf554b 100644
--- a/docs/nvdimm.txt
+++ b/docs/nvdimm.txt
@@ -247,7 +247,8 @@ is built with libpmem [2] support (configured with 
--enable-libpmem), QEMU
 will take necessary operations to guarantee the persistence of its own writes
 to the vNVDIMM backend(e.g., in vNVDIMM label emulation and live migration).
 If 'pmem' is 'on' while there is no libpmem support, qemu will exit and report
-a "lack of libpmem support" message to ensure the persistence is available.
+a "lack of libpmem support" (or "Invalid parameter 'pmem'" since v6.0.0)
+message to ensure the persistence is available.
 For example, if we want to ensure the persistence for some backend file,
 use the QEMU command line:
 
-- 
2.30.2






[RFC PATCH] migration/rdma: Fix out of order wrid

2021-06-10 Thread Li Zhijian
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.22.23:
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL 
RECV (4000)

source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: 
warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name 
uverbs2, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs2, infiniband class device path 
/sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got 
CONTROL RECV (4000)

NOTE: soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0

This migration cannot be completed because out-of-order (OOO) CQ events occur.
OOO cases occur on both the source side and the destination side, and they
only matter when SEND and RECV are out of order. OOO between 'WRITE RDMA' and
'RECV' doesn't matter.

below the OOO sequence:
  source destination
  qemu_rdma_write_one()  qemu_rdma_registration_handle()
1.  post_recv X post_recv Y
2.  post_send X
3.  wait X CQ event
4.  X CQ event
5.  post_send Y
6.  wait Y CQ event
7.  Y CQ event (dropped)
8.  Y CQ event(send Y done)
9.  X CQ event(send X done)
10. wait Y CQ event(dropped at (7), blocks 
forever)

It looks like this only happens on a soft RoCE rdma device in my hundred or so
runs; a hardware IB device works fine.

Signed-off-by: Li Zhijian 
---
This is just a draft to address this problem. One possible approach
could be creating independent CQs for SEND and RECV, which would
help us poll the CQ we are really interested in. But that could be a
big change.
---
 migration/rdma.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index b703bf1b918..7a2b0a8853e 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -364,6 +364,8 @@ typedef struct RDMAContext {
 struct ibv_comp_channel *comp_channel;  /* completion channel */
 struct ibv_pd *pd;  /* protection domain */
 struct ibv_cq *cq;  /* completion queue */
+int64_t ooo_wrid;
+int64_t ooo_wrid_byte_len;
 
 /*
  * If a previous write failed (perhaps because of a failed
@@ -1612,11 +1614,32 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, 
int wrid_requested,
 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
 
 if (wr_id == RDMA_WRID_NONE) {
+if (rdma->ooo_wrid >= RDMA_WRID_SEND_CONTROL && rdma->ooo_wrid == 
wrid_requested) {
+error_report("get expected ooo wrid %d", wrid_requested);
+if (byte_len && rdma->ooo_wrid_byte_len != -1) {
+*byte_len = rdma->ooo_wrid_byte_len;
+rdma->ooo_wrid = RDMA_WRID_NONE;
+return 0;
+}
+}
 break;
 }
 if (wr_id != wrid_requested) {

[PATCH] migration/rdma: Fix cm event use after free

2021-06-01 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 1cdb4561f32..d90b29a4b51 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1539,16 +1539,20 @@ static int qemu_rdma_wait_comp_channel(RDMAContext 
*rdma)
 
 if (pfds[1].revents) {
 ret = rdma_get_cm_event(rdma->channel, _event);
-if (!ret) {
-rdma_ack_cm_event(cm_event);
+if (ret) {
+error_report("failed to get cm event while wait "
+ "completion channel");
+return -EPIPE;
 }
 
 error_report("receive cm event while wait comp channel,"
  "cm event is %d", cm_event->event);
 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
+rdma_ack_cm_event(cm_event);
 return -EPIPE;
 }
+rdma_ack_cm_event(cm_event);
 }
 break;
 
@@ -3285,7 +3289,6 @@ static void rdma_cm_poll_handler(void *opaque)
 error_report("get_cm_event failed %d", errno);
 return;
 }
-rdma_ack_cm_event(cm_event);
 
 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
@@ -3298,12 +3301,14 @@ static void rdma_cm_poll_handler(void *opaque)
 rdma->return_path->error_state = -EPIPE;
 }
 }
+rdma_ack_cm_event(cm_event);
 
 if (mis->migration_incoming_co) {
 qemu_coroutine_enter(mis->migration_incoming_co);
 }
 return;
 }
+rdma_ack_cm_event(cm_event);
 }
 
 static int qemu_rdma_accept(RDMAContext *rdma)
-- 
2.30.2






[PATCH v2 4/4] migration/rdma: source: poll cm_event from return path

2021-05-25 Thread Li Zhijian
The source side always blocks if postcopy is only enabled on the source side.
Users are not able to cancel the migration in this case.

Give the source side a chance to cancel the migration.

Signed-off-by: Li Zhijian 
---
V2: utilize poll to check cm event
---
 migration/rdma.c | 42 ++
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index d829d08d076..f67e21b4f54 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -36,6 +36,7 @@
 #include 
 #include "trace.h"
 #include "qom/object.h"
+#include 
 
 /*
  * Print and error on both the Monitor and the Log file.
@@ -2460,7 +2461,36 @@ err_rdma_source_init:
 return -1;
 }
 
-static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
+static int qemu_get_cm_event_timeout(RDMAContext *rdma,
+ struct rdma_cm_event **cm_event,
+ long msec, Error **errp)
+{
+int ret;
+struct pollfd poll_fd = {
+.fd = rdma->channel->fd,
+.events = POLLIN,
+.revents = 0
+};
+
+do {
+ret = poll(_fd, 1, msec);
+} while (ret < 0 && errno == EINTR);
+
+if (ret == 0) {
+ERROR(errp, "poll cm event timeout");
+return -1;
+} else if (ret < 0) {
+ERROR(errp, "failed to pull cm event, errno=%i", errno);
+return -1;
+} else if (poll_fd.revents & POLLIN) {
+return rdma_get_cm_event(rdma->channel, cm_event);
+} else {
+ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
+return -1;
+}
+}
+
+static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
 {
 RDMACapabilities cap = {
 .version = RDMA_CONTROL_VERSION_CURRENT,
@@ -2498,7 +2528,11 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 goto err_rdma_source_connect;
 }
 
-ret = rdma_get_cm_event(rdma->channel, _event);
+if (return_path) {
+ret = qemu_get_cm_event_timeout(rdma, _event, 5000, errp);
+} else {
+ret = rdma_get_cm_event(rdma->channel, _event);
+}
 if (ret) {
 perror("rdma_get_cm_event after rdma_connect");
 ERROR(errp, "connecting to destination!");
@@ -4111,7 +4145,7 @@ void rdma_start_outgoing_migration(void *opaque,
 }
 
 trace_rdma_start_outgoing_migration_after_rdma_source_init();
-ret = qemu_rdma_connect(rdma, errp);
+ret = qemu_rdma_connect(rdma, errp, false);
 
 if (ret) {
 goto err;
@@ -4132,7 +4166,7 @@ void rdma_start_outgoing_migration(void *opaque,
 goto return_path_err;
 }
 
-ret = qemu_rdma_connect(rdma_return_path, errp);
+ret = qemu_rdma_connect(rdma_return_path, errp, true);
 
 if (ret) {
 goto return_path_err;
-- 
2.30.2






[PATCH v2 2/4] migration/rdma: Fix rdma_addrinfo res leaks

2021-05-25 Thread Li Zhijian
rdma_freeaddrinfo() is the reverse operation of rdma_getaddrinfo()

Signed-off-by: Li Zhijian 
Reviewed-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7e7595faabf..651534e8255 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -987,10 +987,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, 
Error **errp)
 }
 }
 
+rdma_freeaddrinfo(res);
 ERROR(errp, "could not resolve address %s", rdma->host);
 goto err_resolve_get_addr;
 
 route:
+rdma_freeaddrinfo(res);
 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 
 ret = rdma_get_cm_event(rdma->channel, _event);
@@ -2593,6 +2595,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error 
**errp)
 break;
 }
 
+rdma_freeaddrinfo(res);
 if (!e) {
 ERROR(errp, "Error: could not rdma_bind_addr!");
 goto err_dest_init_bind_addr;
-- 
2.30.2






[PATCH v2 3/4] migration/rdma: destination: create the return path after the first accept

2021-05-25 Thread Li Zhijian
destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.1.10:
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name 
uverbs0, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs0, infiniband class device path 
/sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)

 (gdb) bt
 #0  qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
 #1  rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
 #2  0x563c9e51f02a in aio_dispatch_handler
 (ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at 
../util/aio-posix.c:329
 #3  0x563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at  
../util/aio-posix.c:372
 #4  aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
 #5  0x563c9e4f4d9e in aio_ctx_dispatch (source=,  
callback=, user_data=)at ../util/async.c:306
 #6  0x7fe96ef3fa9f in g_main_context_dispatch () at  
/lib64/libglib-2.0.so.0
 #7  0x563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
 #8  os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
 #9  main_loop_wait (nonblocking=nonblocking@entry=0) at 
../util/main-loop.c:530
 #10 0x563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
 #11 0x563c9dfd46fe in main (argc=, argv=, envp=) at ../softmmu/main.c:50

The rdma return path will not be created when qemu incoming is starting
since migrate_postcopy() is false at that moment, so a NULL return path
rdma is dereferenced if the user enables postcopy later.

Signed-off-by: Li Zhijian 
---
V2: alloc memory for host_port
---
 migration/rdma.c | 32 +---
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 651534e8255..d829d08d076 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -316,6 +316,7 @@ typedef struct RDMALocalBlocks {
 typedef struct RDMAContext {
 char *host;
 int port;
+char *host_port;
 
 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
 
@@ -2392,7 +2393,9 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
 rdma->channel = NULL;
 }
 g_free(rdma->host);
+g_free(rdma->host_port);
 rdma->host = NULL;
+rdma->host_port = NULL;
 }
 
 
@@ -2648,6 +2651,7 @@ static void *qemu_rdma_data_init(const char *host_port, 
Error **errp)
 if (!inet_parse(addr, host_port, NULL)) {
 rdma->port = atoi(addr->port);
 rdma->host = g_strdup(addr->host);
+rdma->host_port = g_strdup(host_port);
 } else {
 ERROR(errp, "bad RDMA migration address '%s'", host_port);
 g_free(rdma);
@@ -3276,6 +3280,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 .private_data = ,
 .private_data_len = sizeof(cap),
  };
+RDMAContext *rdma_return_path = NULL;
 struct rdma_cm_event *cm_event;
 struct ibv_context *verbs;
 int ret = -EINVAL;
@@ -3291,6 +3296,20 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 goto err_rdma_dest_wait;
 }
 
+/*
+ * initialize the RDMAContext for return path for postcopy after first
+ * connection request reached.
+ */
+if (migrate_postcopy() && !rdma->is_return_path) {
+rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
+if (rdma_return_path == NULL) {
+rdma_ack_cm_event(cm_event);
+goto err_rdma_dest_wait;
+}
+
+qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
+}
+
 memcpy(, cm_event->param.conn.private_data, sizeof(cap));
 
 network_to_caps();
@@ -3406,6 +3425,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 err_rdma_dest_wait:
 rdma->error_state = ret;
 qemu_rdma_cleanup(rdma);
+g_free(rdma_return_path);
 return ret;
 }
 
@@ -4048,17 +4068,6 @@ void rdma_start_incoming_migration(const char 
*host_port, Error **errp)
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
 
-/* initialize the RDMAContext for return path */
-if (migrate_postcopy()) {
-rdma_return_path = qemu_rdma_data_init(host_port, _err);
-
-if (rdma_return_path == NULL) {
-goto cleanup_rdma;
-}
-
-qemu_rdma_return_path_dest_init(rdm

[PATCH v2 1/4] migration/rdma: cleanup rdma in rdma_start_incoming_migration error path

2021-05-25 Thread Li Zhijian
The error path after calling qemu_rdma_dest_init() should clean up rdma.

Signed-off-by: Li Zhijian 
Reviewed-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 41726cc74a8..7e7595faabf 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -4040,7 +4040,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 
 if (ret) {
 ERROR(errp, "listening on socket!");
-goto err;
+goto cleanup_rdma;
 }
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
@@ -4050,7 +4050,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 rdma_return_path = qemu_rdma_data_init(host_port, _err);
 
 if (rdma_return_path == NULL) {
-goto err;
+goto cleanup_rdma;
 }
 
 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
@@ -4059,6 +4059,9 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
 NULL, (void *)(intptr_t)rdma);
 return;
+
+cleanup_rdma:
+qemu_rdma_cleanup(rdma);
 err:
 error_propagate(errp, local_err);
 if (rdma) {
-- 
2.30.2






[PATCH RESEND 4/4] migration/rdma: source: get accept cm_event from return path in non-block mode

2021-05-20 Thread Li Zhijian
The source side always blocks if postcopy is only enabled on the source side.
Users are not able to cancel the migration in this case.

Here we try to get the cm_event every 100ms until timeout.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 59 
 1 file changed, 55 insertions(+), 4 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 3b228c46eb..181ad03849 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2458,7 +2458,54 @@ err_rdma_source_init:
 return -1;
 }
 
-static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
+#define RDMA_GET_EVENT_INTERVAL 10 /* 100ms */
+static int qemu_get_cm_event_timeout(RDMAContext *rdma,
+ struct rdma_cm_event **cm_event,
+ long sec, Error **errp)
+{
+long wait_ns = 0;
+int ret;
+int flags = fcntl(rdma->channel->fd, F_GETFL), save_flags;
+
+if (flags == -1) {
+perror("failed to get file flags");
+return flags;
+}
+save_flags = flags;
+flags |= O_NONBLOCK;
+ret = fcntl(rdma->channel->fd, F_SETFL, flags);
+if (ret) {
+perror("failed to set file flags nonblocking");
+return ret;
+}
+
+retry:
+ret = rdma_get_cm_event(rdma->channel, cm_event);
+if (ret && errno == EAGAIN) {
+if (wait_ns < sec * 100) {
+perror("rdma_get_cm_event after rdma_connect");
+wait_ns += RDMA_GET_EVENT_INTERVAL;
+usleep(RDMA_GET_EVENT_INTERVAL);
+goto retry;
+}
+}
+if (ret) {
+perror("rdma_get_cm_event after rdma_connect");
+ERROR(errp, "connecting to destination!");
+return ret;
+}
+
+/* restore flags */
+ret = fcntl(rdma->channel->fd, F_SETFL, save_flags);
+if (ret) {
+rdma_ack_cm_event(*cm_event);
+perror("failed to restore file flags");
+}
+
+return ret;
+}
+
+static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
 {
 RDMACapabilities cap = {
 .version = RDMA_CONTROL_VERSION_CURRENT,
@@ -2496,7 +2543,11 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 goto err_rdma_source_connect;
 }
 
-ret = rdma_get_cm_event(rdma->channel, _event);
+if (return_path) {
+ret = qemu_get_cm_event_timeout(rdma, _event, 2, errp);
+} else {
+ret = rdma_get_cm_event(rdma->channel, _event);
+}
 if (ret) {
 perror("rdma_get_cm_event after rdma_connect");
 ERROR(errp, "connecting to destination!");
@@ -4108,7 +4159,7 @@ void rdma_start_outgoing_migration(void *opaque,
 }
 
 trace_rdma_start_outgoing_migration_after_rdma_source_init();
-ret = qemu_rdma_connect(rdma, errp);
+ret = qemu_rdma_connect(rdma, errp, false);
 
 if (ret) {
 goto err;
@@ -4129,7 +4180,7 @@ void rdma_start_outgoing_migration(void *opaque,
 goto return_path_err;
 }
 
-ret = qemu_rdma_connect(rdma_return_path, errp);
+ret = qemu_rdma_connect(rdma_return_path, errp, true);
 
 if (ret) {
 goto return_path_err;
-- 
2.30.2






[PATCH RESEND 3/4] migration/rdma: destination: create the return path after the first accept

2021-05-20 Thread Li Zhijian
destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.1.10:
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name 
uverbs0, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs0, infiniband class device path 
/sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)

 (gdb) bt
 #0  qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
 #1  rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
 #2  0x563c9e51f02a in aio_dispatch_handler
 (ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at 
../util/aio-posix.c:329
 #3  0x563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at  
../util/aio-posix.c:372
 #4  aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
 #5  0x563c9e4f4d9e in aio_ctx_dispatch (source=,  
callback=, user_data=)at ../util/async.c:306
 #6  0x7fe96ef3fa9f in g_main_context_dispatch () at  
/lib64/libglib-2.0.so.0
 #7  0x563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
 #8  os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
 #9  main_loop_wait (nonblocking=nonblocking@entry=0) at 
../util/main-loop.c:530
 #10 0x563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
 #11 0x563c9dfd46fe in main (argc=, argv=, envp=) at ../softmmu/main.c:50

The rdma return path will not be created when qemu incoming is starting
since migrate_postcopy() is false at that moment, so a NULL return path
rdma is dereferenced if the user enables postcopy later.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 29 ++---
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 651534e825..3b228c46eb 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -316,6 +316,7 @@ typedef struct RDMALocalBlocks {
 typedef struct RDMAContext {
 char *host;
 int port;
+const char *host_port;
 
 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
 
@@ -2648,6 +2649,7 @@ static void *qemu_rdma_data_init(const char *host_port, 
Error **errp)
 if (!inet_parse(addr, host_port, NULL)) {
 rdma->port = atoi(addr->port);
 rdma->host = g_strdup(addr->host);
+rdma->host_port = host_port;
 } else {
 ERROR(errp, "bad RDMA migration address '%s'", host_port);
 g_free(rdma);
@@ -3276,6 +3278,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 .private_data = ,
 .private_data_len = sizeof(cap),
  };
+RDMAContext *rdma_return_path = NULL;
 struct rdma_cm_event *cm_event;
 struct ibv_context *verbs;
 int ret = -EINVAL;
@@ -3291,6 +3294,20 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 goto err_rdma_dest_wait;
 }
 
+/*
+ * initialize the RDMAContext for return path for postcopy after first
+ * connection is accepted.
+ */
+if (migrate_postcopy() && !rdma->is_return_path) {
+rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
+if (rdma_return_path == NULL) {
+rdma_ack_cm_event(cm_event);
+goto err_rdma_dest_wait;
+}
+
+qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
+}
+
 memcpy(, cm_event->param.conn.private_data, sizeof(cap));
 
 network_to_caps();
@@ -3406,6 +3423,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 err_rdma_dest_wait:
 rdma->error_state = ret;
 qemu_rdma_cleanup(rdma);
+g_free(rdma_return_path);
 return ret;
 }
 
@@ -4048,17 +4066,6 @@ void rdma_start_incoming_migration(const char 
*host_port, Error **errp)
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
 
-/* initialize the RDMAContext for return path */
-if (migrate_postcopy()) {
-rdma_return_path = qemu_rdma_data_init(host_port, _err);
-
-if (rdma_return_path == NULL) {
-goto cleanup_rdma;
-}
-
-qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
-}
-
 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
 NULL, (void *)(intptr_t)rdma);
 return;
-- 
2.30.2






[PATCH RESEND 2/4] migration/rdma: Fix rdma_addrinfo res leaks

2021-05-20 Thread Li Zhijian
rdma_freeaddrinfo() is the reverse operation of rdma_getaddrinfo()

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7e7595faab..651534e825 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -987,10 +987,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, 
Error **errp)
 }
 }
 
+rdma_freeaddrinfo(res);
 ERROR(errp, "could not resolve address %s", rdma->host);
 goto err_resolve_get_addr;
 
 route:
+rdma_freeaddrinfo(res);
 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 
 ret = rdma_get_cm_event(rdma->channel, _event);
@@ -2593,6 +2595,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error 
**errp)
 break;
 }
 
+rdma_freeaddrinfo(res);
 if (!e) {
 ERROR(errp, "Error: could not rdma_bind_addr!");
 goto err_dest_init_bind_addr;
-- 
2.30.2






[PATCH RESEND 1/4] migration/rdma: cleanup rdma in rdma_start_incoming_migration error path

2021-05-20 Thread Li Zhijian
The error path after calling qemu_rdma_dest_init() should clean up rdma.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 41726cc74a..7e7595faab 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -4040,7 +4040,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 
 if (ret) {
 ERROR(errp, "listening on socket!");
-goto err;
+goto cleanup_rdma;
 }
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
@@ -4050,7 +4050,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 rdma_return_path = qemu_rdma_data_init(host_port, _err);
 
 if (rdma_return_path == NULL) {
-goto err;
+goto cleanup_rdma;
 }
 
 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
@@ -4059,6 +4059,9 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
 NULL, (void *)(intptr_t)rdma);
 return;
+
+cleanup_rdma:
+qemu_rdma_cleanup(rdma);
 err:
 error_propagate(errp, local_err);
 if (rdma) {
-- 
2.30.2






[PATCH 3/4] migration/rdma: destination: create the return path after the first accept

2021-05-20 Thread Li Zhijian
From: Your Name 

destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev 
tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device 
e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive 
if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 
2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl 
-spice streaming-video=filter,port=5902,disable-ticketing -incoming 
rdma:192.168.1.10:
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name 
uverbs0, infiniband_verbs class device path 
/sys/class/infiniband_verbs/uverbs0, infiniband class device path 
/sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)

 (gdb) bt
 #0  qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
 #1  rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
 #2  0x563c9e51f02a in aio_dispatch_handler
 (ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at 
../util/aio-posix.c:329
 #3  0x563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at  
../util/aio-posix.c:372
 #4  aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
 #5  0x563c9e4f4d9e in aio_ctx_dispatch (source=,  
callback=, user_data=)at ../util/async.c:306
 #6  0x7fe96ef3fa9f in g_main_context_dispatch () at  
/lib64/libglib-2.0.so.0
 #7  0x563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
 #8  os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
 #9  main_loop_wait (nonblocking=nonblocking@entry=0) at 
../util/main-loop.c:530
 #10 0x563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
 #11 0x563c9dfd46fe in main (argc=, argv=, envp=) at ../softmmu/main.c:50

The rdma return path will not be created when qemu incoming is starting
since migrate_postcopy() is false at that moment, so a NULL return path
rdma is dereferenced if the user enables postcopy later.

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 29 ++---
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 651534e825..3b228c46eb 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -316,6 +316,7 @@ typedef struct RDMALocalBlocks {
 typedef struct RDMAContext {
 char *host;
 int port;
+const char *host_port;
 
 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
 
@@ -2648,6 +2649,7 @@ static void *qemu_rdma_data_init(const char *host_port, 
Error **errp)
 if (!inet_parse(addr, host_port, NULL)) {
 rdma->port = atoi(addr->port);
 rdma->host = g_strdup(addr->host);
+rdma->host_port = host_port;
 } else {
 ERROR(errp, "bad RDMA migration address '%s'", host_port);
 g_free(rdma);
@@ -3276,6 +3278,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 .private_data = ,
 .private_data_len = sizeof(cap),
  };
+RDMAContext *rdma_return_path = NULL;
 struct rdma_cm_event *cm_event;
 struct ibv_context *verbs;
 int ret = -EINVAL;
@@ -3291,6 +3294,20 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 goto err_rdma_dest_wait;
 }
 
+/*
+ * initialize the RDMAContext for return path for postcopy after first
+ * connection is accepted.
+ */
+if (migrate_postcopy() && !rdma->is_return_path) {
+rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
+if (rdma_return_path == NULL) {
+rdma_ack_cm_event(cm_event);
+goto err_rdma_dest_wait;
+}
+
+qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
+}
+
 memcpy(, cm_event->param.conn.private_data, sizeof(cap));
 
 network_to_caps();
@@ -3406,6 +3423,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 err_rdma_dest_wait:
 rdma->error_state = ret;
 qemu_rdma_cleanup(rdma);
+g_free(rdma_return_path);
 return ret;
 }
 
@@ -4048,17 +4066,6 @@ void rdma_start_incoming_migration(const char 
*host_port, Error **errp)
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
 
-/* initialize the RDMAContext for return path */
-if (migrate_postcopy()) {
-rdma_return_path = qemu_rdma_data_init(host_port, _err);
-
-if (rdma_return_path == NULL) {
-goto cleanup_rdma;
-}
-
-qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
-}
-
 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
 NULL, (void *)(intptr_t)rdma);
 return;
-- 
2.30.2






[PATCH 1/4] migration/rdma: cleanup rdma in rdma_start_incoming_migration error path

2021-05-20 Thread Li Zhijian
the error path after calling qemu_rdma_dest_init() should do rdma cleanup

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 41726cc74a..7e7595faab 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -4040,7 +4040,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 
 if (ret) {
 ERROR(errp, "listening on socket!");
-goto err;
+goto cleanup_rdma;
 }
 
 trace_rdma_start_incoming_migration_after_rdma_listen();
@@ -4050,7 +4050,7 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 rdma_return_path = qemu_rdma_data_init(host_port, _err);
 
 if (rdma_return_path == NULL) {
-goto err;
+goto cleanup_rdma;
 }
 
 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
@@ -4059,6 +4059,9 @@ void rdma_start_incoming_migration(const char *host_port, 
Error **errp)
 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
 NULL, (void *)(intptr_t)rdma);
 return;
+
+cleanup_rdma:
+qemu_rdma_cleanup(rdma);
 err:
 error_propagate(errp, local_err);
 if (rdma) {
-- 
2.30.2






[PATCH 2/4] migration/rdma: Fix rdma_addrinfo res leaks

2021-05-20 Thread Li Zhijian
rdma_freeaddrinfo() is the reverse operation of rdma_getaddrinfo()

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7e7595faab..651534e825 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -987,10 +987,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, 
Error **errp)
 }
 }
 
+rdma_freeaddrinfo(res);
 ERROR(errp, "could not resolve address %s", rdma->host);
 goto err_resolve_get_addr;
 
 route:
+rdma_freeaddrinfo(res);
 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 
 ret = rdma_get_cm_event(rdma->channel, _event);
@@ -2593,6 +2595,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error 
**errp)
 break;
 }
 
+rdma_freeaddrinfo(res);
 if (!e) {
 ERROR(errp, "Error: could not rdma_bind_addr!");
 goto err_dest_init_bind_addr;
-- 
2.30.2






[PATCH v2] migration/rdma: Fix cm_event used before being initialized

2021-05-19 Thread Li Zhijian
A segmentation fault was triggered when I tried to abort a postcopy + rdma
migration.

This happens because rdma_ack_cm_event() releases an uninitialized cm_event
in this case.

like below:
2496 ret = rdma_get_cm_event(rdma->channel, _event);
2497 if (ret) {
2498 perror("rdma_get_cm_event after rdma_connect");
2499 ERROR(errp, "connecting to destination!");
2500 rdma_ack_cm_event(cm_event); <<<< cause segmentation fault
2501 goto err_rdma_source_connect;
2502 }

According to the rdma_get_cm_event() code, cm_event is
updated/changed only if rdma_get_cm_event() returns 0. So it's okay to
remove the ack in the error path.

Signed-off-by: Li Zhijian 

---
V2: remove ack from the error path (Dave)
---
 migration/rdma.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 00eac34232..41726cc74a 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2497,7 +2497,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 if (ret) {
 perror("rdma_get_cm_event after rdma_connect");
 ERROR(errp, "connecting to destination!");
-rdma_ack_cm_event(cm_event);
 goto err_rdma_source_connect;
 }
 
-- 
2.30.2






[PATCH] migration/rdma: Fix cm_event used before being initialized

2021-05-13 Thread Li Zhijian
A segmentation fault was triggered when I tried to abort a postcopy + rdma
migration.

This happens because rdma_ack_cm_event() releases an uninitialized cm_event
in this case.

like below:
2496 ret = rdma_get_cm_event(rdma->channel, _event);
2497 if (ret) {
2498 perror("rdma_get_cm_event after rdma_connect");
2499 ERROR(errp, "connecting to destination!");
2500 rdma_ack_cm_event(cm_event); <<<< cause segmentation fault
2501 goto err_rdma_source_connect;
2502 }

Signed-off-by: Li Zhijian 
---
 migration/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 00eac34232..2dadb62aed 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2466,7 +2466,7 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
   .private_data = ,
   .private_data_len = sizeof(cap),
 };
-struct rdma_cm_event *cm_event;
+struct rdma_cm_event *cm_event = NULL;
 int ret;
 
 /*
-- 
2.30.2






[PATCH v2] block: Improve backing file validation

2021-05-11 Thread Li Zhijian
Imagine the user cases below:
case 1:
```
$ qemu-img create -f raw source.raw 1G
$ qemu-img create -f qcow2 -F raw -b source.raw ./source.raw
qemu-img info source.raw
image: source.raw
file format: qcow2
virtual size: 193K (197120 bytes)
disk size: 196K
cluster_size: 65536
backing file: source.raw <<<<<<
backing file format: raw
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
```

case 2:
```
$ qemu-img create -f raw source.raw 1G
$ ln -sf source.raw destination.qcow2
$ qemu-img create -f qcow2 -F raw -b source.raw ./destination.qcow2
qemu-img info source.raw
image: source.raw
file format: qcow2 <<<<<<
virtual size: 2.0G (2147483648 bytes)
disk size: 196K
cluster_size: 65536
backing file: source.raw
backing file format: raw
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
```
Generally, we don't expect source.raw to be corrupted in any case, but
it actually is.

Here we check their device and inode numbers instead of the file name.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: Li Zhijian 

---
v2: utilize stat() instead of realpath() (Daniel)

Signed-off-by: Li Zhijian 
---
 block.c | 39 ---
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index 9ad725d205..db4ae57959 100644
--- a/block.c
+++ b/block.c
@@ -6431,6 +6431,37 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
 return true;
 }
 
+static bool validate_backing_file(const char *filename,
+  const char *backing_file, Error **errp)
+{
+struct stat filename_stat, backing_stat;
+
+if (backing_file[0] == '\0') {
+error_setg(errp, "Expected backing file name, got empty string");
+goto out;
+}
+
+/* check whether filename and backing_file are refering to the same file */
+if (stat(backing_file, _stat) == -1) {
+error_setg(errp, "Cannot stat backing file %s", backing_file);
+goto out;
+}
+if (stat(filename, _stat) == -1) {
+/* Simply consider filename doesn't exist, no need to further check */
+return true;
+}
+if ((filename_stat.st_dev == backing_stat.st_dev) &&
+(filename_stat.st_ino == backing_stat.st_ino)) {
+error_setg(errp, "Error: Trying to create an image with the "
+ "same filename as the backing file");
+goto out;
+}
+
+return true;
+out:
+return false;
+}
+
 void bdrv_img_create(const char *filename, const char *fmt,
  const char *base_filename, const char *base_fmt,
  char *options, uint64_t img_size, int flags, bool quiet,
@@ -6507,13 +6538,7 @@ void bdrv_img_create(const char *filename, const char 
*fmt,
 
 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
 if (backing_file) {
-if (!strcmp(filename, backing_file)) {
-error_setg(errp, "Error: Trying to create an image with the "
- "same filename as the backing file");
-goto out;
-}
-if (backing_file[0] == '\0') {
-error_setg(errp, "Expected backing file name, got empty string");
+if (!validate_backing_file(filename, backing_file, errp)) {
 goto out;
 }
 }
-- 
2.30.2






[PATCH] block: Improve backing file validation

2021-05-09 Thread Li Zhijian
Imagine the user cases below:
case 1:
```
$ qemu-img create -f raw source.raw 1G
$ qemu-img create -f qcow2 -F raw -b source.raw ./source.raw
qemu-img info source.raw
image: source.raw
file format: qcow2
virtual size: 193K (197120 bytes)
disk size: 196K
cluster_size: 65536
backing file: source.raw <<<<<<
backing file format: raw
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
```

case 2:
```
$ qemu-img create -f raw source.raw 1G
$ ln -sf source.raw destination.qcow2
$ qemu-img create -f qcow2 -F raw -b source.raw ./destination.qcow2
qemu-img info source.raw
image: source.raw
file format: qcow2 <<<<<<
virtual size: 2.0G (2147483648 bytes)
disk size: 196K
cluster_size: 65536
backing file: source.raw
backing file format: raw
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
```
Generally, we don't expect source.raw to be corrupted in any case, but
it actually is.

Here we validate the real path of each file instead of the input string.

Signed-off-by: Li Zhijian 
---
 block.c | 46 +++---
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index 9ad725d205..523845b763 100644
--- a/block.c
+++ b/block.c
@@ -6431,6 +6431,44 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
 return true;
 }
 
+static bool validate_backing_file(const char *filename,
+  const char *backing_file, Error **errp)
+{
+bool ret = false;
+char *rf, *real_filename = g_malloc0(PATH_MAX + 1);
+char *rb, *real_backing = g_malloc0(PATH_MAX + 1);
+
+rf = realpath(filename, real_filename);
+if (!rf) {
+if (errno == ENOENT) {
+/* filename doesn't exit, ignore it */
+rf = (char *)filename;
+} else {
+error_setg(errp, "Failed to resolve %s", filename);
+goto out;
+}
+}
+rb = realpath(backing_file, real_backing);
+if (!rb) {
+error_setg(errp, "Failed to resolve %s", backing_file);
+goto out;
+}
+if (!strcmp(rf, rb)) {
+error_setg(errp, "Error: Trying to create an image with the "
+"same filename as the backing file");
+goto out;
+}
+if (backing_file[0] == '\0') {
+error_setg(errp, "Expected backing file name, got empty string");
+goto out;
+}
+ret = true;
+out:
+g_free(real_filename);
+g_free(real_backing);
+return ret;
+}
+
 void bdrv_img_create(const char *filename, const char *fmt,
  const char *base_filename, const char *base_fmt,
  char *options, uint64_t img_size, int flags, bool quiet,
@@ -6507,13 +6545,7 @@ void bdrv_img_create(const char *filename, const char 
*fmt,
 
 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
 if (backing_file) {
-if (!strcmp(filename, backing_file)) {
-error_setg(errp, "Error: Trying to create an image with the "
- "same filename as the backing file");
-goto out;
-}
-if (backing_file[0] == '\0') {
-error_setg(errp, "Expected backing file name, got empty string");
+if (!validate_backing_file(filename, backing_file, errp)) {
 goto out;
 }
 }
-- 
2.30.2






Re: [Bug 1914696] Re: aarch64: migration failed: Segmentation fault (core dumped)

2021-02-05 Thread Li Zhijian




On 2/5/21 5:52 PM, Claudio Fontana wrote:

Hi LiZhiJian,

just one thing I noticed in your call stack, your $Subject talks about AArch64,
and you end up in arm_v7m_mmu_idx_for_secstate,

which means that ARM_FEATURE_M is detected, so definitely something looks off 
when it comes to the feature bits..

That's true, so I would boldly guess that migration across this FBC is not trustworthy.
The FBC changed the feature bits offset. Previously ARM_FEATURE = 11 (source
side); after this commit (destination side), it became ARM_FEATURE = 9.




Ciao,

Claudio

On 2/5/21 4:04 AM, lizhijian wrote:

paste the call trace

(gdb) bt
#0  0xc036a02c in armv7m_nvic_neg_prio_requested (opaque=0x0, 
secure=false) at ../hw/intc/armv7m_nvic.c:406
#1  0xc014dcf4 in arm_v7m_mmu_idx_for_secstate_and_priv 
(env=0xca23d950, secstate=false, priv=true) at ../target/arm/m_helper.c:2837
#2  0xc014dd8c in arm_v7m_mmu_idx_for_secstate (env=0xca23d950, 
secstate=false) at ../target/arm/m_helper.c:2848
#3  0xc018aa6c in arm_mmu_idx_el (env=0xca23d950, el=1) at 
../target/arm/helper.c:12841
#4  0xc018b788 in rebuild_hflags_internal (env=0xca23d950) at 
../target/arm/helper.c:13100
#5  0xc018b80c in arm_rebuild_hflags (env=0xca23d950) at 
../target/arm/helper.c:13113
#6  0xc007f928 in cpu_post_load (opaque=0xca233b10, version_id=22) 
at ../target/arm/machine.c:767
#7  0xbfc8f508 in vmstate_load_state (f=0xca355520, vmsd=0xc0d59ea8 
, opaque=0xca233b10, version_id=22) at 
../migration/vmstate.c:168
#8  0xbfca3404 in vmstate_load (f=0xca355520, se=0xca2708b0) at 
../migration/savevm.c:885
#9  0xbfca6410 in qemu_loadvm_section_start_full (f=0xca355520, 
mis=0xca204d90) at ../migration/savevm.c:2396
#10 0xbfca6a8c in qemu_loadvm_state_main (f=0xca355520, 
mis=0xca204d90) at ../migration/savevm.c:2582
#11 0xbfca6c34 in qemu_loadvm_state (f=0xca355520) at 
../migration/savevm.c:2661
#12 0xbfd95bf0 in process_incoming_migration_co (opaque=0x0) at 
../migration/migration.c:522
#13 0xc06c6248 in coroutine_trampoline (i0=-895198224, i1=43690) at 
../util/coroutine-ucontext.c:173
#14 0xa5071f90 in __startcontext () at 
../sysdeps/unix/sysv/linux/aarch64/setcontext.S:123


** Information type changed from Public to Public Security












Re: [PATCH 3/4] net/colo-compare.c: Add secondary old packet detection

2020-09-23 Thread Li Zhijian




On 9/23/20 2:47 PM, Zhang, Chen wrote:



-Original Message-
From: Li Zhijian 
Sent: Tuesday, September 22, 2020 2:47 PM
To: Zhang, Chen ; Jason Wang
; qemu-dev 
Cc: Zhang Chen 
Subject: Re: [PATCH 3/4] net/colo-compare.c: Add secondary old packet
detection



On 9/18/20 5:22 PM, Zhang Chen wrote:

From: Zhang Chen 

Detect queued secondary packet to sync VM state in time.

Signed-off-by: Zhang Chen 
---
   net/colo-compare.c | 25 -
   1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c index
3b72309d08..f7271b976f 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -641,19 +641,26 @@ void colo_compare_unregister_notifier(Notifier

*notify)

   static int colo_old_packet_check_one_conn(Connection *conn,
 CompareState *s)
   {
-GList *result = NULL;
-
-result = g_queue_find_custom(>primary_list,
- >compare_timeout,
- (GCompareFunc)colo_old_packet_check_one);
+if (!g_queue_is_empty(>primary_list)) {

Looks we don't need to check is_empty

Re-checked glib code, it just checked the queue rather than inside content.
Maybe check empty before that will benefit performance.

Yeah,  you are right

Reviewed-by: Li Zhijian 

Thank




Thanks
Zhang Chen


+if (g_queue_find_custom(>primary_list,
+>compare_timeout,
+(GCompareFunc)colo_old_packet_check_one))
+goto out;
+}

-if (result) {
-/* Do checkpoint will flush old packet */
-colo_compare_inconsistency_notify(s);
-return 0;
+if (!g_queue_is_empty(>secondary_list)) {

Ditto

Thanks

+if (g_queue_find_custom(>secondary_list,
+>compare_timeout,
+(GCompareFunc)colo_old_packet_check_one))
+goto out;
   }

   return 1;
+
+out:
+/* Do checkpoint will flush old packet */
+colo_compare_inconsistency_notify(s);
+return 0;
   }

   /*












Re: [PATCH 1/3] colo-compare: return -1 if no packet is queued

2020-09-23 Thread Li Zhijian




On 9/23/20 9:41 AM, Zhang, Chen wrote:



-Original Message-
From: Li Zhijian 
Sent: Tuesday, September 22, 2020 5:55 PM
To: Zhang, Chen ; jasow...@redhat.com
Cc: qemu-devel@nongnu.org; Li Zhijian 
Subject: [PATCH 1/3] colo-compare: return -1 if no packet is queued

Return 0 will trigger a packet comparison


Yes, we need active trigger a checkpoint to flush all the queued packets here.

Previously, no new checkpoint would be triggered, since no new packet was queued
even though colo_compare_connection() was called.
Actually, we should send a notification to the COLO framework immediately; there is
no need to compare them any more, which reduces latency.

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 3a45d64175..23092e4496 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -285,10 +285,13 @@ static int packet_enqueue(CompareState *s, int mode, 
Connection **con)
 }

 if (!ret) {
+    /* queue is too long, do a checkpoint to release all queued packets */
+    colo_compare_inconsistency_notify(s);
 trace_colo_compare_drop_packet(colo_mode[mode],
 "queue size too big, drop packet");
 packet_destroy(pkt, NULL);
 pkt = NULL;
+    return -1;
 }

 *con = conn;



Otherwise, we would have to drop all the packets from this point until the next checkpoint.
So, I think original logic is a better choice.

Thanks
Zhang Chen


Signed-off-by: Li Zhijian 
---
  net/colo-compare.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/colo-compare.c b/net/colo-compare.c index
3a45d64175..039b515611 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -289,6 +289,7 @@ static int packet_enqueue(CompareState *s, int
mode, Connection **con)
  "queue size too big, drop packet");
  packet_destroy(pkt, NULL);
  pkt = NULL;
+return -1;
  }

  *con = conn;
--
2.28.0












Re: [PATCH v2 2/3] Reduce the time of checkpoint for COLO

2020-09-22 Thread Li Zhijian




On 9/22/20 5:24 PM, leirao wrote:

we should set ram_bulk_stage to false after ram_state_init,
otherwise the bitmap will be unused in migration_bitmap_find_dirty.
all pages in ram cache will be flushed to the ram of secondary guest
for each checkpoint.

Signed-off-by: leirao 
---
  migration/ram.c | 14 +-
  1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 76d4fee..59ff0cf 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3018,6 +3018,18 @@ static void decompress_data_with_multi_threads(QEMUFile 
*f,
  qemu_mutex_unlock(_done_lock);
  }
  
+ /*

+  * we must set ram_bulk_stage to fasle, otherwise in

a typo: s/fasle/false

Reviewed-by: Li Zhijian 



+  * migation_bitmap_find_dirty the bitmap will be unused and
+  * all the pages in ram cache wil be flushed to the ram of
+  * secondary VM.
+  */
+static void colo_init_ram_state(void)
+{
+ram_state_init(_state);
+ram_state->ram_bulk_stage = false;
+}
+
  /*
   * colo cache: this is for secondary VM, we cache the whole
   * memory of the secondary VM, it is need to hold the global lock
@@ -3061,7 +3073,7 @@ int colo_init_ram_cache(void)
  }
  }
  
-ram_state_init(_state);

+colo_init_ram_state();
  return 0;
  }
  







[PATCH 3/3] colo-compare: check mark in mutual exclusion

2020-09-22 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 net/colo-compare.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 039b515611..19633fc684 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -481,13 +481,11 @@ sec:
 colo_release_primary_pkt(s, ppkt);
 g_queue_push_head(>secondary_list, spkt);
 goto pri;
-}
-if (mark == COLO_COMPARE_FREE_SECONDARY) {
+} else if (mark == COLO_COMPARE_FREE_SECONDARY) {
 conn->compare_seq = spkt->seq_end;
 packet_destroy(spkt, NULL);
 goto sec;
-}
-if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) 
{
+} else if (mark == (COLO_COMPARE_FREE_PRIMARY | 
COLO_COMPARE_FREE_SECONDARY)) {
 conn->compare_seq = ppkt->seq_end;
 colo_release_primary_pkt(s, ppkt);
 packet_destroy(spkt, NULL);
-- 
2.28.0






[PATCH 1/3] colo-compare: return -1 if no packet is queued

2020-09-22 Thread Li Zhijian
Returning 0 will trigger a packet comparison

Signed-off-by: Li Zhijian 
---
 net/colo-compare.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 3a45d64175..039b515611 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -289,6 +289,7 @@ static int packet_enqueue(CompareState *s, int mode, 
Connection **con)
 "queue size too big, drop packet");
 packet_destroy(pkt, NULL);
 pkt = NULL;
+return -1;
 }
 
 *con = conn;
-- 
2.28.0






[PATCH 0/3] colo-compare: minor fixes

2020-09-22 Thread Li Zhijian


Li Zhijian (3):
  colo-compare: return -1 if no packet is enqueued
  colo-compare: fix missing compare_seq init
  colo-compare: check mark in mutual exclusion

 net/colo-compare.c | 7 +++
 net/colo.c | 5 +
 2 files changed, 4 insertions(+), 8 deletions(-)

-- 
2.28.0






[PATCH 2/3] colo-compare: fix missing compare_seq initialization

2020-09-22 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 net/colo.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/colo.c b/net/colo.c
index a6c66d829a..ef00609848 100644
--- a/net/colo.c
+++ b/net/colo.c
@@ -133,14 +133,11 @@ void reverse_connection_key(ConnectionKey *key)
 
 Connection *connection_new(ConnectionKey *key)
 {
-Connection *conn = g_slice_new(Connection);
+Connection *conn = g_slice_new0(Connection);
 
 conn->ip_proto = key->ip_proto;
 conn->processing = false;
-conn->offset = 0;
 conn->tcp_state = TCPS_CLOSED;
-conn->pack = 0;
-conn->sack = 0;
 g_queue_init(>primary_list);
 g_queue_init(>secondary_list);
 
-- 
2.28.0






Re: [PATCH 2/3] Reduce the time of checkpoint for COLO

2020-09-22 Thread Li Zhijian




On 9/19/20 11:10 AM, leirao wrote:

we should set ram_bulk_stage to false after ram_state_init,
otherwise the bitmap will be unused in migration_bitmap_find_dirty.
all pages in ram cache will be flushed to the ram of secondary guest
for each checkpoint.

Signed-off-by: leirao 
---
  migration/ram.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 76d4fee..6a2b6c1 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3019,6 +3019,17 @@ static void decompress_data_with_multi_threads(QEMUFile 
*f,
  }
  
  /*

+ * we must set ram_bulk_stage to fasle, otherwise in
+ * migation_bitmap_find_dirty the bitmap will be unused and
+ * all the pages in ram cache wil be flushed to the ram of
+ * secondary VM.
+ */
+static void colo_set_ram_state(RAMState *rsp)

this function name is too general, how about

colo_init_ram_state(ram_state)
{
ram_state_init(_state);
ram_state->ram_bulk_stage = false;
}

Thanks
Zhijian


+{
+rsp->ram_bulk_stage = false;
+}
+
+/*
   * colo cache: this is for secondary VM, we cache the whole
   * memory of the secondary VM, it is need to hold the global lock
   * to call this helper.
@@ -3062,6 +3073,7 @@ int colo_init_ram_cache(void)
  }
  
  ram_state_init(_state);

+colo_set_ram_state(ram_state);
  return 0;
  }
  







Re: [PATCH 1/3] Optimize seq_sorter function for colo-compare

2020-09-22 Thread Li Zhijian




On 9/19/20 11:10 AM, leirao wrote:

The seq of tcp has been filled in fill_pkt_tcp_info, it
can be used directly here.

Signed-off-by: leirao 

Reviewed-by: Li Zhijian 



---
  net/colo-compare.c | 6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 3a45d64..86980ce 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -196,11 +196,7 @@ static void colo_compare_inconsistency_notify(CompareState 
*s)
  
  static gint seq_sorter(Packet *a, Packet *b, gpointer data)

  {
-struct tcp_hdr *atcp, *btcp;
-
-atcp = (struct tcp_hdr *)(a->transport_header);
-btcp = (struct tcp_hdr *)(b->transport_header);
-return ntohl(atcp->th_seq) - ntohl(btcp->th_seq);
+return a->tcp_seq - b->tcp_seq;
  }
  
  static void fill_pkt_tcp_info(void *data, uint32_t *max_ack)







Re: [PATCH 3/4] net/colo-compare.c: Add secondary old packet detection

2020-09-22 Thread Li Zhijian




On 9/18/20 5:22 PM, Zhang Chen wrote:

From: Zhang Chen 

Detect queued secondary packet to sync VM state in time.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 25 -
  1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 3b72309d08..f7271b976f 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -641,19 +641,26 @@ void colo_compare_unregister_notifier(Notifier *notify)
  static int colo_old_packet_check_one_conn(Connection *conn,
CompareState *s)
  {
-GList *result = NULL;
-
-result = g_queue_find_custom(>primary_list,
- >compare_timeout,
- (GCompareFunc)colo_old_packet_check_one);
+if (!g_queue_is_empty(>primary_list)) {

Looks we don't need to check is_empty


+if (g_queue_find_custom(>primary_list,
+>compare_timeout,
+(GCompareFunc)colo_old_packet_check_one))
+goto out;
+}
  
-if (result) {

-/* Do checkpoint will flush old packet */
-colo_compare_inconsistency_notify(s);
-return 0;
+if (!g_queue_is_empty(>secondary_list)) {

Ditto

Thanks

+if (g_queue_find_custom(>secondary_list,
+>compare_timeout,
+(GCompareFunc)colo_old_packet_check_one))
+goto out;
  }
  
  return 1;

+
+out:
+/* Do checkpoint will flush old packet */
+colo_compare_inconsistency_notify(s);
+return 0;
  }
  
  /*







Re: [PATCH 2/4] net/colo-compare.c: Change the timer clock type

2020-09-22 Thread Li Zhijian




On 9/18/20 5:22 PM, Zhang Chen wrote:

From: Zhang Chen 

The virtual clock only runs during the emulation. It stops
when the virtual machine is stopped.
The host clock should be used for device models that emulate accurate
real time sources. It will continue to run when the virtual machine
is suspended. COLO need to know the host time here.

Reported-by: Derek Su 
Signed-off-by: Zhang Chen 

Reviewed-by: Li Zhijian 



---
  net/colo-compare.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 7cba573dae..3b72309d08 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -905,7 +905,7 @@ static void check_old_packet_regular(void *opaque)
  
  /* if have old packet we will notify checkpoint */

  colo_old_packet_check(s);
-timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_HOST) +
s->expired_scan_cycle);
  }
  
@@ -939,10 +939,10 @@ static void colo_compare_timer_init(CompareState *s)

  {
  AioContext *ctx = iothread_get_aio_context(s->iothread);
  
-s->packet_check_timer = aio_timer_new(ctx, QEMU_CLOCK_VIRTUAL,

+s->packet_check_timer = aio_timer_new(ctx, QEMU_CLOCK_HOST,
  SCALE_MS, check_old_packet_regular,
  s);
-timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_HOST) +
s->expired_scan_cycle);
  }
  







Re: [PATCH 1/4] net/colo-compare.c: Fix compare_timeout format issue

2020-09-22 Thread Li Zhijian




On 9/18/20 5:22 PM, Zhang Chen wrote:

From: Zhang Chen 

This parameter needs to be compared with the return value of qemu_clock_get_ms(),
which is uint64_t. So we need to fix this issue here.

Reported-by: Derek Su 
Signed-off-by: Zhang Chen 

Reviewed-by: Li Zhijian 


---
  net/colo-compare.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 3a45d64175..7cba573dae 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -120,7 +120,7 @@ struct CompareState {
  SendCo out_sendco;
  SendCo notify_sendco;
  bool vnet_hdr;
-uint32_t compare_timeout;
+uint64_t compare_timeout;
  uint32_t expired_scan_cycle;
  
  /*

@@ -1081,9 +1081,9 @@ static void compare_get_timeout(Object *obj, Visitor *v,
  Error **errp)
  {
  CompareState *s = COLO_COMPARE(obj);
-uint32_t value = s->compare_timeout;
+uint64_t value = s->compare_timeout;
  
-visit_type_uint32(v, name, , errp);

+visit_type_uint64(v, name, , errp);
  }
  
  static void compare_set_timeout(Object *obj, Visitor *v,

@@ -1146,9 +1146,9 @@ static void set_max_queue_size(Object *obj, Visitor *v,
 Error **errp)
  {
  Error *local_err = NULL;
-uint32_t value;
+uint64_t value;
  
-visit_type_uint32(v, name, , _err);

+visit_type_uint64(v, name, , _err);
  if (local_err) {
  goto out;
  }
@@ -1396,7 +1396,7 @@ static void colo_compare_init(Object *obj)
  object_property_add_str(obj, "notify_dev",
  compare_get_notify_dev, compare_set_notify_dev);
  
-object_property_add(obj, "compare_timeout", "uint32",

+object_property_add(obj, "compare_timeout", "uint64",
  compare_get_timeout,
  compare_set_timeout, NULL, NULL);
  







[PATCH v3] virtio-gpu: fix unmap the already mapped items

2020-08-26 Thread Li Zhijian
We get here when either (!(*iov)[i].iov_base) or (len != l), so we also need
to unmap the 'i'th item when the 'i'th item is not NULL.

CC: Li Qiang 
Signed-off-by: Li Zhijian 
---
v2: address Gerd's comments
v3: leave (*iov)[i].iov_len as the real mapped len (Li Qiang)
---
 hw/display/virtio-gpu.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c150..90be4e3ed7 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -646,9 +646,9 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
 uint64_t a = le64_to_cpu(ents[i].addr);
 uint32_t l = le32_to_cpu(ents[i].length);
 hwaddr len = l;
-(*iov)[i].iov_len = l;
 (*iov)[i].iov_base = dma_memory_map(VIRTIO_DEVICE(g)->dma_as,
 a, , DMA_DIRECTION_TO_DEVICE);
+(*iov)[i].iov_len = len;
 if (addr) {
 (*addr)[i] = a;
 }
@@ -656,6 +656,9 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
 qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to map MMIO memory for"
   " resource %d element %d\n",
   __func__, ab->resource_id, i);
+if ((*iov)[i].iov_base) {
+i++; /* cleanup the 'i'th map */
+}
 virtio_gpu_cleanup_mapping_iov(g, *iov, i);
 g_free(ents);
 *iov = NULL;
-- 
2.28.0






Re: [PATCH v2] virtio-gpu: fix unmap the already mapped items

2020-08-26 Thread Li Zhijian




On 8/26/20 10:54 PM, Li Qiang wrote:

Li Zhijian  于2020年8月21日周五 下午7:34写道:

we go here either (!(*iov)[i].iov_base) or (len != l), so we need to consider
to unmap the 'i'th item as well when the 'i'th item is not nil

Signed-off-by: Li Zhijian 

---
v2: address Gerd's comments
---
  hw/display/virtio-gpu.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c150..e93f99932a 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -656,6 +656,9 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
  qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to map MMIO memory for"
" resource %d element %d\n",
__func__, ab->resource_id, i);
+if ((*iov)[i].iov_base) {
+i++; /* cleanup the 'i'th map */

Should we also reset (*iov)[i].iov_len to 'len' so the
dma_memory_unmap has the right size?

Indeed, good catch, thanks.





Thanks,
Li Qiang


+}
  virtio_gpu_cleanup_mapping_iov(g, *iov, i);
  g_free(ents);
  *iov = NULL;
--
2.17.1













[PATCH v2] virtio-gpu: fix unmap the already mapped items

2020-08-21 Thread Li Zhijian
We reach this point when either (!(*iov)[i].iov_base) or (len != l) holds, so we
also need to unmap the 'i'th item when the 'i'th item is not NULL.

Signed-off-by: Li Zhijian 

---
v2: address Gerd's comments
---
 hw/display/virtio-gpu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c150..e93f99932a 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -656,6 +656,9 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
 qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to map MMIO memory for"
   " resource %d element %d\n",
   __func__, ab->resource_id, i);
+if ((*iov)[i].iov_base) {
+i++; /* cleanup the 'i'th map */
+}
 virtio_gpu_cleanup_mapping_iov(g, *iov, i);
 g_free(ents);
 *iov = NULL;
-- 
2.17.1






Re: [PATCH] virtio-gpu: fix unmap the already mapped items

2020-08-21 Thread Li Zhijian




On 8/21/20 6:07 PM, Gerd Hoffmann wrote:

On Fri, Aug 21, 2020 at 04:49:45PM +0800, Li Zhijian wrote:

we go here either (!(*iov)[i].iov_base) or (len != l), so we need to consider
to unmap the 'i'th item as well when the 'i'th item is not nil

Signed-off-by: Li Zhijian 
---
  hw/display/virtio-gpu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c150..1f777e43ff 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -656,7 +656,7 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
  qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to map MMIO memory for"
" resource %d element %d\n",
__func__, ab->resource_id, i);
-virtio_gpu_cleanup_mapping_iov(g, *iov, i);
+virtio_gpu_cleanup_mapping_iov(g, *iov, i + !!(*iov)[i].iov_base);

Cute trick, but the code should be readable without having to dig out
the commit message which explains it.  Can we have something simpler
along the lines of "if (iov_base) { i++; /* cleanup partial map */ }"
please?

Makes sense!

Thanks
Zhijian



thanks,
   Gerd










[PATCH] virtio-gpu: fix unmap the already mapped items

2020-08-21 Thread Li Zhijian
We reach this point when either (!(*iov)[i].iov_base) or (len != l) holds, so we
also need to unmap the 'i'th item when the 'i'th item is not NULL.

Signed-off-by: Li Zhijian 
---
 hw/display/virtio-gpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c150..1f777e43ff 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -656,7 +656,7 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
 qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to map MMIO memory for"
   " resource %d element %d\n",
   __func__, ab->resource_id, i);
-virtio_gpu_cleanup_mapping_iov(g, *iov, i);
+virtio_gpu_cleanup_mapping_iov(g, *iov, i + !!(*iov)[i].iov_base);
 g_free(ents);
 *iov = NULL;
 if (addr) {
-- 
2.17.1






[PATCH] .gitignore: add virtiofsd binary

2020-08-21 Thread Li Zhijian
Signed-off-by: Li Zhijian 
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2992d15931..28729241f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@
 /qemu-ga
 /qemu-bridge-helper
 /qemu-keymap
+/virtiofsd
 /qemu-monitor.texi
 /qemu-monitor-info.texi
 /qemu-storage-daemon
-- 
2.17.1






Re: [Qemu-devel] [PATCH 4/6] COLO-compare: Add colo-compare remote notify support

2019-06-02 Thread Li Zhijian

How about doing the switch inside colo_compare_inconsistency_notify(), like:

colo_compare_inconsistency_notify(CompareState *s)
{
if (s->remote_notify)
remote_notify
else
local_notify
}

Thanks
Zhijian

On 6/2/19 11:42 AM, Zhang Chen wrote:

From: Zhang Chen 

This patch lets colo-compare send a message to the remote COLO frame (Xen) when
a checkpoint occurs.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 51 +-
  1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 16285f4a96..19075c7a66 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -251,6 +251,17 @@ static void colo_release_primary_pkt(CompareState *s, 
Packet *pkt)
  packet_destroy(pkt, NULL);
  }
  
+static void notify_remote_frame(CompareState *s)

+{
+char msg[] = "DO_CHECKPOINT";
+int ret = 0;
+
+ret = compare_chr_send(s, (uint8_t *)msg, strlen(msg), 0, true);
+if (ret < 0) {
+error_report("Notify Xen COLO-frame failed");
+}
+}
+
  /*
   * The IP packets sent by primary and secondary
   * will be compared in here
@@ -435,7 +446,11 @@ sec:
  qemu_hexdump((char *)spkt->data, stderr,
   "colo-compare spkt", spkt->size);
  
-colo_compare_inconsistency_notify();

+if (s->notify_dev) {
+notify_remote_frame(s);
+} else {
+colo_compare_inconsistency_notify();
+}
  }
  }
  
@@ -577,7 +592,7 @@ void colo_compare_unregister_notifier(Notifier *notify)

  }
  
  static int colo_old_packet_check_one_conn(Connection *conn,

-   void *user_data)
+  CompareState *s)
  {
  GList *result = NULL;
  int64_t check_time = REGULAR_PACKET_CHECK_MS;
@@ -588,7 +603,11 @@ static int colo_old_packet_check_one_conn(Connection *conn,
  
  if (result) {

  /* Do checkpoint will flush old packet */
-colo_compare_inconsistency_notify();
+if (s->notify_dev) {
+notify_remote_frame(s);
+} else {
+colo_compare_inconsistency_notify();
+}
  return 0;
  }
  
@@ -608,7 +627,7 @@ static void colo_old_packet_check(void *opaque)

   * If we find one old packet, stop finding job and notify
   * COLO frame do checkpoint.
   */
-g_queue_find_custom(>conn_list, NULL,
+g_queue_find_custom(>conn_list, s,
  (GCompareFunc)colo_old_packet_check_one_conn);
  }
  
@@ -637,7 +656,12 @@ static void colo_compare_packet(CompareState *s, Connection *conn,

   */
  trace_colo_compare_main("packet different");
  g_queue_push_head(>primary_list, pkt);
-colo_compare_inconsistency_notify();
+
+if (s->notify_dev) {
+notify_remote_frame(s);
+} else {
+colo_compare_inconsistency_notify();
+}
  break;
  }
  }
@@ -989,7 +1013,24 @@ static void compare_sec_rs_finalize(SocketReadState 
*sec_rs)
  
  static void compare_notify_rs_finalize(SocketReadState *notify_rs)

  {
+CompareState *s = container_of(notify_rs, CompareState, notify_rs);
+
  /* Get Xen colo-frame's notify and handle the message */
+char *data = g_memdup(notify_rs->buf, notify_rs->packet_len);
+char msg[] = "COLO_COMPARE_GET_XEN_INIT";
+int ret;
+
+if (!strcmp(data, "COLO_USERSPACE_PROXY_INIT")) {
+ret = compare_chr_send(s, (uint8_t *)msg, strlen(msg), 0, true);
+if (ret < 0) {
+error_report("Notify Xen COLO-frame INIT failed");
+}
+}
+
+if (!strcmp(data, "COLO_CHECKPOINT")) {
+/* colo-compare do checkpoint, flush pri packet and remove sec packet 
*/
+g_queue_foreach(>conn_list, colo_flush_packets, s);
+}
  }
  
  /*





[Qemu-devel] [PATCH v2 1/2] Acceptance tests: use linux-3.6 and set vm memory to 4GiB

2019-01-27 Thread Li Zhijian
QEMU already supports loading an initrd of up to 4G if the specified memory is
sufficient and XLF_CAN_BE_LOADED_ABOVE_4G is set by the guest kernel.

The linux-3.6 kernel shipped by Fedora-18 does not support xloadflags, so it
cannot support loading an initrd of more than 2GiB.

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 

---
V2: fix typos
---
 tests/acceptance/linux_initrd.py | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index 737355c..b283715 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -24,14 +24,16 @@ class LinuxInitrd(Test):
 
 timeout = 60
 
-def test_with_2gib_file_should_exit_error_msg(self):
+def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):
 """
 Pretends to boot QEMU with an initrd file with size of 2GiB
 and expect it exits with error message.
+Fedora-18 shipped with linux-3.6 which have not supported xloadflags
+cannot support more than 2GiB initrd.
 """
-kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
-  'Everything/x86_64/os/images/pxeboot/vmlinuz')
-kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_url = 
('https://archives.fedoraproject.org/pub/archive/fedora/li'
+  
'nux/releases/18/Fedora/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '41464f68efe42b9991250bed86c7081d2ccdbb21'
 kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
 max_size = 2 * (1024 ** 3) - 1
 
@@ -39,8 +41,8 @@ class LinuxInitrd(Test):
 initrd.seek(max_size)
 initrd.write(b'\0')
 initrd.flush()
-cmd = "%s -kernel %s -initrd %s" % (self.qemu_bin, kernel_path,
-initrd.name)
+cmd = "%s -kernel %s -initrd %s -m 4096" % (
+  self.qemu_bin, kernel_path, initrd.name)
 res = run(cmd, ignore_status=True)
 self.assertEqual(res.exit_status, 1)
 expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (
-- 
2.7.4




[Qemu-devel] [PATCH v2 2/2] Acceptance tests: expect boot to extract 2GiB+ initrd with linux-v4.16

2019-01-27 Thread Li Zhijian
XLF_CAN_BE_LOADED_ABOVE_4G is set on the vmlinuz shipped by Fedora-28, so the
initrd is allowed to be loaded below the 4 GB address.

The timeout is also raised to 5 minutes, since we need more time to load a
large initrd into the guest.

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 

---
V2: fix typos
---
 tests/acceptance/linux_initrd.py | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index b283715..29f5b06 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -8,6 +8,7 @@
 # This work is licensed under the terms of the GNU GPL, version 2 or
 # later.  See the COPYING file in the top-level directory.
 
+import logging
 import tempfile
 from avocado.utils.process import run
 
@@ -22,7 +23,7 @@ class LinuxInitrd(Test):
 :avocado: tags=x86_64
 """
 
-timeout = 60
+timeout = 300
 
 def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):
 """
@@ -48,3 +49,37 @@ class LinuxInitrd(Test):
 expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (
 max_size + 1)
 self.assertRegex(res.stderr_text, expected_msg)
+
+def test_with_2gib_file_should_work_with_linux_v4_16(self):
+"""
+QEMU has supported up to 4 GiB initrd for recent kernel
+Expect guest can reach 'Unpacking initramfs...'
+"""
+kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
+  'Everything/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
+max_size = 2 * (1024 ** 3) + 1
+
+with tempfile.NamedTemporaryFile() as initrd:
+initrd.seek(max_size)
+initrd.write(b'\0')
+initrd.flush()
+
+self.vm.set_machine('pc')
+self.vm.set_console()
+kernel_command_line = 'console=ttyS0'
+self.vm.add_args('-kernel', kernel_path,
+ '-append', kernel_command_line,
+ '-initrd', initrd.name,
+ '-m', '5120')
+self.vm.launch()
+console = self.vm.console_socket.makefile()
+console_logger = logging.getLogger('console')
+while True:
+msg = console.readline()
+console_logger.debug(msg.strip())
+if 'Unpacking initramfs...' in msg:
+break
+if 'Kernel panic - not syncing' in msg:
+self.fail("Kernel panic reached")
-- 
2.7.4




Re: [Qemu-devel] [PATCH 2/2] Acceptance tests: add support more than 2GiB initrd test with linux-v4.19

2019-01-18 Thread Li Zhijian

Sorry, there are some typos; please ignore this version.
I will correct them in V2.
 
s/v4.19/v4.16 on subject.


On 1/18/19 18:02, Li Zhijian wrote:

timeout is updated to 5mins since we need more time to load a
large initrd to the guest

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 
---
  tests/acceptance/linux_initrd.py | 37 -
  1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index aeb9fde..6f1ee1f 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -8,6 +8,7 @@
  # This work is licensed under the terms of the GNU GPL, version 2 or
  # later.  See the COPYING file in the top-level directory.
  
+import logging

  import tempfile
  from avocado.utils.process import run
  
@@ -22,7 +23,7 @@ class LinuxInitrd(Test):

  :avocado: tags=x86_64
  """
  
-timeout = 60

+timeout = 300
  
  def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):

  """
@@ -48,3 +49,37 @@ class LinuxInitrd(Test):
  expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (
  max_size + 1)
  self.assertRegex(res.stderr_text, expected_msg)
+
+def test_with_2gib_file_should_work_with_linux_v4_18(self):


s/v4_18/v4_16


Thanks



+"""
+since linux header introduced xloadflags which can tell bootloader
+whether initrd can be loaded into above 4G address.
+"""
+kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
+  'Everything/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
+max_size = 2 * (1024 ** 3) - 1
+
+with tempfile.NamedTemporaryFile() as initrd:
+initrd.seek(max_size)
+initrd.write(b'\0')
+initrd.flush()
+
+self.vm.set_machine('pc')
+self.vm.set_console()
+kernel_command_line = 'console=ttyS0'
+self.vm.add_args('-kernel', kernel_path,
+ '-append', kernel_command_line,
+ '-initrd', initrd.name,
+ '-m', '5120')
+self.vm.launch()
+console = self.vm.console_socket.makefile()
+console_logger = logging.getLogger('console')
+while True:
+msg = console.readline()
+console_logger.debug(msg.strip())
+if 'Unpacking initramfs...' in msg:
+break
+if 'Kernel panic - not syncing' in msg:
+self.fail("Kernel panic reached")






Re: [Qemu-devel] [PATCH 1/2] Acceptance tests: use linux-3.6 and set vm memory to 4GiB

2019-01-18 Thread Li Zhijian

Sorry, there are some typos; please ignore this version.
I will correct them in V2.


On 1/18/19 18:02, Li Zhijian wrote:

linux-3.6 kernel shipped by Fedora-19 cannot support xldflags so that it


s/Fedora-19/Fedora-18



cannot support loading more than 2GiB initrd

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 

---
this patch is basing on early Acceptance tests:
https://patchwork.kernel.org/patch/10676415/
---
  tests/acceptance/linux_initrd.py | 14 --
  1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index 737355c..aeb9fde 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -24,14 +24,16 @@ class LinuxInitrd(Test):
  
  timeout = 60
  
-def test_with_2gib_file_should_exit_error_msg(self):

+def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):
  """
  Pretends to boot QEMU with an initrd file with size of 2GiB
  and expect it exits with error message.
+Fedora-19 shipped with linux-3.6 which have not supported xloadflags


Ditto

Thanks

 


+cannot support more than 2GiB initrd.
  """
-kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
-  'Everything/x86_64/os/images/pxeboot/vmlinuz')
-kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_url = 
('https://archives.fedoraproject.org/pub/archive/fedora/li'
+  
'nux/releases/18/Fedora/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '41464f68efe42b9991250bed86c7081d2ccdbb21'
  kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
  max_size = 2 * (1024 ** 3) - 1
  
@@ -39,8 +41,8 @@ class LinuxInitrd(Test):

  initrd.seek(max_size)
  initrd.write(b'\0')
  initrd.flush()
-cmd = "%s -kernel %s -initrd %s" % (self.qemu_bin, kernel_path,
-initrd.name)
+cmd = "%s -kernel %s -initrd %s -m 4096" % (
+  self.qemu_bin, kernel_path, initrd.name)
  res = run(cmd, ignore_status=True)
  self.assertEqual(res.exit_status, 1)
  expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (






[Qemu-devel] [PATCH 1/2] Acceptance tests: use linux-3.6 and set vm memory to 4GiB

2019-01-18 Thread Li Zhijian
linux-3.6 kernel shipped by Fedora-19 cannot support xldflags so that it
cannot support loading more than 2GiB initrd

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 

---
this patch is basing on early Acceptance tests:
https://patchwork.kernel.org/patch/10676415/
---
 tests/acceptance/linux_initrd.py | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index 737355c..aeb9fde 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -24,14 +24,16 @@ class LinuxInitrd(Test):
 
 timeout = 60
 
-def test_with_2gib_file_should_exit_error_msg(self):
+def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):
 """
 Pretends to boot QEMU with an initrd file with size of 2GiB
 and expect it exits with error message.
+Fedora-19 shipped with linux-3.6 which have not supported xloadflags
+cannot support more than 2GiB initrd.
 """
-kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
-  'Everything/x86_64/os/images/pxeboot/vmlinuz')
-kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_url = 
('https://archives.fedoraproject.org/pub/archive/fedora/li'
+  
'nux/releases/18/Fedora/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '41464f68efe42b9991250bed86c7081d2ccdbb21'
 kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
 max_size = 2 * (1024 ** 3) - 1
 
@@ -39,8 +41,8 @@ class LinuxInitrd(Test):
 initrd.seek(max_size)
 initrd.write(b'\0')
 initrd.flush()
-cmd = "%s -kernel %s -initrd %s" % (self.qemu_bin, kernel_path,
-initrd.name)
+cmd = "%s -kernel %s -initrd %s -m 4096" % (
+  self.qemu_bin, kernel_path, initrd.name)
 res = run(cmd, ignore_status=True)
 self.assertEqual(res.exit_status, 1)
 expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (
-- 
2.7.4




[Qemu-devel] [PATCH 2/2] Acceptance tests: add support more than 2GiB initrd test with linux-v4.19

2019-01-18 Thread Li Zhijian
timeout is updated to 5mins since we need more time to load a
large initrd to the guest

CC: Wainer dos Santos Moschetta 
CC: Caio Carrara 
CC: Cleber Rosa 
CC: Eduardo Habkost 
CC: Philippe Mathieu-Daudé 
Signed-off-by: Li Zhijian 
---
 tests/acceptance/linux_initrd.py | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/acceptance/linux_initrd.py b/tests/acceptance/linux_initrd.py
index aeb9fde..6f1ee1f 100644
--- a/tests/acceptance/linux_initrd.py
+++ b/tests/acceptance/linux_initrd.py
@@ -8,6 +8,7 @@
 # This work is licensed under the terms of the GNU GPL, version 2 or
 # later.  See the COPYING file in the top-level directory.
 
+import logging
 import tempfile
 from avocado.utils.process import run
 
@@ -22,7 +23,7 @@ class LinuxInitrd(Test):
 :avocado: tags=x86_64
 """
 
-timeout = 60
+timeout = 300
 
 def test_with_2gib_file_should_exit_error_msg_with_linux_v3_6(self):
 """
@@ -48,3 +49,37 @@ class LinuxInitrd(Test):
 expected_msg = r'.*initrd is too large.*max: \d+, need %s.*' % (
 max_size + 1)
 self.assertRegex(res.stderr_text, expected_msg)
+
+def test_with_2gib_file_should_work_with_linux_v4_18(self):
+"""
+since linux header introduced xloadflags which can tell bootloader
+whether initrd can be loaded into above 4G address.
+"""
+kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
+  'Everything/x86_64/os/images/pxeboot/vmlinuz')
+kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
+max_size = 2 * (1024 ** 3) - 1
+
+with tempfile.NamedTemporaryFile() as initrd:
+initrd.seek(max_size)
+initrd.write(b'\0')
+initrd.flush()
+
+self.vm.set_machine('pc')
+self.vm.set_console()
+kernel_command_line = 'console=ttyS0'
+self.vm.add_args('-kernel', kernel_path,
+ '-append', kernel_command_line,
+ '-initrd', initrd.name,
+ '-m', '5120')
+self.vm.launch()
+console = self.vm.console_socket.makefile()
+console_logger = logging.getLogger('console')
+while True:
+msg = console.readline()
+console_logger.debug(msg.strip())
+if 'Unpacking initramfs...' in msg:
+break
+if 'Kernel panic - not syncing' in msg:
+self.fail("Kernel panic reached")
-- 
2.7.4




[Qemu-devel] [PATCH v6 2/4] hw/core/loader.c: Read as long as possible in load_image_size()

2019-01-17 Thread Li Zhijian
Don't assume that read(2) always reads as many bytes as it is asked to.

CC: Richard Henderson 
CC: Stefano Garzarella 
Signed-off-by: Li Zhijian 
Reviewed-by: Richard Henderson 
Reviewed-by: Stefano Garzarella 

---
V5: update subject and add reviewed-by tag (Stefano Garzarella)
V4: add reviewed-by tag
---
 hw/core/loader.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/hw/core/loader.c b/hw/core/loader.c
index c4f62fe..bf2951f 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -77,21 +77,20 @@ int64_t get_image_size(const char *filename)
 ssize_t load_image_size(const char *filename, void *addr, size_t size)
 {
 int fd;
-ssize_t actsize;
+ssize_t actsize, l = 0;
 
 fd = open(filename, O_RDONLY | O_BINARY);
 if (fd < 0) {
 return -1;
 }
 
-actsize = read(fd, addr, size);
-if (actsize < 0) {
-close(fd);
-return -1;
+while ((actsize = read(fd, addr + l, size - l)) > 0) {
+l += actsize;
 }
+
 close(fd);
 
-return actsize;
+return actsize < 0 ? -1 : l;
 }
 
 /* read()-like version */
-- 
2.7.4




[Qemu-devel] [PATCH v6 1/4] unify len and addr type for memory/address APIs

2019-01-17 Thread Li Zhijian
Some address/memory APIs use different types for 'hwaddr/target_ulong addr'
and 'int len'. This is very unsafe, especially because some of these APIs are
passed a non-int len by their callers, which might silently overflow.
Below is a potential overflow case:
dma_memory_read(uint32_t len)
  -> dma_memory_rw(uint32_t len)
-> dma_memory_rw_relaxed(uint32_t len)
  -> address_space_rw(int len) # len overflow

CC: Paolo Bonzini 
CC: Peter Crosthwaite 
CC: Richard Henderson 
CC: Peter Maydell 
CC: Stefano Garzarella 
Signed-off-by: Li Zhijian 
Reviewed-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Stefano Garzarella 

---
V6: Fix "WARNING: line over 80 characters"
V5: Fix typo and Reviewed-tag (Stefano Garzarella)
V4: minor fix at commit message and add Reviewed-by tag
V3: use the same type between len and addr(Peter Maydell)
rebase code basing on 
https://patchew.org/QEMU/20181122133507.30950-1-peter.mayd...@linaro.org/

Signed-off-by: Li Zhijian 
---
 exec.c| 47 +++
 include/exec/cpu-all.h|  2 +-
 include/exec/cpu-common.h |  8 
 include/exec/memory.h | 22 +++---
 4 files changed, 39 insertions(+), 40 deletions(-)

diff --git a/exec.c b/exec.c
index 895449f..9697e63 100644
--- a/exec.c
+++ b/exec.c
@@ -2849,10 +2849,10 @@ static const MemoryRegionOps watch_mem_ops = {
 };
 
 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
-  MemTxAttrs attrs, uint8_t *buf, int len);
+ MemTxAttrs attrs, uint8_t *buf, hwaddr len);
 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-  const uint8_t *buf, int len);
-static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
+  const uint8_t *buf, hwaddr len);
+static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
   bool is_write, MemTxAttrs attrs);
 
 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
@@ -3100,10 +3100,10 @@ MemoryRegion *get_system_io(void)
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
-uint8_t *buf, int len, int is_write)
+uint8_t *buf, target_ulong len, int is_write)
 {
-int l, flags;
-target_ulong page;
+int flags;
+target_ulong l, page;
 void * p;
 
 while (len > 0) {
@@ -3216,7 +3216,7 @@ static bool prepare_mmio_access(MemoryRegion *mr)
 static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
MemTxAttrs attrs,
const uint8_t *buf,
-   int len, hwaddr addr1,
+   hwaddr len, hwaddr addr1,
hwaddr l, MemoryRegion *mr)
 {
 uint8_t *ptr;
@@ -3261,7 +3261,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, 
hwaddr addr,
 
 /* Called from RCU critical section.  */
 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-  const uint8_t *buf, int len)
+  const uint8_t *buf, hwaddr len)
 {
 hwaddr l;
 hwaddr addr1;
@@ -3279,7 +3279,7 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr 
addr, MemTxAttrs attrs,
 /* Called within RCU critical section.  */
 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
MemTxAttrs attrs, uint8_t *buf,
-   int len, hwaddr addr1, hwaddr l,
+   hwaddr len, hwaddr addr1, hwaddr l,
MemoryRegion *mr)
 {
 uint8_t *ptr;
@@ -3322,7 +3322,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr 
addr,
 
 /* Called from RCU critical section.  */
 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
- MemTxAttrs attrs, uint8_t *buf, int len)
+ MemTxAttrs attrs, uint8_t *buf, hwaddr len)
 {
 hwaddr l;
 hwaddr addr1;
@@ -3335,7 +3335,7 @@ static MemTxResult flatview_read(FlatView *fv, hwaddr 
addr,
 }
 
 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
-MemTxAttrs attrs, uint8_t *buf, int len)
+MemTxAttrs attrs, uint8_t *buf, hwaddr len)
 {
 MemTxResult result = MEMTX_OK;
 FlatView *fv;
@@ -3352,7 +3352,7 @@ MemTxResult address_space_read_full(AddressSpace *as, 
hwaddr addr,
 
 MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
 MemTxAttrs attrs,
-   

[Qemu-devel] [PATCH v6 3/4] i386: import & use bootparam.h

2019-01-17 Thread Li Zhijian
it's from v4.20-rc5.

CC: Stefano Garzarella 
CC: Michael S. Tsirkin 
Signed-off-by: Li Zhijian 
Reviewed-by: Michael S. Tsirkin 
Reviewed-by: Stefano Garzarella 
---
V6: Fix line over 80 characters && use double quotes for all paths (Stefano 
Garzarella)
V5: add reviewed-by tag
V4: use script to import bootparam.h (Michael S. Tsirkin)
V3: new patch

Signed-off-by: Li Zhijian 
---
 hw/i386/pc.c |  8 +--
 include/standard-headers/asm-x86/bootparam.h | 34 
 scripts/update-linux-headers.sh  |  6 +
 3 files changed, 41 insertions(+), 7 deletions(-)
 create mode 100644 include/standard-headers/asm-x86/bootparam.h

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 73d688f..64d23b2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -75,6 +75,7 @@
 #include "hw/usb.h"
 #include "hw/i386/intel_iommu.h"
 #include "hw/net/ne2000-isa.h"
+#include "standard-headers/asm-x86/bootparam.h"
 
 /* debug PC/ISA interrupts */
 //#define DEBUG_IRQ
@@ -1046,13 +1047,6 @@ static long get_file_size(FILE *f)
 return size;
 }
 
-/* setup_data types */
-#define SETUP_NONE 0
-#define SETUP_E820_EXT 1
-#define SETUP_DTB  2
-#define SETUP_PCI  3
-#define SETUP_EFI  4
-
 struct setup_data {
 uint64_t next;
 uint32_t type;
diff --git a/include/standard-headers/asm-x86/bootparam.h 
b/include/standard-headers/asm-x86/bootparam.h
new file mode 100644
index 000..67d4f01
--- /dev/null
+++ b/include/standard-headers/asm-x86/bootparam.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+/* setup_data types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+#define SETUP_DTB  2
+#define SETUP_PCI  3
+#define SETUP_EFI  4
+#define SETUP_APPLE_PROPERTIES 5
+#define SETUP_JAILHOUSE6
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK   0x07FF
+#define RAMDISK_PROMPT_FLAG0x8000
+#define RAMDISK_LOAD_FLAG  0x4000
+
+/* loadflags */
+#define LOADED_HIGH(1<<0)
+#define KASLR_FLAG (1<<1)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS  (1<<6)
+#define CAN_USE_HEAP   (1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64  (1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
+#define XLF_EFI_HANDOVER_32(1<<2)
+#define XLF_EFI_HANDOVER_64(1<<3)
+#define XLF_EFI_KEXEC  (1<<4)
+
+
+#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 0a964fe..3578cfe 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -120,6 +120,12 @@ for arch in $ARCHLIST; do
 cp "$tmpdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/"
 cp "$tmpdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/"
 cp_portable "$tmpdir/include/asm/kvm_para.h" 
"$output/include/standard-headers/asm-$arch"
+# Remove everything except the macros from bootparam.h avoiding the
+# unnecessary import of several video/ist/etc headers
+sed -e '/__ASSEMBLY__/,/__ASSEMBLY__/d' \
+   "$tmpdir/include/asm/bootparam.h" > "$tmpdir/bootparam.h"
+cp_portable "$tmpdir/bootparam.h" \
+"$output/include/standard-headers/asm-$arch"
 fi
 done
 
-- 
2.7.4




[Qemu-devel] [PATCH v6 0/4] allow to load initrd below 4G for recent kernel

2019-01-17 Thread Li Zhijian
A long time ago, the Linux kernel gained support for an initrd of up to 4G, but
its header is still hard-coded to allow loading the initrd below 2G only.
 Quoting from arch/x86/boot/header.S:
 # (Header version 0x0203 or later) the highest safe address for the contents
 # of an initrd. The current kernel allows up to 4 GB, but leave it at 2 GB to
 # avoid possible bootloader bugs.

In order to support an initrd of more than 2G, QEMU must allow loading the
initrd above the 2G address. Luckily, recent kernels introduced a new field in
the Linux header, xloadflags:XLF_CAN_BE_LOADED_ABOVE_4G, which tells the
bootloader an optional and safe address at which to load the initrd.

It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
be loaded into any address.

The default ROMs (SeaBIOS + option ROM (linuxboot_dma)) work as expected with
this patchset.

I stole some of your comments; feel free to let me know if you don't like
this.

changes:
V6: no functional changes
 - Patch 3/4: Fix line over 80 characters && use double quotes for all paths 
(Stefano Garzarella)
 - Patch 4/4: update comments

V5: add a few reviewed-tag and update 4/4 changelog and comments
V4:
  - add Reviewed-by tag to 1/4 and 2/4
  - use scripts/update-linux-headers.sh to import bootparam.h
  - minor fix at commit log
V3:
 - rebase code basing on http://patchwork.ozlabs.org/cover/1005990 and
   https://patchew.org/QEMU/20181122133507.30950-1-peter.mayd...@linaro.org
 - add new patch 3/4 to import header bootparam.h (Michael S. Tsirkin)

V2: add 2 patches(3/5, 4/5) to fix potential loading issue.


CC: Paolo Bonzini 
CC: Richard Henderson 
CC: Eduardo Habkost 
CC: "Michael S. Tsirkin" 
CC: Marcel Apfelbaum 
CC: Stefano Garzarella 
CC: Peter Crosthwaite 
CC: Peter Maydell 

Li Zhijian (4):
  unify len and addr type for memory/address APIs
  hw/core/loader.c: Read as long as possible in load_image_size()
  i386: import & use bootparam.h
  i386: allow to load initrd below 4 GB for recent linux

 exec.c   | 47 ++--
 hw/core/loader.c | 11 +++
 hw/i386/pc.c | 29 -
 include/exec/cpu-all.h   |  2 +-
 include/exec/cpu-common.h|  8 ++---
 include/exec/memory.h| 22 ++---
 include/standard-headers/asm-x86/bootparam.h | 34 
 scripts/update-linux-headers.sh  |  6 
 8 files changed, 105 insertions(+), 54 deletions(-)
 create mode 100644 include/standard-headers/asm-x86/bootparam.h

-- 
2.7.4




[Qemu-devel] [PATCH v6 4/4] i386: allow to load initrd below 4 GB for recent linux

2019-01-17 Thread Li Zhijian
Since linux commit: cf8fa920cb42 ("i386: handle an initrd in highmem (version 
2)")
linux has supported initrd up to 4 GB, but the header field
ramdisk_max is still set to 2 GB to avoid "possible bootloader bugs".

When '-kernel vmlinux -initrd initrd.cgz' is used to launch a VM,
the firmware (it could be linuxboot_dma.bin) helps to read the initrd
contents into guest memory (below ramdisk_max) and jumps to the kernel.
That is similar to what a bootloader such as GRUB does.

In addition, initrd_max is uint32_t simply because QEMU doesn't support
the 64-bit boot protocol (specifically the ext_ramdisk_image field).

Therefore here just limit initrd_max to UINT32_MAX simply as well to
allow initrd to be loaded below 4 GB.

NOTE: it's possible that linux protocol within [0x208, 0x20c]
supports up to 4 GB initrd as well.

CC: Paolo Bonzini 
CC: Richard Henderson 
CC: Eduardo Habkost 
CC: "Michael S. Tsirkin" 
CC: Marcel Apfelbaum 
Signed-off-by: Li Zhijian 

---
V6: update comments
V5: update comments and changelog
V3: correct grammar and check XLF_CAN_BE_LOADED_ABOVE_4G first (Michael S. 
Tsirkin)

Signed-off-by: Li Zhijian 
---
 hw/i386/pc.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 64d23b2..10977a3 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1130,7 +1130,26 @@ static void load_linux(PCMachineState *pcms,
 #endif
 
 /* highest address for loading the initrd */
-if (protocol >= 0x203) {
+if (protocol >= 0x20c &&
+lduw_p(header+0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
+/*
+ * Linux has supported initrd up to 4 GB for a very long time (2007,
+ * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
+ * though it only sets initrd_max to 2 GB to "work around bootloader
+ * bugs". Luckily, QEMU firmware(which does something like bootloader)
+ * has supported this.
+ *
+ * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
+ * be loaded into any address.
+ *
+ * In addition, initrd_max is uint32_t simply because QEMU doesn't
+ * support the 64-bit boot protocol (specifically the ext_ramdisk_image
+ * field).
+ *
+ * Therefore here just limit initrd_max to UINT32_MAX simply as well.
+ */
+initrd_max = UINT32_MAX;
+} else if (protocol >= 0x203) {
 initrd_max = ldl_p(header+0x22c);
 } else {
 initrd_max = 0x37ff;
-- 
2.7.4




Re: [Qemu-devel] [PATCH v5 4/4] i386: allow to load initrd below 4G for recent linux

2019-01-16 Thread Li Zhijian

Hi Michael, Eduardo

On 1/15/19 09:46, Michael S. Tsirkin wrote:

On Tue, Jan 15, 2019 at 09:35:09AM +0800, Li Zhijian wrote:

Hi Eduardo


On 1/15/19 01:53, Eduardo Habkost wrote:

 +if (protocol >= 0x20c &&
 +lduw_p(header+0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
 +/*
 + * Linux has supported initrd up to 4 GB for a very long time 
(2007,
 + * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 
2013),
 + * though it only sets initrd_max to 2 GB to "work around 
bootloader
 + * bugs". Luckily, QEMU firmware(which does something like 
bootloader)
 + * has supported this.
 + *
 + * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, 
initrd can
 + * be loaded into any address.
 + *
 + * In addition, initrd_max is uint32_t simply because QEMU 
doesn't
 + * support the 64-bit boot protocol (specifically the 
ext_ramdisk_image
 + * field).
 + *
 + * Therefore here just limit initrd_max to UINT32_MAX simply 
as well.
 + *
 + * FIXME: it's possible that linux protocol within [0x208, 
0x20c]
 + * supports up to 4G initrd as well.

 I don't understand what exactly this FIXME comment is
 documenting.  What exactly needs to be fixed?


XLF_CAN_BE_LOADED_ABOVE_4G is only one of the indicators; actually, as the comments say,
Linux has supported up to 4 GB initrd since linux-2.6.26 (protocol version 0x208).


I just want to comment that linux with protocol within [0x208, 0x20c] supports 
up to 4 GB initrd as well.

Is documenting with FIXME appropriate?


Thanks




Fixme should say what is missing in the qemu implementation.




thanks for your explanation @Michael
I'd like to update "FIXME" to "NOTE" and move it into git-commit-log if no 
objection
and it's okay to delete it simply if it confuses others :)

BTW: any other comments for the others

Thanks
Zhijian





E.g.

/*
  * Bar 2010 and up can actually be supported using foo.
  * FIXME: make use of foo to support bar.
  */







Re: [Qemu-devel] [PATCH v5 4/4] i386: allow to load initrd below 4G for recent linux

2019-01-14 Thread Li Zhijian

Hi Eduardo


On 1/15/19 01:53, Eduardo Habkost wrote:

+if (protocol >= 0x20c &&
+lduw_p(header+0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
+/*
+ * Linux has supported initrd up to 4 GB for a very long time (2007,
+ * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
+ * though it only sets initrd_max to 2 GB to "work around bootloader
+ * bugs". Luckily, QEMU firmware(which does something like bootloader)
+ * has supported this.
+ *
+ * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
+ * be loaded into any address.
+ *
+ * In addition, initrd_max is uint32_t simply because QEMU doesn't
+ * support the 64-bit boot protocol (specifically the ext_ramdisk_image
+ * field).
+ *
+ * Therefore here just limit initrd_max to UINT32_MAX simply as well.
+ *
+ * FIXME: it's possible that linux protocol within [0x208, 0x20c]
+ * supports up to 4G initrd as well.

I don't understand what exactly this FIXME comment is
documenting.  What exactly needs to be fixed?


XLF_CAN_BE_LOADED_ABOVE_4G is only one of the indicators; actually, as the comments say,
Linux has supported up to 4 GB initrd since linux-2.6.26 (protocol version 0x208).


I just want to comment that linux with protocol within [0x208, 0x20c] supports 
up to 4 GB initrd as well.

Is documenting with FIXME appropriate?


Thanks






Re: [Qemu-devel] [PATCH v5 3/4] i386: import & use bootparam.h

2019-01-11 Thread Li Zhijian

Hi Stefano


On 1/11/19 17:48, Stefano Garzarella wrote:

Hi Li,

On Fri, Jan 11, 2019 at 10:06 AM Li Zhijian  wrote:

+# unnecessary import of several video/ist/etc headers
+sed -e '/__ASSEMBLY__/,/__ASSEMBLY__/d' $tmpdir/include/asm/bootparam.h 
> $tmpdir/bootparam.h
+cp_portable $tmpdir/bootparam.h 
"$output/include/standard-headers/asm-$arch"

Maybe you missed my comment. Anyway, IMHO it is better to use double
quotes for all paths.

Reviewed-by: Stefano Garzarella 


So sorry about that; I folded it into the wrong branch at the beginning.

will update it soon.


Thanks





[Qemu-devel] [PATCH v5 3/4] i386: import & use bootparam.h

2019-01-11 Thread Li Zhijian
it's from v4.20-rc5.

CC: Michael S. Tsirkin 
Signed-off-by: Li Zhijian 
Reviewed-by: Michael S. Tsirkin 
---
V5: add reviewed-by tag
V4: use script to import bootparam.h (Michael S. Tsirkin)
V3: new patch
---
 hw/i386/pc.c |  8 +--
 include/standard-headers/asm-x86/bootparam.h | 34 
 scripts/update-linux-headers.sh  |  4 
 3 files changed, 39 insertions(+), 7 deletions(-)
 create mode 100644 include/standard-headers/asm-x86/bootparam.h

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f248662..89c25b2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -74,6 +74,7 @@
 #include "hw/nmi.h"
 #include "hw/i386/intel_iommu.h"
 #include "hw/net/ne2000-isa.h"
+#include "standard-headers/asm-x86/bootparam.h"
 
 /* debug PC/ISA interrupts */
 //#define DEBUG_IRQ
@@ -820,13 +821,6 @@ static long get_file_size(FILE *f)
 return size;
 }
 
-/* setup_data types */
-#define SETUP_NONE 0
-#define SETUP_E820_EXT 1
-#define SETUP_DTB  2
-#define SETUP_PCI  3
-#define SETUP_EFI  4
-
 struct setup_data {
 uint64_t next;
 uint32_t type;
diff --git a/include/standard-headers/asm-x86/bootparam.h 
b/include/standard-headers/asm-x86/bootparam.h
new file mode 100644
index 000..67d4f01
--- /dev/null
+++ b/include/standard-headers/asm-x86/bootparam.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+/* setup_data types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+#define SETUP_DTB  2
+#define SETUP_PCI  3
+#define SETUP_EFI  4
+#define SETUP_APPLE_PROPERTIES 5
+#define SETUP_JAILHOUSE6
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK   0x07FF
+#define RAMDISK_PROMPT_FLAG0x8000
+#define RAMDISK_LOAD_FLAG  0x4000
+
+/* loadflags */
+#define LOADED_HIGH(1<<0)
+#define KASLR_FLAG (1<<1)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS  (1<<6)
+#define CAN_USE_HEAP   (1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64  (1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
+#define XLF_EFI_HANDOVER_32(1<<2)
+#define XLF_EFI_HANDOVER_64(1<<3)
+#define XLF_EFI_KEXEC  (1<<4)
+
+
+#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 0a964fe..77ec108 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -120,6 +120,10 @@ for arch in $ARCHLIST; do
 cp "$tmpdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/"
 cp "$tmpdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/"
 cp_portable "$tmpdir/include/asm/kvm_para.h" 
"$output/include/standard-headers/asm-$arch"
+# Remove everything except the macros from bootparam.h avoiding the
+# unnecessary import of several video/ist/etc headers
+sed -e '/__ASSEMBLY__/,/__ASSEMBLY__/d' 
$tmpdir/include/asm/bootparam.h > $tmpdir/bootparam.h
+cp_portable $tmpdir/bootparam.h 
"$output/include/standard-headers/asm-$arch"
 fi
 done
 
-- 
2.7.4




[Qemu-devel] [PATCH v5 4/4] i386: allow to load initrd below 4G for recent linux

2019-01-11 Thread Li Zhijian
Since linux commit: cf8fa920cb42 ("i386: handle an initrd in highmem (version 
2)")
linux has supported initrd up to 4 GB, but the header field
ramdisk_max is still set to 2 GB to avoid "possible bootloader bugs".

When '-kernel vmlinux -initrd initrd.cgz' is used to launch a VM,
the firmware (it could be linuxboot_dma.bin) reads the initrd
contents into guest memory (below ramdisk_max) and jumps to the kernel.
That is similar to what a bootloader such as grub does.

In addition, initrd_max is uint32_t simply because QEMU doesn't support
the 64-bit boot protocol (specifically the ext_ramdisk_image field).

Therefore here just limit initrd_max to UINT32_MAX simply as well to
allow initrd to be loaded below 4 GB.

CC: Paolo Bonzini 
CC: Richard Henderson 
CC: Eduardo Habkost 
CC: "Michael S. Tsirkin" 
CC: Marcel Apfelbaum 
Signed-off-by: Li Zhijian 

---
V5: update comments and changelog
V3: correct grammar and check XLF_CAN_BE_LOADED_ABOVE_4G first (Michael S. 
Tsirkin)

Signed-off-by: Li Zhijian 
---
 hw/i386/pc.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 89c25b2..ea7a3c7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -904,7 +904,29 @@ static void load_linux(PCMachineState *pcms,
 #endif
 
 /* highest address for loading the initrd */
-if (protocol >= 0x203) {
+if (protocol >= 0x20c &&
+lduw_p(header+0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
+/*
+ * Linux has supported initrd up to 4 GB for a very long time (2007,
+ * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
+ * though it only sets initrd_max to 2 GB to "work around bootloader
+ * bugs". Luckily, QEMU firmware(which does something like bootloader)
+ * has supported this.
+ *
+ * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
+ * be loaded into any address.
+ *
+ * In addition, initrd_max is uint32_t simply because QEMU doesn't
+ * support the 64-bit boot protocol (specifically the ext_ramdisk_image
+ * field).
+ *
+ * Therefore here just limit initrd_max to UINT32_MAX simply as well.
+ *
+ * FIXME: it's possible that linux protocol within [0x208, 0x20c]
+ * supports up to 4G initrd as well.
+ */
+initrd_max = UINT32_MAX;
+} else if (protocol >= 0x203) {
 initrd_max = ldl_p(header+0x22c);
 } else {
 initrd_max = 0x37ff;
-- 
2.7.4




  1   2   3   >