[PATCH v1] target/i386: Always set leaf 0x1f
From: Manish Mishra QEMU does not set 0x1f in case VM does not have extended CPU topology and expects guests to fall back to 0xb. Some versions of Windows do not like this behavior and expect this leaf to be populated. As a result the Windows VM fails with a blue screen. Leaf 0x1f is a superset of 0xb, so it makes sense to set 0x1f equivalent to 0xb by default and work around the Windows issue. This change adds a new property 'cpuid-0x1f-enforce' to set leaf 0x1f equivalent to 0xb in case extended CPU topology is not configured and behave as before otherwise. Steps to reproduce this issue: This requires a Windows 10 or 11 VM, with credential guard and the HyperV role enabled. Also this issue shows up starting with SapphireRapids, which raised the cpuid level to 0x20, hence exposing 0x1f to guests. ./qemu-system-x86_64 -drive file=/usr/share/OVMF/OVMF_CODE_4MB.secboot.fd,if=pflash,format=raw,unit=0,readonly=on -drive file=/usr/share/OVMF/OVMF_VARS_4MB.fd,if=pflash,format=raw -machine pc-q35,smm=on -global mch.extended-tseg-mbytes=80 -accel kvm -m 2G -cpu SapphireRapids-v1,invtsc=on,vmx=on -smp 2,maxcpus=240,sockets=120,dies=1,cores=2,threads=1 -hda systest_ahv_win10_cg.qcow2 Signed-off-by: Manish Mishra --- hw/i386/pc.c | 1 + target/i386/cpu.c | 7 +-- target/i386/cpu.h | 5 + target/i386/kvm/kvm.c | 4 +++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c74931d577..4cab04e443 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -85,6 +85,7 @@ GlobalProperty pc_compat_9_0[] = { { TYPE_X86_CPU, "guest-phys-bits", "0" }, { "sev-guest", "legacy-vm-type", "on" }, { TYPE_X86_CPU, "legacy-multi-node", "on" }, +{ TYPE_X86_CPU, "cpuid-0x1f-enforce", "false" }, }; const size_t pc_compat_9_0_len = G_N_ELEMENTS(pc_compat_9_0); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 4688d140c2..7b71083bc9 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6637,7 +6637,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, break; case 
0x1F: /* V2 Extended Topology Enumeration Leaf */ -if (!x86_has_extended_topo(env->avail_cpu_topo)) { +if (!x86_has_extended_topo(env->avail_cpu_topo) && +!cpu->enable_cpuid_0x1f_enforce) { *eax = *ebx = *ecx = *edx = 0; break; } @@ -7450,7 +7451,8 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) * cpu->vendor_cpuid_only has been unset for compatibility with older * machine types. */ -if (x86_has_extended_topo(env->avail_cpu_topo) && +if ((x86_has_extended_topo(env->avail_cpu_topo) || + cpu->enable_cpuid_0x1f_enforce) && (IS_INTEL_CPU(env) || !cpu->vendor_cpuid_only)) { x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x1F); } @@ -8316,6 +8318,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, true), DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor), DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), +DEFINE_PROP_BOOL("cpuid-0x1f-enforce", X86CPU, enable_cpuid_0x1f_enforce, true), DEFINE_PROP_BOOL("x-vendor-cpuid-only", X86CPU, vendor_cpuid_only, true), DEFINE_PROP_BOOL("x-amd-topoext-features-only", X86CPU, amd_topoext_features_only, true), DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false), diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 1e121acef5..49c5641ba8 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2102,6 +2102,11 @@ struct ArchCPU { /* Compatibility bits for old machine types: */ bool enable_cpuid_0xb; +/* Always return values for 0x1f leaf. In cases where extended CPU topology + * is not configured, return values equivalent of leaf 0xb. 
+ */ +bool enable_cpuid_0x1f_enforce; + /* Enable auto level-increase for all CPUID leaves */ bool full_cpuid_auto_level; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index becca2efa5..a9c6f02900 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1799,6 +1799,7 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, uint32_t limit, i, j; uint32_t unused; struct kvm_cpuid_entry2 *c; +X86CPU *cpu = env_archcpu(env); cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused); @@ -1831,7 +1832,8 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, break; } case 0x1f: -if (!x86_has_extended_topo(env->avail_cpu_topo)) { +if (!x86_has_extended_topo(env->avail_cpu_topo) && +!cpu->enable_cpuid_0x1f_enforce) { cpuid_i--; break; } -- 2.43.0
[PATCH] target/i386: Always set leaf 0x1f
QEMU does not set 0x1f in case VM does not have extended CPU topology and expects guests to fallback to 0xb. Some versions of windows i.e. windows 10, 11 does not like this behavior and expects this leaf to be populated. This is observed with windows VMs with secure boot, uefi and HyperV role enabled. Leaf 0x1f is superset of 0xb, so it makes sense to set 0x1f equivalent to 0xb by default and workaround windows issue. This change adds a new property 'cpuid-0x1f-enforce' to set leaf 0x1f equivalent to 0xb in case extended CPU topology is not present and behave as before otherwise. --- hw/i386/pc.c | 1 + target/i386/cpu.c | 71 +++ target/i386/cpu.h | 5 +++ target/i386/kvm/kvm.c | 4 ++- 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c74931d577..4cab04e443 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -85,6 +85,7 @@ GlobalProperty pc_compat_9_0[] = { { TYPE_X86_CPU, "guest-phys-bits", "0" }, { "sev-guest", "legacy-vm-type", "on" }, { TYPE_X86_CPU, "legacy-multi-node", "on" }, +{ TYPE_X86_CPU, "cpuid-0x1f-enforce", "false" }, }; const size_t pc_compat_9_0_len = G_N_ELEMENTS(pc_compat_9_0); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 4688d140c2..f89b2ef335 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -416,6 +416,43 @@ static void encode_topo_cpuid1f(CPUX86State *env, uint32_t count, assert(!(*eax & ~0x1f)); } +static void encode_topo_cpuid_b(CPUX86State *env, uint32_t count, +X86CPUTopoInfo *topo_info, +uint32_t threads_per_pkg, +uint32_t *eax, uint32_t *ebx, +uint32_t *ecx, uint32_t *edx) +{ +X86CPU *cpu = env_archcpu(env); + +if (!cpu->enable_cpuid_0xb) { +*eax = *ebx = *ecx = *edx = 0; +return; +} + +*ecx = count & 0xff; +*edx = cpu->apic_id; + +switch (count) { +case 0: +*eax = apicid_core_offset(topo_info); +*ebx = topo_info->threads_per_core; +*ecx |= CPUID_B_ECX_TOPO_LEVEL_SMT << 8; +break; +case 1: +*eax = apicid_pkg_offset(topo_info); +*ebx = threads_per_pkg; +*ecx |= 
CPUID_B_ECX_TOPO_LEVEL_CORE << 8; +break; +default: +*eax = 0; +*ebx = 0; +*ecx |= CPUID_B_ECX_TOPO_LEVEL_INVALID << 8; +} + +assert(!(*eax & ~0x1f)); +*ebx &= 0xffff; /* The count doesn't need to be reliable. */ +} + /* Encode cache info for CPUID[0x80000005].ECX or CPUID[0x80000005].EDX */ static uint32_t encode_cache_cpuid80000005(CPUCacheInfo *cache) { @@ -6601,33 +6638,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, break; case 0xB: /* Extended Topology Enumeration Leaf */ -if (!cpu->enable_cpuid_0xb) { -*eax = *ebx = *ecx = *edx = 0; -break; -} - -*ecx = count & 0xff; -*edx = cpu->apic_id; - -switch (count) { -case 0: -*eax = apicid_core_offset(&topo_info); -*ebx = topo_info.threads_per_core; -*ecx |= CPUID_B_ECX_TOPO_LEVEL_SMT << 8; -break; -case 1: -*eax = apicid_pkg_offset(&topo_info); -*ebx = threads_per_pkg; -*ecx |= CPUID_B_ECX_TOPO_LEVEL_CORE << 8; -break; -default: -*eax = 0; -*ebx = 0; -*ecx |= CPUID_B_ECX_TOPO_LEVEL_INVALID << 8; -} - -assert(!(*eax & ~0x1f)); -*ebx &= 0xffff; /* The count doesn't need to be reliable. 
*/ +encode_topo_cpuid_b(env, count, &topo_info, threads_per_pkg, +eax, ebx, ecx, edx); break; case 0x1C: if (cpu->enable_pmu && (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) { @@ -6639,6 +6651,10 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, /* V2 Extended Topology Enumeration Leaf */ if (!x86_has_extended_topo(env->avail_cpu_topo)) { *eax = *ebx = *ecx = *edx = 0; +if (cpu->enable_cpuid_0x1f_enforce) { +encode_topo_cpuid_b(env, count, &topo_info, threads_per_pkg, +eax, ebx, ecx, edx); +} break; } @@ -8316,6 +8332,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, true), DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor), DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), +DEFINE_PROP_BOOL("cpuid-0x1f-enforce", X86CPU, enable_cpuid_0x1f_enforce, true), DEFINE_PROP_BOOL("x-vendor-cpuid-only", X86CPU, vendor_cpuid_only, true), DEFINE_PROP_BOOL("x-amd-topoext-features-on
Re: [Qemu-devel] [PATCH] ahci: enable pci bus master MemoryRegion before loading ahci engines
On 21/08/23 5:31 pm, manish.mishra wrote: Hi Everyone, We are facing this issue. I see this conversation was never conversed and discussed issue is still active on QEMU master. Just for summary, the solution mentioned in this thread "temporarily enable bus master memory region" was not taken with the following justification. "Poking at PCI device internals like this seems fragile. And force enabling bus master can lead to unpleasantness like corrupting guest memory, unhandled interrupts, etc. E.g. it's quite reasonable, spec-wise, for the guest to move thing in memory around while bus mastering is off." There was an alternate solution mentioned in thread, that is to mark PORT_CMD_FIS_RX and PORT_CMD_START disabled forcefully if the bus master for the device is disabled. But there are no further conclusive discussions on this. I wanted to start this conversation again to hopefully get a conclusion for this. Thanks Manish Mishra On 10/09/19 7:38 pm, John Snow wrote: On 9/10/19 9:58 AM, Michael S. Tsirkin wrote: On Tue, Sep 10, 2019 at 09:50:41AM -0400, John Snow wrote: On 9/10/19 3:04 AM, Michael S. Tsirkin wrote: On Tue, Sep 10, 2019 at 01:18:37AM +0800, andychiu wrote: If Windows 10 guests have enabled 'turn off hard disk after idle' option in power settings, and the guest has a SATA disk plugged in, the SATA disk will be turned off after a specified idle time. 
If the guest is live migrated or saved/loaded with its SATA disk turned off, the following error will occur: qemu-system-x86_64: AHCI: Failed to start FIS receive engine: bad FIS receive buffer address qemu-system-x86_64: Failed to load ich9_ahci:ahci qemu-system-x86_64: error while loading state for instance 0x0 of device '0000:00:1a.0/ich9_ahci' qemu-system-x86_64: load of migration failed: Operation not permitted Observation from trace logs shows that a while after Windows 10 turns off a SATA disk (IDE disks don't have the following behavior), it will disable the PCI_COMMAND_MASTER flag of the pci device containing the ahci device. When the disk is turning back on, the PCI_COMMAND_MASTER flag will be restored first. But if the guest is migrated or saved/loaded while the disk is off, the post_load callback of ahci device, ahci_state_post_load(), will fail at ahci_cond_start_engines() if the MemoryRegion pci_dev->bus_master_enable_region is not enabled, with pci_dev pointing to the PCIDevice struct containing the ahci device. This patch enables pci_dev->bus_master_enable_region before calling ahci_cond_start_engines() in ahci_state_post_load(), and restores the MemoryRegion to its original state afterwards. Signed-off-by: andychiu Poking at PCI device internals like this seems fragile. And force enabling bus master can lead to unpleasantness like corrupting guest memory, unhandled interrupts, etc. E.g. it's quite reasonable, spec-wise, for the guest to move things in memory around while bus mastering is off. Can you teach ahci that region being disabled during migration is ok, and recover from it? That's what I'm wondering. I could try to just disable the FIS RX engine if the mapping fails, but that will require a change to guest visible state. My hunch, though, is that when windows re-enables the device it will need to re-program the address registers anyway, so it might cope well with the FIS RX bit getting switched off. 
(I'm wondering if it isn't a mistake that QEMU is trying to re-map this address in the first place. Is it legal that the PCI device has pci bus master disabled but we've held on to a mapping? If you are poking at guest memory when bus master is off, then most likely yes. Should there be some callback where AHCI knows to invalidate mappings at that point...?) ATM the callback is the config write, you check proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER and if disabled invalidate the mapping. virtio at least has code that pokes at proxy->pci_dev.config[PCI_COMMAND] too, I'm quite open to a function along the lines of pci_is_bus_master_enabled() that will do this. Well, that's not a callback. I don't think it's right to check the PCI_COMMAND register *every* time AHCI does anything at all to see if its mappings are still valid. AHCI makes a mapping *once* when FIS RX is turned on, and it unmaps it when it's turned off. It assumes it remains valid that whole time. When we migrate, it checks to see if it was running, and performs the mappings again to re-boot the state machine. What I'm asking is; what are the implications of a guest disabling PCI_COMMAND_MASTER? (I don't know PCI as well as you do.) What should that mean for the AHCI state machine? Does this *necessarily* invalidate the mappings? (In which case -- it's an error that AHCI held on to them after Windows disabled the card, even if AHCI isn't being engaged by the gue
Re: [Qemu-devel] [PATCH] ahci: enable pci bus master MemoryRegion before loading ahci engines
Hi Everyone, We are facing this issue. I see this conversation never reached a conclusion and the discussed issue is still active on QEMU master. Just for summary, the solution mentioned in this thread "temporarily enable bus master memory region" was not taken with the following justification. "Poking at PCI device internals like this seems fragile. And force enabling bus master can lead to unpleasantness like corrupting guest memory, unhandled interrupts, etc. E.g. it's quite reasonable, spec-wise, for the guest to move things in memory around while bus mastering is off." There was an alternate solution mentioned in thread, that is to mark PORT_CMD_FIS_RX and PORT_CMD_START disabled forcefully if the bus master for the device is disabled. But there are no further conclusive discussions on this. I wanted to start this conversation again to hopefully get a conclusion for this. Thanks Manish Mishra On 10/09/19 7:38 pm, John Snow wrote: On 9/10/19 9:58 AM, Michael S. Tsirkin wrote: On Tue, Sep 10, 2019 at 09:50:41AM -0400, John Snow wrote: On 9/10/19 3:04 AM, Michael S. Tsirkin wrote: On Tue, Sep 10, 2019 at 01:18:37AM +0800, andychiu wrote: If Windows 10 guests have enabled 'turn off hard disk after idle' option in power settings, and the guest has a SATA disk plugged in, the SATA disk will be turned off after a specified idle time. If the guest is live migrated or saved/loaded with its SATA disk turned off, the following error will occur: qemu-system-x86_64: AHCI: Failed to start FIS receive engine: bad FIS receive buffer address qemu-system-x86_64: Failed to load ich9_ahci:ahci qemu-system-x86_64: error while loading state for instance 0x0 of device '0000:00:1a.0/ich9_ahci' qemu-system-x86_64: load of migration failed: Operation not permitted Observation from trace logs shows that a while after Windows 10 turns off a SATA disk (IDE disks don't have the following behavior), it will disable the PCI_COMMAND_MASTER flag of the pci device containing the ahci device. 
When the disk is turning back on, the PCI_COMMAND_MASTER flag will be restored first. But if the guest is migrated or saved/loaded while the disk is off, the post_load callback of ahci device, ahci_state_post_load(), will fail at ahci_cond_start_engines() if the MemoryRegion pci_dev->bus_master_enable_region is not enabled, with pci_dev pointing to the PCIDevice struct containing the ahci device. This patch enables pci_dev->bus_master_enable_region before calling ahci_cond_start_engines() in ahci_state_post_load(), and restores the MemoryRegion to its original state afterwards. Signed-off-by: andychiu Poking at PCI device internals like this seems fragile. And force enabling bus master can lead to unpleasantness like corrupting guest memory, unhandled interrupts, etc. E.g. it's quite reasonable, spec-wise, for the guest to move things in memory around while bus mastering is off. Can you teach ahci that region being disabled during migration is ok, and recover from it? That's what I'm wondering. I could try to just disable the FIS RX engine if the mapping fails, but that will require a change to guest visible state. My hunch, though, is that when windows re-enables the device it will need to re-program the address registers anyway, so it might cope well with the FIS RX bit getting switched off. (I'm wondering if it isn't a mistake that QEMU is trying to re-map this address in the first place. Is it legal that the PCI device has pci bus master disabled but we've held on to a mapping? If you are poking at guest memory when bus master is off, then most likely yes. Should there be some callback where AHCI knows to invalidate mappings at that point...?) ATM the callback is the config write, you check proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER and if disabled invalidate the mapping. virtio at least has code that pokes at proxy->pci_dev.config[PCI_COMMAND] too, I'm quite open to a function along the lines of pci_is_bus_master_enabled() that will do this. 
Well, that's not a callback. I don't think it's right to check the PCI_COMMAND register *every* time AHCI does anything at all to see if its mappings are still valid. AHCI makes a mapping *once* when FIS RX is turned on, and it unmaps it when it's turned off. It assumes it remains valid that whole time. When we migrate, it checks to see if it was running, and performs the mappings again to re-boot the state machine. What I'm asking is; what are the implications of a guest disabling PCI_COMMAND_MASTER? (I don't know PCI as well as you do.) What should that mean for the AHCI state machine? Does this *necessarily* invalidate the mappings? (In which case -- it's an error that AHCI held on to them after Windows disabled the card, even if AHCI isn't being engaged by the guest anymore. Essentially, we were turned off but didn't clean up a dangling pointer, but we need the event that tells us to clean the dangling mapping.)
Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()
On 26/04/23 5:35 pm, Juan Quintela wrote: "manish.mishra" wrote: On 26/04/23 4:35 pm, Juan Quintela wrote: "manish.mishra" wrote: On 26/04/23 3:58 pm, Juan Quintela wrote: Before: while (true) { sem_post(channels_ready) } And you want to add to the initialization a counter equal to the number of channels. Now: while (true) { sem_post(channels_ready) } It is semantically the same, but when we setup it ready it means that when we set it to 1, we now that the channel and thread are ready for action. May be we can do one thing let the sem_post in while loop at same position itself. But we can do another post just before start I can see how this can make any difference. of this while loop, as that will be called only once it should do work of initialising count equal to multiFD channels? Yeap. But I can see what difference do we have here. Later, Juan. Thanks Juan, Just confirming if i misunderstood something :) I meant your approach makes sense, i was just suggesting a small change. To do something like this. qemu_sem_init(&multifd_send_state->channels_ready, 0); static void *multifd_send_thread(void *opaque) { ... sem_post(channels_ready); // Post once at start of thread and let one in loop as it is. while (true) { sem_post(channels_ready) } } Something like below has issue that we are marking channel_ready even before channel is actually ready, I think it is exactly the same. i meant if network is slow it may take some time to update pending_job and hence we can busy loop in send_multifd_page(). No difference from send_multifd_page() point of view. Notice that I mank that the channel is ready before I do any work. send_multifd_page() does a sem_wait() before doing anything related to this channel, so I can't see how it can be a differnce. static void *multifd_send_thread(void *opaque) { ... while (true) { sem_post(channels_ready); } } Not sure if we are already in agreement :) just confirming. 
sem_post(channels_ready); // Post once at start of thread and let one in loop as it is. while (true) { ... sem_post(channels_ready) } and while (true) { sem_post(channels_ready) ... } When "..." is exactly the same don't make any difference, the only difference is that in one case we "write" to semposts, and in the other we write just one. Or I am missing something. Later, Juan. yes sorry, i misunderstood techically it does not make any difference. LGTM Thanks Manish Mishra
Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()
On 26/04/23 4:35 pm, Juan Quintela wrote: "manish.mishra" wrote: On 26/04/23 3:58 pm, Juan Quintela wrote: "manish.mishra" wrote: multifd_send_sync_main() posts request on the multifd channel but does not call sem_wait() on channels_ready semaphore, making the channels_ready semaphore count keep increasing. As a result, sem_wait() on channels_ready in multifd_send_pages() is always non-blocking hence multifd_send_pages() keeps searching for a free channel in a busy loop until a channel is freed. Signed-off-by: manish.mishra --- migration/multifd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index cce3ad6988..43d26e7012 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f) trace_multifd_send_sync_main_signal(p->id); +qemu_sem_wait(&multifd_send_state->channels_ready); qemu_mutex_lock(&p->mutex); if (p->quit) { We need this, but I think it is better to put it on the second loop. @@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); -qemu_sem_init(&multifd_send_state->channels_ready, 0); +qemu_sem_init(&multifd_send_state->channels_ready, thread_count); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; I think this bit is wrong. We should not set the channels ready until the thread is ready and channel is created. What do you think about this patch: From bcb0ef9b97b858458c403d2e4dc9e0dbd96721b3 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 26 Apr 2023 12:20:36 +0200 Subject: [PATCH] multifd: Fix the number of channels ready We don't wait in the sem when we are doing a sync_main. Make it wait there. 
To make things clearer, we mark the channel ready at the begining of the thread loop. This causes a busy loop in multifd_send_page(). Found-by: manish.mishra Signed-off-by: Juan Quintela --- migration/multifd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 903df2117b..e625e8725e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -635,6 +635,7 @@ int multifd_send_sync_main(QEMUFile *f) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; +qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -668,6 +669,7 @@ static void *multifd_send_thread(void *opaque) p->num_packets = 1; while (true) { +qemu_sem_post(&multifd_send_state->channels_ready); This has one issue though, if we mark channel_ready here itself, channel is actually not ready so we can still busy loop? Before: while (true) { sem_post(channels_ready) } And you want to add to the initialization a counter equal to the number of channels. Now: while (true) { sem_post(channels_ready) } It is semantically the same, but when we setup it ready it means that when we set it to 1, we now that the channel and thread are ready for action. May be we can do one thing let the sem_post in while loop at same position itself. But we can do another post just before start I can see how this can make any difference. of this while loop, as that will be called only once it should do work of initialising count equal to multiFD channels? Yeap. But I can see what difference do we have here. Later, Juan. Thanks Juan, Just confirming if i misunderstood something :) I meant your approach makes sense, i was just suggesting a small change. To do something like this. qemu_sem_init(&multifd_send_state->channels_ready, 0); static void *multifd_send_thread(void *opaque) { ... 
sem_post(channels_ready); // Post once at start of thread and let one in loop as it is. while (true) { sem_post(channels_ready) } } Something like below has issue that we are marking channel_ready even before channel is actually ready, i meant if network is slow it may take some time to update pending_job and hence we can busy loop in send_multifd_page(). static void *multifd_send_thread(void *opaque) { ... while (true) { sem_post(channels_ready); } } Not sure if we are already in agreement :) just confirming. Thanks Manish Mishra
Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()
On 26/04/23 3:58 pm, Juan Quintela wrote: "manish.mishra" wrote: multifd_send_sync_main() posts request on the multifd channel but does not call sem_wait() on channels_ready semaphore, making the channels_ready semaphore count keep increasing. As a result, sem_wait() on channels_ready in multifd_send_pages() is always non-blocking hence multifd_send_pages() keeps searching for a free channel in a busy loop until a channel is freed. Signed-off-by: manish.mishra --- migration/multifd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index cce3ad6988..43d26e7012 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f) trace_multifd_send_sync_main_signal(p->id); +qemu_sem_wait(&multifd_send_state->channels_ready); qemu_mutex_lock(&p->mutex); if (p->quit) { We need this, but I think it is better to put it on the second loop. @@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); -qemu_sem_init(&multifd_send_state->channels_ready, 0); +qemu_sem_init(&multifd_send_state->channels_ready, thread_count); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; I think this bit is wrong. We should not set the channels ready until the thread is ready and channel is created. What do you think about this patch: From bcb0ef9b97b858458c403d2e4dc9e0dbd96721b3 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 26 Apr 2023 12:20:36 +0200 Subject: [PATCH] multifd: Fix the number of channels ready We don't wait in the sem when we are doing a sync_main. Make it wait there. To make things clearer, we mark the channel ready at the begining of the thread loop. This causes a busy loop in multifd_send_page(). 
Found-by: manish.mishra Signed-off-by: Juan Quintela --- migration/multifd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 903df2117b..e625e8725e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -635,6 +635,7 @@ int multifd_send_sync_main(QEMUFile *f) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; +qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -668,6 +669,7 @@ static void *multifd_send_thread(void *opaque) p->num_packets = 1; while (true) { +qemu_sem_post(&multifd_send_state->channels_ready); This has one issue though, if we mark channel_ready here itself, channel is actually not ready so we can still busy loop? May be we can do one thing let the sem_post in while loop at same position itself. But we can do another post just before start of this while loop, as that will be called only once it should do work of initialising count equal to multiFD channels? qemu_sem_wait(&p->sem); if (qatomic_read(&multifd_send_state->exiting)) { @@ -736,7 +738,6 @@ static void *multifd_send_thread(void *opaque) if (flags & MULTIFD_FLAG_SYNC) { qemu_sem_post(&p->sem_sync); } -qemu_sem_post(&multifd_send_state->channels_ready); } else if (p->quit) { qemu_mutex_unlock(&p->mutex); break;
[PATCH] multifd: Avoid busy-wait in multifd_send_pages()
multifd_send_sync_main() posts request on the multifd channel but does not call sem_wait() on channels_ready semaphore, making the channels_ready semaphore count keep increasing. As a result, sem_wait() on channels_ready in multifd_send_pages() is always non-blocking hence multifd_send_pages() keeps searching for a free channel in a busy loop until a channel is freed. Signed-off-by: manish.mishra --- migration/multifd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index cce3ad6988..43d26e7012 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f) trace_multifd_send_sync_main_signal(p->id); +qemu_sem_wait(&multifd_send_state->channels_ready); qemu_mutex_lock(&p->mutex); if (p->quit) { @@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); -qemu_sem_init(&multifd_send_state->channels_ready, 0); +qemu_sem_init(&multifd_send_state->channels_ready, thread_count); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; -- 2.22.3
Re: Expose support for HyperV features via QMP
On 09/02/23 7:47 pm, Vitaly Kuznetsov wrote: Alex Bennée writes: "manish.mishra" writes: Hi Everyone, Checking if there is any feedback on this. I've expanded the CC list to some relevant maintainers and people who have touched that code in case this was missed. Thanks Manish Mishra On 31/01/23 8:17 pm, manish.mishra wrote: Hi Everyone, I hope everyone is doing great. We wanted to check why we do not expose support for HyperV features in Qemu similar to what we do for normal CPU features via query-cpu-defs or cpu-model-expansion QMP commands. This support is required for live migration with HyperV features as hyperv passthrough is not an option. If users had knowledge of what features are supported by source and destination, VM can be started with an intersection of features supported by both source and destination. If there is no specific reason for not doing this, does it make sense to add a new QMP which expose support (internally also validating with KVM or KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features. Apologies in advance if i misunderstood something. Thanks for Ccing me. Hyper-V features should appear in QMP since commit 071ce4b03becf9e2df6b758fde9609be8ddf56f1 Author: Vitaly Kuznetsov Date: Tue Jun 8 14:08:13 2021 +0200 i386: expand Hyper-V features during CPU feature expansion time also, the support for Hypre-V feature discovery was just added to libvirt: 903ea9370d qemu_capabilities: Report Hyper-V Enlightenments in domcapabilities 10f4784864 qemu_capabilities: Query for Hyper-V Enlightenments ff8731680b qemuMonitorJSONGetCPUModelExpansion: Introduce @hv_passthrough argument 7c12eb2397 qemuMonitorJSONMakeCPUModel: Introduce @hv_passthrough argument 7c1ecfd512 domain_capabilities: Expose Hyper-V Enlightenments 179e45d237 virDomainCapsEnumFormat: Retrun void a7789d9324 virDomainCapsEnumFormat: Switch to virXMLFormatElement() in case this is not enough, could you please elaborate on the use-case you have in mind? 
Thanks Vitaly, Alex. Yes, this should work. Sorry, I checked the qemu master code but I am not sure how I missed this :). Thanks Manish Mishra
Re: Expose support for HyperV features via QMP
Hi Everyone, Checking if there is any feedback on this. Thanks Manish Mishra On 31/01/23 8:17 pm, manish.mishra wrote: Hi Everyone, I hope everyone is doing great. We wanted to check why we do not expose support for HyperV features in Qemu similar to what we do for normal CPU features via query-cpu-defs or cpu-model-expansion QMP commands. This support is required for live migration with HyperV features as hyperv passthrough is not an option. If users had knowledge of what features are supported by source and destination, VM can be started with an intersection of features supported by both source and destination. If there is no specific reason for not doing this, does it make sense to add a new QMP which expose support (internally also validating with KVM or KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features. Apologies in advance if i misunderstood something. Thanks Manish Mishra
Re: Expose support for HyperV features via QMP
On 31/01/23 8:17 pm, manish.mishra wrote: Hi Everyone, I hope everyone is doing great. We wanted to check why we do not expose support for HyperV features in Qemu similar to what we do for normal CPU features via query-cpu-defs or cpu-model-expansion QMP commands. This support is required for live migration with HyperV features as hyperv passthrough is not an option. If users had knowledge of what features are supported by source and destination, VM can be started with an intersection of features supported by both source and destination. If there is no specific reason for not doing this, does it make sense to add a new QMP which expose support (internally also validating with KVM or KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features. Apologies in advance if i misunderstood something. Thanks Manish Mishra Hi Everyone, Checking if there is any feedback on this. Thanks Manish Mishra
Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels
On 31/01/23 8:47 pm, Peter Xu wrote: On Tue, Jan 31, 2023 at 08:29:08PM +0530, manish.mishra wrote: Hi Peter, Daniel, Just a gentle reminder on this patch if it can be merged, and really sorry, I see now that the earlier reminders I sent were on v6[0/2] and somehow you were not CCed on that earlier. You were CCed just on v6[1/2] and v6[2/2], so that's probably why you missed it. Yes I think so. For some reason I guess Juan missed this set when sending the most recent PR. We should pick them up soon. Thanks Peter :)
Expose support for HyperV features via QMP
Hi Everyone, I hope everyone is doing great. We wanted to check why we do not expose support for HyperV features in Qemu similar to what we do for normal CPU features via query-cpu-defs or cpu-model-expansion QMP commands. This support is required for live migration with HyperV features as hyperv passthrough is not an option. If users had knowledge of what features are supported by source and destination, VM can be started with an intersection of features supported by both source and destination. If there is no specific reason for not doing this, does it make sense to add a new QMP which expose support (internally also validating with KVM or KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features. Apologies in advance if i misunderstood something. Thanks Manish Mishra
Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels
Hi Peter, Daniel, Just a gentle reminder on this patch if it can be merged, and really sorry i see now earlier reminders i sent were on v6[0/2] and somehow you were not CCed on that earlier. You were CCed just on v6[1/2] and v6[2,2] so that's why probably missed it. Thanks Manish Mishra On 21/12/22 12:06 am, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. 
Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. v5: 1. Handle busy-wait in migration_channel_read_peek due partial reads. v6: With earlier patch, multifd_load_setup was done only in migration_incoming_setup but if multifd channel is received before default channel, multifd channels will be uninitialized. Moved multifd_load_setup to migration_ioc_process_incoming. manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 ++--- migration/channel-block.c | 1 + migration/channel.c | 45 migration/channel.h | 5 +++ migration/migration.c | 54 - migration/multifd.c | 19 +- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +-- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 148 insertions(+), 41 deletions(-)
Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels
Hi Everyone, I was just checking if it was not missed in holidays and was received. :) Thanks Manish Mishra On 21/12/22 12:14 am, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. 
Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. v5: 1. Handle busy-wait in migration_channel_read_peek due partial reads. v6: With earlier patch, multifd_load_setup was done only in migration_incoming_setup but if multifd channel is received before default channel, multifd channels will be uninitialized. Moved multifd_load_setup to migration_ioc_process_incoming. manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 ++--- migration/channel-block.c | 1 + migration/channel.c | 45 migration/channel.h | 5 +++ migration/migration.c | 54 - migration/multifd.c | 19 +- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +-- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 148 insertions(+), 41 deletions(-)
[PATCH v6 2/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrange Suggested-by: Daniel P. Berrange Signed-off-by: manish.mishra --- migration/channel.c | 45 + migration/channel.h | 5 migration/migration.c| 54 migration/multifd.c | 19 +++--- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 +--- migration/postcopy-ram.h | 2 +- 7 files changed, 101 insertions(+), 31 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index 1b0815039f..fc0228a198 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s, migrate_fd_connect(s, error); error_free(error); } + + +/** + * @migration_channel_read_peek - Peek at migration channel, without + * actually removing it from channel buffer. 
+ * + * @ioc: the channel object + * @buf: the memory region to read data into + * @buflen: the number of bytes to read in @buf + * @errp: pointer to a NULL-initialized error object + * + * Returns 0 if successful, returns -1 and sets @errp if fails. + */ +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ +ssize_t len = 0; +struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + +while (true) { +len = qio_channel_readv_full(ioc, &iov, 1, NULL, + NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + +if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { +error_setg(errp, + "Failed to peek at channel"); +return -1; +} + +if (len == buflen) { +break; +} + +/* 1ms sleep. */ +if (qemu_in_coroutine()) { +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +g_usleep(1000); +} +} + +return 0; +} diff --git a/migration/channel.h b/migration/channel.h index 67a461c28a..5bdb8208a7 100644 --- a/migration/channel.h +++ b/migration/channel.h @@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, Error *error_in); + +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index 52b5d39244..bbc9ce3ca6 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -31,6 +31,7 @@ #include "migration.h" #include "savevm.h" #include "qemu-file.h" +#include "channel.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -663,10 +664,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); -if (multifd_load_setup(errp) != 0) { -return false; -} - if (!mis->from_src_file) { mis->from_src_file = f; } @@ -733,31 +730,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); 
Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migrate_postcopy_ram() &
[PATCH v6 1/2] io: Add support for MSG_PEEK for socket channel
MSG_PEEK peeks at the channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrange Suggested-by: Daniel P. Berrange Signed-off-by: manish.mishra --- chardev/char-socket.c | 4 ++-- include/io/channel.h| 6 ++ io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 47 insertions(+), 10 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 29ffe5075e..c2265436ac 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index f1b7e05f81..5b41d02b2b 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, 
Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); diff --git a/io/channel-buffer.c b/io/channel-buffer.c index bf52011be2..8096180f85 100644 --- a/io/channel-buffer.c +++ b/io/channel-buffer.c @@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, +int flags, Error **errp) { QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); diff --git a/io/channel-command.c b/io/channel-command.c index 74516252ba..e7edd091af 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); diff --git a/io/channel-file.c b/io/channel-file.c index b67687c2aa..d76663e6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, +
[PATCH v6 0/2] check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. v5: 1. Handle busy-wait in migration_channel_read_peek due partial reads. 
v6: With earlier patch, multifd_load_setup was done only in migration_incoming_setup but if multifd channel is received before default channel, multifd channels will be uninitialized. Moved multifd_load_setup to migration_ioc_process_incoming. manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 ++--- migration/channel-block.c | 1 + migration/channel.c | 45 migration/channel.h | 5 +++ migration/migration.c | 54 - migration/multifd.c | 19 +- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +-- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 148 insertions(+), 41 deletions(-) -- 2.22.3
[PATCH v6 0/2] check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. v5: 1. Handle busy-wait in migration_channel_read_peek due partial reads. 
v6: With earlier patch, multifd_load_setup was done only in migration_incoming_setup but if multifd channel is received before default channel, multifd channels will be uninitialized. Moved multifd_load_setup to migration_ioc_process_incoming. manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 ++--- migration/channel-block.c | 1 + migration/channel.c | 45 migration/channel.h | 5 +++ migration/migration.c | 54 - migration/multifd.c | 19 +- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +-- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 148 insertions(+), 41 deletions(-) -- 2.22.3
Re: [PATCH 1/5] io: Add support for MSG_PEEK for socket channel
On 15/12/22 11:05 pm, Peter Xu wrote: On Thu, Dec 15, 2022 at 09:40:41AM +, Daniel P. Berrangé wrote: On Wed, Dec 14, 2022 at 04:30:48PM -0500, Peter Xu wrote: On Wed, Dec 14, 2022 at 09:14:09AM +, Daniel P. Berrangé wrote: On Tue, Dec 13, 2022 at 04:38:46PM -0500, Peter Xu wrote: From: "manish.mishra" MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Reviewed-by: Daniel P. Berrang?? The last letter of my name has been mangled - whatever tools used to pull in manish's patches seem to not be UTF-8 clean. Also the email addr isn't terminated, but that was pre-existing in manish's previous posting. Yes sorry, noticed it now, should i send another patch to update it. Also in our internal testing we noticed multifd_init was skipped for 1 case, so if this patch series is not yet submitted should i send another version of it early morning monday? Thanks Manish Mishra I'll fix at least the latter in my next post, sorry. For the 1st one - I am still looking at what went wrong. Here from the web interfaces it all looks good (besides the wrong ending..), e.g. 
on lore or patchew: https://urldefense.proofpoint.com/v2/url?u=https-3A__lore.kernel.org_all_20221213213850.1481858-2D2-2Dpeterx-40redhat.com_&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=UYXwYmi1YA2mdAgHNDIkmgh-Xbem8hmqgCedCRNaj-32GKMVK8X30wH3nnXE9VF9&s=_9G0TGvGLvwCdB7vumPEboQqcrX5RoaxZ_4aZTnMDzY&e= https://urldefense.proofpoint.com/v2/url?u=https-3A__patchew.org_QEMU_20221213213850.1481858-2D1-2Dpeterx-40redhat.com_20221213213850.1481858-2D2-2Dpeterx-40redhat.com_&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=UYXwYmi1YA2mdAgHNDIkmgh-Xbem8hmqgCedCRNaj-32GKMVK8X30wH3nnXE9VF9&s=Nv0ek8VsMHEDFdQ--vwCOrwxT_u1KqbV9014SQzI1kQ&e= It also looks good with e.g. Gmail webclient. Then I digged into the email headers and I found that comparing to Manish's original message, the patches I posted has one more line of "Content-type": Content-Type: text/plain; charset="utf-8" Content-type: text/plain https://urldefense.proofpoint.com/v2/url?u=https-3A__patchew.org_QEMU_20221213213850.1481858-2D2-2Dpeterx-40redhat.com_mbox&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=UYXwYmi1YA2mdAgHNDIkmgh-Xbem8hmqgCedCRNaj-32GKMVK8X30wH3nnXE9VF9&s=wD5Y-ZvvyUOVKpYHJkM7M8znJ3BjjpkbRIE1NVlMloI&e= While Manish's patch only has one line: Content-Type: text/plain; charset="utf-8" https://urldefense.proofpoint.com/v2/url?u=https-3A__patchew.org_QEMU_20221123172735.25181-2D2-2Dmanish.mishra-40nutanix.com_mbox&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=UYXwYmi1YA2mdAgHNDIkmgh-Xbem8hmqgCedCRNaj-32GKMVK8X30wH3nnXE9VF9&s=eQ3eyB3GvzUzyHdjTlWTt14dw4x0PL2xSbmXWPfN104&e= Don't trust what is shown by patchew, as that's been through many hops. The copy I receieved came directly to me via CC, so didn't hit mailman, nor patchew, and that *only* has "Content-type: text/plain". So the extra Content-type line with utf8 must have been added either by mailman or patchew. 
So it probably looks like a config problem in the tool you use to send the patches originally. Ouch... for misterious reasons I had one line in .gitconfig: 177 [format] 178 headers = "Content-type: text/plain" And that'll also affect git-publish too... I have it dropped now. Thanks,
Re: [PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel
On 29/11/22 7:57 pm, Peter Xu wrote: On Tue, Nov 29, 2022 at 04:24:58PM +0530, manish.mishra wrote: On 23/11/22 11:34 pm, Peter Xu wrote: On Wed, Nov 23, 2022 at 05:27:34PM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Reviewed-by: Daniel P. Berrangé Reviewed-by: Peter Xu Hi Peter, Daniel Just wanted to check if both patches in this series are approved or i need to update few more things. Also Peter, for your Review-by to be included do i need to send another patch version or it will be taken up whoever will be merging it? No need to resend it for now. https://urldefense.proofpoint.com/v2/url?u=https-3A__wiki.qemu.org_Planning_7.2&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=VKz-1dQAm60GTS1TgeCxSN7GCerZujf727kNL7OIzxaQGwdxTmmXBlOi4jJyNnGv&s=wnWA4vdxdaNYmEh0ZDD498R4ClO30VnmpOQhsIurs7U&e= Today should tag rc3, maybe it's too late for such a change to go into this release. It's not a regression of 7.2 so may not be justified either. Maybe it'll need to wait until the next dev cycle opens. Thanks, ok got it, thank you Peter. Thanks Manish Mishra
Re: [PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel
On 23/11/22 11:34 pm, Peter Xu wrote: On Wed, Nov 23, 2022 at 05:27:34PM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Reviewed-by: Daniel P. Berrangé Reviewed-by: Peter Xu Hi Peter, Daniel Just wanted to check if both patches in this series are approved or i need to update few more things. Also Peter, for your Review-by to be included do i need to send another patch version or it will be taken up whoever will be merging it? Thanks Manish Mishra
[PATCH v5 0/2] check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. v5: 1. Handle busy-wait in migration_channel_read_peek due partial reads. 
manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 ++- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 +++--- migration/channel-block.c | 1 + migration/channel.c | 45 + migration/channel.h | 5 migration/migration.c | 45 + migration/multifd.c | 12 +++- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +--- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 137 insertions(+), 36 deletions(-) -- 2.22.3
[PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel
MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Reviewed-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 ++-- include/io/channel.h| 6 ++ io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 47 insertions(+), 10 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 879564aa8a..5afce9a464 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..716235d496 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ -188,6 
+192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); diff --git a/io/channel-buffer.c b/io/channel-buffer.c index bf52011be2..8096180f85 100644 --- a/io/channel-buffer.c +++ b/io/channel-buffer.c @@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, +int flags, Error **errp) { QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); diff --git a/io/channel-command.c b/io/channel-command.c index 74516252ba..e7edd091af 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); diff --git a/io/channel-file.c b/io/channel-file.c index b67687c2aa..d76663e6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelFil
[PATCH v5 2/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrangé --- migration/channel.c | 45 migration/channel.h | 5 + migration/migration.c| 45 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 7 files changed, 90 insertions(+), 26 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index 1b0815039f..270b7acca2 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s, migrate_fd_connect(s, error); error_free(error); } + + +/** + * @migration_channel_read_peek - Read from the peek of migration channel, + *without actually removing it from channel buffer. 
+ * + * @ioc: the channel object + * @buf: the memory region to read data into + * @buflen: the number of bytes to read in @buf + * @errp: pointer to a NULL-initialized error object + * + * Returns 0 if successful, returns -1 and sets @errp if fails. + */ +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ +ssize_t len = 0; +struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + +while (true) { +len = qio_channel_readv_full(ioc, &iov, 1, NULL, + NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + +if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { +error_setg(errp, + "Failed to read peek of channel"); +return -1; +} + +if (len == buflen) { +break; +} + +/* 1ms sleep. */ +if (qemu_in_coroutine()) { +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +g_usleep(1000); +} +} + +return 0; +} diff --git a/migration/channel.h b/migration/channel.h index 67a461c28a..5bdb8208a7 100644 --- a/migration/channel.h +++ b/migration/channel.h @@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, Error *error_in); + +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index f485eea5fb..8509f20a80 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -31,6 +31,7 @@ #include "migration.h" #include "savevm.h" #include "qemu-file.h" +#include "channel.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -733,31 +734,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && 
!migrate_postcopy_ram() && +qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { +/* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, postcopy + * preempt channel does not send any magic number so avoid it for +
Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels
On 23/11/22 9:57 pm, Peter Xu wrote: On Wed, Nov 23, 2022 at 09:28:14PM +0530, manish.mishra wrote: On 23/11/22 9:22 pm, Peter Xu wrote: On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote: +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ + ssize_t len = 0; + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + + while (len < buflen) { + len = qio_channel_readv_full(ioc, &iov, 1, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { This needs to take care of partial len too? sorry Peter, I did not quite understand it. Can you please give some more details. As Daniel pointed out, I think if we peek partially it'll go into a loop. Since we shouldn't read 0 anyway here, maybe you can directly write it as: while (true) { len = read(); if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { error_setg(...); return -1; } if (len == buflen) { break; } /* For either partial peek or QIO_CHANNEL_ERR_BLOCK, retry with timeout */ sleep(); } Yes, got it, sorry, will update it. Thanks Manish Mishra +if (qemu_in_coroutine()) { +/* 1ms sleep. */ +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + error_setg(errp, + "Unexpected end-of-file on channel"); + return -1; + } + if (len < 0) { + return -1; + } + } + + return 0; +} Thanks Manish Mishra
Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels
On 23/11/22 9:28 pm, Daniel P. Berrangé wrote: On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrangé --- migration/channel.c | 46 migration/channel.h | 5 + migration/migration.c| 45 --- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 7 files changed, 91 insertions(+), 26 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index 1b0815039f..a4600f52c5 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -92,3 +92,49 @@ void migration_channel_connect(MigrationState *s, migrate_fd_connect(s, error); error_free(error); } + + +/** + * @migration_channel_read_peek - Read from the peek of migration channel, + *without actually removing it from channel buffer. 
+ * + * @ioc: the channel object + * @buf: the memory region to read data into + * @buflen: the number of bytes to read in @buf + * @errp: pointer to a NULL-initialized error object + * + * Returns 0 if successful, returns -1 and sets @errp if fails. + */ +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ + ssize_t len = 0; + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + + while (len < buflen) { + len = qio_channel_readv_full(ioc, &iov, 1, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +/* 1ms sleep. */ +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + error_setg(errp, + "Unexpected end-of-file on channel"); + return -1; + } + if (len < 0) { + return -1; + } + } This busy waits when len > 0 and < buflen With regards, Daniel Sorry, Daniel, may be i misunderstood something from earlier discussions. I thought we discussed we may not prevent it from looping multiple times but we can qemu_co_sleep_ns after every retry to deal with busy wait? Thanks Manish Mishra
Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels
On 23/11/22 9:22 pm, Peter Xu wrote: On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote: +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ + ssize_t len = 0; + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + + while (len < buflen) { + len = qio_channel_readv_full(ioc, &iov, 1, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { This needs to take care of partial len too? sorry Peter, I did not quite understand it. Can you please give some more details. +if (qemu_in_coroutine()) { +/* 1ms sleep. */ +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + error_setg(errp, + "Unexpected end-of-file on channel"); + return -1; + } + if (len < 0) { + return -1; + } + } + + return 0; +} Thanks Manish Mishra
[PATCH v4 0/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. v4: 1. Removed common *all_eof routines for read peek and added one specific to live migration. 2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield. 3. Some other minor fixes. 
manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +-- include/io/channel.h| 6 io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 ++- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 +++--- migration/channel-block.c | 1 + migration/channel.c | 46 + migration/channel.h | 5 migration/migration.c | 45 migration/multifd.c | 12 +++- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +--- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 22 files changed, 138 insertions(+), 36 deletions(-) -- 2.22.3
[PATCH v4 2/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrangé --- migration/channel.c | 46 migration/channel.h | 5 + migration/migration.c| 45 --- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 7 files changed, 91 insertions(+), 26 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index 1b0815039f..a4600f52c5 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -92,3 +92,49 @@ void migration_channel_connect(MigrationState *s, migrate_fd_connect(s, error); error_free(error); } + + +/** + * @migration_channel_read_peek - Read from the peek of migration channel, + *without actually removing it from channel buffer. 
+ * + * @ioc: the channel object + * @buf: the memory region to read data into + * @buflen: the number of bytes to read in @buf + * @errp: pointer to a NULL-initialized error object + * + * Returns 0 if successful, returns -1 and sets @errp if fails. + */ +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp) +{ + ssize_t len = 0; + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; + + while (len < buflen) { + len = qio_channel_readv_full(ioc, &iov, 1, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +/* 1ms sleep. */ +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + error_setg(errp, + "Unexpected end-of-file on channel"); + return -1; + } + if (len < 0) { + return -1; + } + } + + return 0; +} diff --git a/migration/channel.h b/migration/channel.h index 67a461c28a..5bdb8208a7 100644 --- a/migration/channel.h +++ b/migration/channel.h @@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, Error *error_in); + +int migration_channel_read_peek(QIOChannel *ioc, +const char *buf, +const size_t buflen, +Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index f485eea5fb..8509f20a80 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -31,6 +31,7 @@ #include "migration.h" #include "savevm.h" #include "qemu-file.h" +#include "channel.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -733,31 +734,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have 
multiple) */ +if (migrate_use_multifd() && !migrate_postcopy_ram() && +qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { +/* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, postcopy
[PATCH v4 1/2] io: Add support for MSG_PEEK for socket channel
MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 ++-- include/io/channel.h| 6 ++ io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 17 - io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 16 migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 47 insertions(+), 10 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 879564aa8a..5afce9a464 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..716235d496 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ -188,6 
+192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); diff --git a/io/channel-buffer.c b/io/channel-buffer.c index bf52011be2..8096180f85 100644 --- a/io/channel-buffer.c +++ b/io/channel-buffer.c @@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, +int flags, Error **errp) { QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); diff --git a/io/channel-command.c b/io/channel-command.c index 74516252ba..e7edd091af 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); diff --git a/io/channel-file.c b/io/channel-file.c index b67687c2aa..d76663e6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp) { QIOChannelFi
Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
On 22/11/22 10:03 pm, Peter Xu wrote: On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote: On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote: On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote: On 22/11/22 8:19 pm, Daniel P. Berrangé wrote: On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote: On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote: On 22/11/22 2:30 pm, Daniel P. Berrangé wrote: On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..a06b24766d 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ +qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + This covers the incoming server side socket. This also needs to be set in outgoing client side socket in qio_channel_socket_connect_async Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it. 
@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ - #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, Error **errp) Please remove this unrelated whitespace change. @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); } +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + ssize_t len = 0; + ssize_t total = iov_size(iov, niov); + + while (len < total) { + len = qio_channel_readv_full(ioc, iov, niov, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +qio_channel_yield(ioc, G_IO_IN); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + return 0; + } + if (len < 0) { + return -1; + } + } This will busy wait burning CPU where there is a read > 0 and < total. Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea. How easy would this happen? Another alternative is we could just return the partial len to caller then we fallback to the original channel orders if it happens. And then if it mostly will never happen it'll behave merely the same as what we want. Well we're trying to deal with a bug where the slow and/or unreliable network causes channels to arrive in unexpected order. Given we know we're having network trouble, I wouldn't want to make more assumptions about things happening correctly. With regards, Daniel Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof. *MSG_WAITALL *(since Linux 2.2) This flag requests that the operation block until the full request is satisfied. 
However, the call may still return less data than requested if a signal is caught, an error or disconnect occurs, or the next data to be received is of a different type than that returned. This flag has no effect for datagram sockets. Actual read ahead will be a little hackish, so just confirming we all are in a
Re: [PATCH] multifd: Updated QAPI format for 'migrate' qemu monitor command
On 22/11/22 2:53 pm, Daniel P. Berrangé wrote: On Mon, Nov 21, 2022 at 01:40:27PM +0100, Juan Quintela wrote: Het Gala wrote: To prevent double data encoding of uris, instead of passing transport mechanisms, host address and port all together in form of a single string and writing different parsing functions, we intend the user to explicitly mention most of the parameters through the medium of qmp command itself. The current patch is continuation of patchset series https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mail-2Darchive.com_qemu-2Ddevel-40nongnu.org_msg901274.html&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=htOVZmDRCu27EhvYDu-28snN6GV0a6-ffX34joSwBkELLLVGHzRqD0LicZed3Xtw&s=xvzrWRBN4S5l3orPqu6di0MRq-gSGWZZU6-e7wpn8w4&e= and reply to the ongoing discussion for better QAPI design here https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mail-2Darchive.com_qemu-2Ddevel-40nongnu.org_msg903753.html&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=htOVZmDRCu27EhvYDu-28snN6GV0a6-ffX34joSwBkELLLVGHzRqD0LicZed3Xtw&s=anLqZbhqa73P9SyPUaGk5q8R0SYR6IUtH_FWFML3lZY&e= . Suggested-by: Daniel P. Berrange Suggested-by: Aravind Retnakaran Suggested-by: Manish Mishra Signed-off-by: Het Gala +{ 'struct': 'MigrateChannel', + 'data' : { +'channeltype' : 'MigrateChannelType', +'*src-addr' : 'MigrateAddress', +'dest-addr' : 'MigrateAddress', Why do we want *both* addresses? This is part of their goal to have source based routing, so that traffic exits the src host via a particular NIC. I think this patch would be better if we split it into two parts. First introduce "MigrateChannel" struct with *only* the 'dest-addr' field, and only allow one element in the list, making 'uri' optional. Basically the first patch would *only* be about switching the 'migrate' command from using a plain string to a MigrateAddress structure. 
A second patch would then extend it to allow multiple MigrateChannel elements, to support different destinations for each channel. A third patch would then extend it to add src-addr to attain the source based routing. A fourth patch can then deprecate the existing 'uri' field. +'*multifd-count' : 'int' } } And if we are passing a list, why do we want to pass the real number? Yeah, technically I think this field is redundant, as you can just add many entires to the 'channels' list, using the same addresses. All this field does is allow a more compact JSON arg list. I'm not sure this is neccessary, unless we're expecting huge numbers of 'channels', and even then this isn't likely to be a performance issue. # -> { "execute": "migrate", "arguments": { "uri": "tcp:0:4446" } } # <- { "return": {} } # +# -> { "execute": "migrate", +# "arguments": { +# "channels": [ { 'channeltype': 'control', +# 'dest-addr': {'transport': 'socket', +#'type': 'inet', +#'host': '10.12.34.9', 'port': '1050'}}, +#{ 'channeltype': 'data', +# 'src-addr': {'transport': 'socket', +#'type': 'inet', +#'host': '10.12.34.9', +#'port': '4000', 'ipv4': 'true'}, +# 'dest-addr': { 'transport': 'socket', +# 'type': 'inet', +# 'host': '10.12.34.92', +# 'port': '1234', 'ipv4': 'true'}, +# 'multifd-count': 5 }, +#{ 'channeltype': 'data', +# 'src-addr': {'transport': 'socket', +#'type': 'inet', +#'host': '10.2.3.4', 'port': '1000'}, +# 'dest-addr': {'transport': 'socket', +# 'type': 'inet', +# 'host': '0.0.0.4', 'port': '3000'}, +# 'multifd-count': 3 } ] } } +# <- { "return": {} } +# ## { 'command': 'migrate', - 'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', - '*detach': 'bool', '*resume': 'bool' } } + 'data': {'*uri': 'str', '*channels': ['MigrateChannel'], '*blk': 'bool', I think that "uri" bit should be dropped, right? No, it is required for back compatibility with existing clients. 
It should be marked deprecated, and removed several releases later, at which point 'chanels' can become mandatory instead of optional. + '*inc': 'bool', '*detach': 'bool', '*resume': 'bool' } } ## # @migrate-incoming: I can't see how to make the old one to work on top of this one (i.e. we would have to create strings from lists on QAPI, I think that is just too much). All we need
Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
On 22/11/22 8:19 pm, Daniel P. Berrangé wrote: On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote: On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote: On 22/11/22 2:30 pm, Daniel P. Berrangé wrote: On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..a06b24766d 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ +qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + This covers the incoming server side socket. This also needs to be set in outgoing client side socket in qio_channel_socket_connect_async Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it. @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ - #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, Error **errp) Please remove this unrelated whitespace change. 
@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); } +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + ssize_t len = 0; + ssize_t total = iov_size(iov, niov); + + while (len < total) { + len = qio_channel_readv_full(ioc, iov, niov, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +qio_channel_yield(ioc, G_IO_IN); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + return 0; + } + if (len < 0) { + return -1; + } + } This will busy wait burning CPU where there is a read > 0 and < total. Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea. How easy would this happen? Another alternative is we could just return the partial len to caller then we fallback to the original channel orders if it happens. And then if it mostly will never happen it'll behave merely the same as what we want. Well we're trying to deal with a bug where the slow and/or unreliable network causes channels to arrive in unexpected order. Given we know we're having network trouble, I wouldn't want to make more assumptions about things happening correctly. With regards, Daniel Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof. *MSG_WAITALL *(since Linux 2.2) This flag requests that the operation block until the full request is satisfied. However, the call may still return less data than requested if a signal is caught, an error or disconnect occurs, or the next data to be received is of a different type than that returned. This flag has no effect for datagram sockets. 
Actual read ahead will be a little hackish, so just confirming we are all in agreement to do actual read ahead and I can send a patch? :) Thanks Manish Mishra
Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
On 22/11/22 3:23 pm, Daniel P. Berrangé wrote: On Tue, Nov 22, 2022 at 03:10:53PM +0530, manish.mishra wrote: On 22/11/22 2:59 pm, Daniel P. Berrangé wrote: On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote: On 22/11/22 2:30 pm, Daniel P. Berrangé wrote: On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..a06b24766d 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ +qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + This covers the incoming server side socket. This also needs to be set in outgoing client side socket in qio_channel_socket_connect_async Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it. @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ - #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, Error **errp) Please remove this unrelated whitespace change. 
@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); } +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + ssize_t len = 0; + ssize_t total = iov_size(iov, niov); + + while (len < total) { + len = qio_channel_readv_full(ioc, iov, niov, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +qio_channel_yield(ioc, G_IO_IN); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + return 0; + } + if (len < 0) { + return -1; + } + } This will busy wait burning CPU where there is a read > 0 and < total. Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea. I fear this is an inherant problem with the idea of using PEEK to look at the magic data. If we actually read the magic bytes off the wire, then we could have the same code path for TLS and non-TLS. We would have to modify the existing later code paths though to take account of fact that the magic was already read by an earlier codepath. With regards, Daniel sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we have issue with tls for reason we discussed in V2. Is it okay to send a patch with actual read ahead but not for tls case? tls anyway does not have this bug as it does handshake. I've re-read the previous threads, but I don't see what the problem with TLS is. We already decided that TLS is not affected by the race condition. So there should be no problem in reading the magic bytes early on the TLS channels, while reading the bytes early on a non-TLS channel will fix the race condition. Actually with tls all channels requires handshake to be assumed established, and from source side we do initial qemu_flush only when all channels are established. 
But on destination side we will stuck on reading magic for main channel itself which never comes because source has not flushed data, so no new connections can be established(e.g. multiFD). So basically destination can not accept any new channel until we read from main channel and source is not putting any data on mai
Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
On 22/11/22 2:59 pm, Daniel P. Berrangé wrote: On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote: On 22/11/22 2:30 pm, Daniel P. Berrangé wrote: On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..a06b24766d 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ +qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + This covers the incoming server side socket. This also needs to be set in outgoing client side socket in qio_channel_socket_connect_async Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it. @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ - #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, Error **errp) Please remove this unrelated whitespace change. 
@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); } +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + ssize_t len = 0; + ssize_t total = iov_size(iov, niov); + + while (len < total) { + len = qio_channel_readv_full(ioc, iov, niov, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +qio_channel_yield(ioc, G_IO_IN); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + return 0; + } + if (len < 0) { + return -1; + } + } This will busy wait burning CPU where there is a read > 0 and < total. Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea. I fear this is an inherant problem with the idea of using PEEK to look at the magic data. If we actually read the magic bytes off the wire, then we could have the same code path for TLS and non-TLS. We would have to modify the existing later code paths though to take account of fact that the magic was already read by an earlier codepath. With regards, Daniel sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we have issue with tls for reason we discussed in V2. Is it okay to send a patch with actual read ahead but not for tls case? tls anyway does not have this bug as it does handshake. Thanks Manish Mishra
Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
On 22/11/22 2:30 pm, Daniel P. Berrangé wrote: On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..a06b24766d 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ +qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); + This covers the incoming server side socket. This also needs to be set in outgoing client side socket in qio_channel_socket_connect_async Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it. @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ - #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, Error **errp) Please remove this unrelated whitespace change. 
@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); } +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + ssize_t len = 0; + ssize_t total = iov_size(iov, niov); + + while (len < total) { + len = qio_channel_readv_full(ioc, iov, niov, NULL, +NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + + if (len == QIO_CHANNEL_ERR_BLOCK) { +if (qemu_in_coroutine()) { +qio_channel_yield(ioc, G_IO_IN); +} else { +qio_channel_wait(ioc, G_IO_IN); +} +continue; + } + if (len == 0) { + return 0; + } + if (len < 0) { + return -1; + } + } This will busy wait burning CPU where there is a read > 0 and < total. Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea. With regards, Daniel Thanks Manish Mishra
Re: [PATCH 1/2] io: Add support for MSG_PEEK for socket channel
On 19/11/22 3:06 pm, manish.mishra wrote: MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 879564aa8a..5afce9a464 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..cbcde4b88f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); 
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); @@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, size_t niov, Error **errp); +/** + * qio_channel_readv_peek_all_eof: + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @errp: pointer to a NULL-initialized error object + * + * Read data from the peek of IO channel without + * actually removing it from channel buffer, storing + * it in the memory regions referenced by @iov. Each + * element in the @iov will be fully populated with + * data before the next one is used. The @niov + * parameter specifies the total number of elements + * in @iov. + * + * The function will wait for all requested data + * to be read, yielding from the current coroutine + * if required. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp); + + /** * qio_channel_readv_all: * @ioc: the channel object @@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc, Error **errp); +/** + * qio_channel_readv_peek_all: + * @ioc: the channel object + * @iov: the array of memory regions to read dat
check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifd or post-copy preemption channel. This may not always be true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already do a tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains unaffected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. v3: 1. Split change in two patches, io patch for read_peek routines, migration patch for migration related changes. 2. Add flags to io_readv calls to get extra read flags like MSG_PEEK. 3. Some other minor fixes. 
manish.mishra (2): io: Add support for MSG_PEEK for socket channel migration: check magic value for deciding the mapping of channels chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + migration/migration.c | 44 ++- migration/multifd.c | 12 ++--- migration/multifd.h | 2 +- migration/postcopy-ram.c| 5 +- migration/postcopy-ram.h| 2 +- scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 20 files changed, 218 insertions(+), 37 deletions(-) -- 2.22.3
[PATCH v3 2/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. 
Berrangé --- migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 5 files changed, 39 insertions(+), 26 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..787e678d48 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migrate_postcopy_ram() && +qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { +/* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, postcopy + * preempt channel does not send any magic number so avoid it for + * postcopy live migration. Also tls live migration already does + * tls handshake while initializing main channel so with tls this + * issue is not possible. + */ +ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, +sizeof(channel_magic), &local_err); + +if (ret != 0) { +error_propagate(errp, local_err); +return; +} + +default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); +} else { +default_channel = !mis->from_src_file; +} + +if (default_channel) { f = qemu_file_new_input(ioc); if (!migration_incoming_setup(f, errp)) { return; } - -/* - * Common migration only needs one channel, so we can start - * right now. Some features need more than one channel, we wait. 
- */ -start_migration = !migration_needs_multiple_sockets(); } else { /* Multiple connections */ assert(migration_needs_multiple_sockets()); if (migrate_use_multifd()) { -start_migration = multifd_recv_new_channel(ioc, &local_err); +multifd_recv_new_channel(ioc, &local_err); } else { assert(migrate_postcopy_preempt()); f = qemu_file_new_input(ioc); -start_migration = postcopy_preempt_new_channel(mis, f); +postcopy_preempt_new_channel(mis, f); } if (local_err) { error_propagate(errp, local_err); @@ -765,7 +785,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } } -if (start_migration) { +if (migration_has_all_channels()) { /* If it's a recovery, we're done */ if (postcopy_try_recover()) { return; diff --git a/migration/multifd.c b/migration/multifd.c index 586ddc9d65..be86a4d07f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1220,11 +1220,9 @@ bool multifd_recv_all_channels_created(void) /* * Try to receive all multifd channels to get ready for the migration. - * - Return true and do not set @errp when correctly receiving all channels; - * - Return false and do not set @errp when correctly receiving the current one; - * - Return false and set @errp when failing to receive the current channel. + * Sets @errp when failing to receive the current channel. */ -bool multifd_recv_new_channel(QIOChannel
[PATCH 2/2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses read peek to check the magic number of channels so that current data/control stream management remains un-effected. 
--- migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 5 files changed, 39 insertions(+), 26 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..787e678d48 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migrate_postcopy_ram() && +qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { +/* + * With multiple channels, it is possible that we receive channels + * out of order on destination side, causing incorrect mapping of + * source channels on destination side. Check channel MAGIC to + * decide type of channel. Please note this is best effort, postcopy + * preempt channel does not send any magic number so avoid it for + * postcopy live migration. Also tls live migration already does + * tls handshake while initializing main channel so with tls this + * issue is not possible. + */ +ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, +sizeof(channel_magic), &local_err); + +if (ret != 0) { +error_propagate(errp, local_err); +return; +} + +default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); +} else { +default_channel = !mis->from_src_file; +} + +if (default_channel) { f = qemu_file_new_input(ioc); if (!migration_incoming_setup(f, errp)) { return; } - -/* - * Common migration only needs one channel, so we can start - * right now. Some features need more than one channel, we wait. 
- */ -start_migration = !migration_needs_multiple_sockets(); } else { /* Multiple connections */ assert(migration_needs_multiple_sockets()); if (migrate_use_multifd()) { -start_migration = multifd_recv_new_channel(ioc, &local_err); +multifd_recv_new_channel(ioc, &local_err); } else { assert(migrate_postcopy_preempt()); f = qemu_file_new_input(ioc); -start_migration = postcopy_preempt_new_channel(mis, f); +postcopy_preempt_new_channel(mis, f); } if (local_err) { error_propagate(errp, local_err); @@ -765,7 +785,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } } -if (start_migration) { +if (migration_has_all_channels()) { /* If it's a recovery, we're done */ if (postcopy_try_recover()) { return; diff --git a/migration/multifd.c b/migration/multifd.c index 586ddc9d65..be86a4d07f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1220,11 +1220,9 @@ bool multifd_recv_all_channels_created(void) /* * Try to receive all multifd channels to get ready for the migration. - * - Return true and do not set @errp when correctly receiving all channels; - * - Return false and do not set @errp when correctly receiving the current one; - * - Return false and set @errp when failing to receive the current channel. + * Sets @errp when failing to receive the current channel. */ -bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp) +void multifd_
[PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel
MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. Suggested-by: Daniel P. Berrangé --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 879564aa8a..5afce9a464 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..cbcde4b88f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ 
-188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); @@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, size_t niov, Error **errp); +/** + * qio_channel_readv_peek_all_eof: + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @errp: pointer to a NULL-initialized error object + * + * Read data from the peek of IO channel without + * actually removing it from channel buffer, storing + * it in the memory regions referenced by @iov. Each + * element in the @iov will be fully populated with + * data before the next one is used. The @niov + * parameter specifies the total number of elements + * in @iov. + * + * The function will wait for all requested data + * to be read, yielding from the current coroutine + * if required. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp); + + /** * qio_channel_readv_all: * @ioc: the channel object @@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc, Error **errp); +/** + * qio_channel_readv_peek_all: + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @errp: pointer to a NULL-initialized error object + * + * Read data
[PATCH 1/2] io: Add support for MSG_PEEK for socket channel
MSG_PEEK reads from the peek of channel, The data is treated as unread and the next read shall still return this data. This support is currently added only for socket class. Extra parameter 'flags' is added to io_readv calls to pass extra read flags like MSG_PEEK. --- chardev/char-socket.c | 4 +- include/io/channel.h| 83 + io/channel-buffer.c | 1 + io/channel-command.c| 1 + io/channel-file.c | 1 + io/channel-null.c | 1 + io/channel-socket.c | 16 +- io/channel-tls.c| 1 + io/channel-websock.c| 1 + io/channel.c| 73 +++-- migration/channel-block.c | 1 + scsi/qemu-pr-helper.c | 2 +- tests/qtest/tpm-emu.c | 2 +- tests/unit/test-io-channel-socket.c | 1 + util/vhost-user-server.c| 2 +- 15 files changed, 179 insertions(+), 11 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 879564aa8a..5afce9a464 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - NULL); + 0, NULL); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - NULL); + 0, NULL); } if (msgfds_num) { diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..cbcde4b88f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1 + typedef enum QIOChannelFeature QIOChannelFeature; enum QIOChannelFeature { @@ -41,6 +43,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_SHUTDOWN, QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, +QIO_CHANNEL_FEATURE_READ_MSG_PEEK, }; @@ -114,6 +117,7 @@ struct QIOChannelClass { size_t niov, int **fds, size_t *nfds, +int flags, Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); @@ -188,6 +192,7 @@ void 
qio_channel_set_name(QIOChannel *ioc, * @niov: the length of the @iov array * @fds: pointer to an array that will received file handles * @nfds: pointer filled with number of elements in @fds on return + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*) * @errp: pointer to a NULL-initialized error object * * Read data from the IO channel, storing it in the @@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, size_t niov, int **fds, size_t *nfds, + int flags, Error **errp); @@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, size_t niov, Error **errp); +/** + * qio_channel_readv_peek_all_eof: + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @errp: pointer to a NULL-initialized error object + * + * Read data from the peek of IO channel without + * actually removing it from channel buffer, storing + * it in the memory regions referenced by @iov. Each + * element in the @iov will be fully populated with + * data before the next one is used. The @niov + * parameter specifies the total number of elements + * in @iov. + * + * The function will wait for all requested data + * to be read, yielding from the current coroutine + * if required. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_readv_peek_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp); + + /** * qio_channel_readv_all: * @ioc: the channel object @@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc, Error **errp); +/** + * qio_channel_readv_peek_all: + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @errp: pointer to a NULL-initialized error object + * + * Read data from the the peek of IO channel wi
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
On 16/11/22 4:57 pm, Daniel P. Berrangé wrote: On Wed, Nov 16, 2022 at 04:49:18PM +0530, manish.mishra wrote: On 16/11/22 12:20 am, Daniel P. Berrangé wrote: On Tue, Nov 15, 2022 at 06:11:30PM +, Daniel P. Berrangé wrote: On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. 
--- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) This should be two commits, because the 'io' and 'migration' code are two separate subsystems in QEMU. diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); This API should be called "io_read_peekv" and use "const struct iovec *iov", such that is matches the design of 'io_readv'. There should also be a QIOChannelFeature flag registered to indicate whether a given channel impl supports peeking at data. @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); There should be qio_channel_read_peek, qio_channel_read_peekv, qio_channel_read_peek_all and qio_channel_read_peekv_all. Actually ignore that. We should not add any new APIs at all. Instead the io_readv callback, and the qio_channel_read*all() methods should gain a 'int flags' parameter, in the same way that the write methods have one. 
Then there should be a QIO_CHANNEL_READ_FLAG_PEEK constant defined. Hi Daniel, As MSG_PEEK always reads from the top even if there were previous partial reads, the current qio_channel_readv_all_eof may not work? I can keep things up to qio_channel_readv as common for both with flags parameters but have a separate qio_channel_readv_peek_all_eof? Does something like that work? Simplest is probably to just not add 'flags' to the 'all' variants, just the non-'all' variants. sure Daniel, will do that. Thanks. With regards, Daniel
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
On 16/11/22 12:20 am, Daniel P. Berrangé wrote: On Tue, Nov 15, 2022 at 06:11:30PM +, Daniel P. Berrangé wrote: On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. 
--- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) This should be two commits, because the 'io' and 'migration' code are two separate subsystems in QEMU. diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); This API should be called "io_read_peekv" and use "const struct iovec *iov", such that is matches the design of 'io_readv'. There should also be a QIOChannelFeature flag registered to indicate whether a given channel impl supports peeking at data. @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); There should be qio_channel_read_peek, qio_channel_read_peekv, qio_channel_read_peek_all and qio_channel_read_peekv_all. Actually ignore that. We should not add any new APIs at all. Instead the io_readv callback, and the qio_channel_read*all() methods should gain a 'int flags' parameter, in the same way that the write methods have one. 
Then there should be a QIO_CHANNEL_READ_FLAG_PEEK constant defined. Hi Daniel, As MSG_PEEK always reads from the top even if there were previous partial reads, the current qio_channel_readv_all_eof may not work? I can keep things up to qio_channel_readv as common for both with flags parameters but have a separate qio_channel_readv_peek_all_eof? Does something like that work? thanks Manish Mishra With regards, Daniel
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
On 15/11/22 11:06 pm, Peter Xu wrote: Hi, Manish, On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. 
--- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_r
Re: [PATCH] migration: check magic value for deciding the mapping of channels
Thanks Peter On 14/11/22 10:21 pm, Peter Xu wrote: Manish, On Thu, Nov 03, 2022 at 11:47:51PM +0530, manish.mishra wrote: Yes, but if we try to read early on main channel with tls enabled case it is an issue. Sorry i may not have put above comment clearly. I will try to put the scenario step wise. 1. main channel is created and tls handshake is done for main channel. 2. Destination side tries to read magic early on main channel in migration_ioc_process_incoming but it is not yet sent by source. 3. Source has written magic to main channel file buffer but it is not yet flushed, it is flushed first time in ram_save_setup, i mean data is sent on channel only if qemu file buffer is full or explicitly flushed. 4. Source side blocks on multifd_send_sync_main in ram_save_setup before flushing qemu file. But multifd_send_sync_main is blocked for sem_sync until handshake is done for multiFD channels. 5. Destination side is still waiting for reading magic on main channel, so unless we return from migration_ioc_process_incoming we can not accept new channel, so handshake of multiFD channel is blocked. 6. So basically source is blocked on multiFD channels handshake before sending data on main channel, but destination is blocked waiting for data before it can acknowledge multiFD channels and do handshake, so it kind of creates a deadlock situation. Why is this issue only happening with TLS? It sounds like it'll happen as long as multifd enabled. Actually this was happening with tls because with tls we do handshake, so a connection is assumed established only after a tls handshake and we flush data from source only after all channels are established, but with normal live migration even if connection is not accepted on destination side we can continue as we do not do any handshake. Basically in normal live migration a connection is assumed established if connect() call was successful even if it is not accepted/ack by destination, so that's why this deadlock was not happening.
I'm also thinking whether we should flush in qemu_savevm_state_header() so at least upgraded src qemu will always flush the headers if it never hurts. yes sure Peter. Thanks Manish Mishra
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
On 11/11/22 4:17 am, Peter Xu wrote: On Thu, Nov 10, 2022 at 05:59:45PM +0530, manish.mishra wrote: Hi Everyone, Just a gentle reminder for review. :) Hi, Manish, I've got a slightly busy week, sorry! If Daniel and Juan won't have time to look at it I'll have a closer look at it next Monday (holiday tomorrow). Yes sure Peter, Thank you and Happy Holiday :).
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
Hi Everyone, Just a gentle reminder for review. :) Thanks Manish Mishra On 07/11/22 10:21 pm, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. 
--- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_r
Re: [PATCH v2] migration: check magic value for deciding the mapping of channels
On 07/11/22 10:21 pm, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. Hi Daniel, This is what i concluded from discussion on V1, if this is not what you expected, please let me know, we can continue discussion on V1 thread. 
Thanks Manish Mishra --- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_write
[PATCH v2] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the main channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, tls live migrations already does tls handshake before creating other channels, so this issue is not possible with tls, hence this logic is avoided for tls live migrations. This patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Suggested-by: Daniel P. Berrangé Signed-off-by: manish.mishra v2: TLS does not support MSG_PEEK, so V1 was broken for tls live migrations. For tls live migration, while initializing main channel tls handshake is done before we can create other channels, so this issue is not possible for tls live migrations. In V2 added a check to avoid checking magic number for tls live migration and fallback to older method to decide mapping of channels on destination side. 
--- include/io/channel.h | 25 +++ io/channel-socket.c | 27 io/channel.c | 39 +++ migration/migration.c| 44 +--- migration/multifd.c | 12 --- migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 130 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_clo
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 03/11/22 11:47 pm, manish.mishra wrote: On 03/11/22 11:27 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 11:06:23PM +0530, manish.mishra wrote: On 03/11/22 10:57 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote: On 03/11/22 2:59 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote: On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f4b6f278a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; - bool start_migration; QEMUFile *f; + bool default_channel = true; + uint32_t channel_magic = 0; + int ret = 0; - if (!mis->from_src_file) { - /* The first connection (multifd may have multiple) */ + if (migrate_use_multifd() && !migration_in_postcopy()) { + ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, + sizeof(channel_magic), &local_err); + + if (ret != 1) { + error_propagate(errp, local_err); + return; + } and thus this will fail for TLS channels AFAICT. Yes, thanks for quick review Daniel. You pointed this earlier too, sorry missed it, will put another check !migrate_use_tls() in V2. But we need this problem fixed with TLS too, so just excluding it isn't right. IMHO we need to modify the migration code so we can read the magic earlier, instead of peeking. With regards, Daniel Hi Daniel, I was trying tls migrations. What i see is that tls session creation does handshake. So if we read ahead in ioc_process_incoming for default channel. Because client sends magic only after multiFD channels are setup, which too requires tls handshake. 
By the time we get to migrate_ioc_process_incoming, the TLS handshake has already been performed. migration_channel_process_incoming -> migration_ioc_process_incoming vs migration_channel_process_incoming -> migration_tls_channel_process_incoming -> migration_tls_incoming_handshake -> migration_channel_process_incoming -> migration_ioc_process_incoming Yes sorry i thought we block on source side till handshake is done but that is not true. I checked then why that deadlock is happening. So this where the dealock is happening. static int ram_save_setup(QEMUFile *f, void *opaque) { + + ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); qemu_fflush(f); return 0; } Now if we block in migration_ioc_process_incoming for reading magic value from channel, which is actually sent by client when this qemu_fflush is done. Before this qemu_fflush we wait for multifd_send_sync_main which actually requires that tls handshake is done for multiFD channels as it blocks on sem_sync which posted by multifd_send_thread which is called after handshake||. But then on destination side we are blocked in migration_ioc_process_incoming() waiting to read something from default channel hence handshake for multiFD channels can not happen. This to me looks unresolvable whatever way we try to manipulate stream until we do some changes on source side. The TLS handshake is already complete when migration_ioc_process_incoming is blocking on read. Regardless of which channel we're talking about, thue TLS handshake is always performed & finished before we try to either send or recv any data. Yes Daniel, agree on that, in this case tls handshake is done for default channel so we went in migration_ioc_process_incoming for default channel. But if we try to read some data there, it does not get because data on default channel is not yet flushed by source. 
From source side data in flushed in above function i pointed. Which blocks for multiFD channels to be setup with handshake, before flushing data. I mean data is sent only when buffer is full or explicitly flushed, till then it is just in buffer. But multiFD handshake can not complete until we return from migration_ioc_process_incoming for default channel which infintely waits because nothing is sent yet on channel. On the source side, if we're in ram_save_setup then the TLS handshake is already complete for the main channel. In fact I think the TLS handshake should act as a serialization point that prevents the entire bug. It should be guaranteed that the main channel is fully received b
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 03/11/22 11:47 pm, manish.mishra wrote: On 03/11/22 11:27 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 11:06:23PM +0530, manish.mishra wrote: On 03/11/22 10:57 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote: On 03/11/22 2:59 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote: On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f4b6f278a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; - bool start_migration; QEMUFile *f; + bool default_channel = true; + uint32_t channel_magic = 0; + int ret = 0; - if (!mis->from_src_file) { - /* The first connection (multifd may have multiple) */ + if (migrate_use_multifd() && !migration_in_postcopy()) { + ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, + sizeof(channel_magic), &local_err); + + if (ret != 1) { + error_propagate(errp, local_err); + return; + } and thus this will fail for TLS channels AFAICT. Yes, thanks for quick review Daniel. You pointed this earlier too, sorry missed it, will put another check !migrate_use_tls() in V2. But we need this problem fixed with TLS too, so just excluding it isn't right. IMHO we need to modify the migration code so we can read the magic earlier, instead of peeking. With regards, Daniel Hi Daniel, I was trying tls migrations. What i see is that tls session creation does handshake. So if we read ahead in ioc_process_incoming for default channel. Because client sends magic only after multiFD channels are setup, which too requires tls handshake. 
By the time we get to migrate_ioc_process_incoming, the TLS handshake has already been performed. migration_channel_process_incoming -> migration_ioc_process_incoming vs migration_channel_process_incoming -> migration_tls_channel_process_incoming -> migration_tls_incoming_handshake -> migration_channel_process_incoming -> migration_ioc_process_incoming Yes sorry i thought we block on source side till handshake is done but that is not true. I checked then why that deadlock is happening. So this where the dealock is happening. static int ram_save_setup(QEMUFile *f, void *opaque) { + + ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); qemu_fflush(f); return 0; } Now if we block in migration_ioc_process_incoming for reading magic value from channel, which is actually sent by client when this qemu_fflush is done. Before this qemu_fflush we wait for multifd_send_sync_main which actually requires that tls handshake is done for multiFD channels as it blocks on sem_sync which posted by multifd_send_thread which is called after handshake||. But then on destination side we are blocked in migration_ioc_process_incoming() waiting to read something from default channel hence handshake for multiFD channels can not happen. This to me looks unresolvable whatever way we try to manipulate stream until we do some changes on source side. The TLS handshake is already complete when migration_ioc_process_incoming is blocking on read. Regardless of which channel we're talking about, thue TLS handshake is always performed & finished before we try to either send or recv any data. Yes Daniel, agree on that, in this case tls handshake is done for default channel so we went in migration_ioc_process_incoming for default channel. But if we try to read some data there, it does not get because data on default channel is not yet flushed by source. 
From source side data in flushed in above function i pointed. Which blocks for multiFD channels to be setup with handshake, before flushing data. I mean data is sent only when buffer is full or explicitly flushed, till then it is just in buffer. But multiFD handshake can not complete until we return from migration_ioc_process_incoming for default channel which infintely waits because nothing is sent yet on channel. On the source side, if we're in ram_save_setup then the TLS handshake is already complete for the main channel. In fact I think the TLS handshake should act as a serialization point that prevents the entire bug. It should be guaranteed that the main channel is fully received b
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 03/11/22 11:27 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 11:06:23PM +0530, manish.mishra wrote: On 03/11/22 10:57 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote: On 03/11/22 2:59 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote: On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f4b6f278a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migration_in_postcopy()) { +ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, +sizeof(channel_magic), &local_err); + +if (ret != 1) { +error_propagate(errp, local_err); +return; +} and thus this will fail for TLS channels AFAICT. Yes, thanks for quick review Daniel. You pointed this earlier too, sorry missed it, will put another check !migrate_use_tls() in V2. But we need this problem fixed with TLS too, so just excluding it isn't right. IMHO we need to modify the migration code so we can read the magic earlier, instead of peeking. With regards, Daniel Hi Daniel, I was trying tls migrations. What i see is that tls session creation does handshake. So if we read ahead in ioc_process_incoming for default channel. Because client sends magic only after multiFD channels are setup, which too requires tls handshake. 
By the time we get to migrate_ioc_process_incoming, the TLS handshake has already been performed. migration_channel_process_incoming -> migration_ioc_process_incoming vs migration_channel_process_incoming -> migration_tls_channel_process_incoming -> migration_tls_incoming_handshake -> migration_channel_process_incoming -> migration_ioc_process_incoming Yes sorry i thought we block on source side till handshake is done but that is not true. I checked then why that deadlock is happening. So this where the dealock is happening. static int ram_save_setup(QEMUFile *f, void *opaque) { + + ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); qemu_fflush(f); return 0; } Now if we block in migration_ioc_process_incoming for reading magic value from channel, which is actually sent by client when this qemu_fflush is done. Before this qemu_fflush we wait for multifd_send_sync_main which actually requires that tls handshake is done for multiFD channels as it blocks on sem_sync which posted by multifd_send_thread which is called after handshake||. But then on destination side we are blocked in migration_ioc_process_incoming() waiting to read something from default channel hence handshake for multiFD channels can not happen. This to me looks unresolvable whatever way we try to manipulate stream until we do some changes on source side. The TLS handshake is already complete when migration_ioc_process_incoming is blocking on read. Regardless of which channel we're talking about, thue TLS handshake is always performed & finished before we try to either send or recv any data. Yes Daniel, agree on that, in this case tls handshake is done for default channel so we went in migration_ioc_process_incoming for default channel. But if we try to read some data there, it does not get because data on default channel is not yet flushed by source. 
From source side data in flushed in above function i pointed. Which blocks for multiFD channels to be setup with handshake, before flushing data. I mean data is sent only when buffer is full or explicitly flushed, till then it is just in buffer. But multiFD handshake can not complete until we return from migration_ioc_process_incoming for default channel which infintely waits because nothing is sent yet on channel. On the source side, if we're in ram_save_setup then the TLS handshake is already complete for the main channel. In fact I think the TLS handshake should act as a serialization point that prevents the entire bug. It should be guaranteed that the main channel is fully received b
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 03/11/22 10:57 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote: On 03/11/22 2:59 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote: On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f4b6f278a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migration_in_postcopy()) { +ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, +sizeof(channel_magic), &local_err); + +if (ret != 1) { +error_propagate(errp, local_err); +return; +} and thus this will fail for TLS channels AFAICT. Yes, thanks for quick review Daniel. You pointed this earlier too, sorry missed it, will put another check !migrate_use_tls() in V2. But we need this problem fixed with TLS too, so just excluding it isn't right. IMHO we need to modify the migration code so we can read the magic earlier, instead of peeking. With regards, Daniel Hi Daniel, I was trying tls migrations. What i see is that tls session creation does handshake. So if we read ahead in ioc_process_incoming for default channel. Because client sends magic only after multiFD channels are setup, which too requires tls handshake. By the time we get to migrate_ioc_process_incoming, the TLS handshake has already been performed. 
migration_channel_process_incoming -> migration_ioc_process_incoming vs migration_channel_process_incoming -> migration_tls_channel_process_incoming -> migration_tls_incoming_handshake -> migration_channel_process_incoming -> migration_ioc_process_incoming Yes, sorry, I thought we block on source side till handshake is done but that is not true. I checked then why that deadlock is happening. So this is where the deadlock is happening. static int ram_save_setup(QEMUFile *f, void *opaque) { + + ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); qemu_fflush(f); return 0; } Now if we block in migration_ioc_process_incoming for reading magic value from channel, which is actually sent by client when this qemu_fflush is done. Before this qemu_fflush we wait for multifd_send_sync_main which actually requires that tls handshake is done for multiFD channels as it blocks on sem_sync which is posted by multifd_send_thread which is called after handshake. But then on destination side we are blocked in migration_ioc_process_incoming() waiting to read something from default channel hence handshake for multiFD channels cannot happen. This to me looks unresolvable whatever way we try to manipulate stream until we do some changes on source side. The TLS handshake is already complete when migration_ioc_process_incoming is blocking on read. Regardless of which channel we're talking about, the TLS handshake is always performed & finished before we try to either send or recv any data. With regards, Daniel Yes Daniel, agree on that, in this case tls handshake is done for default channel so we went in migration_ioc_process_incoming for default channel. But if we try to read some data there, it does not get because data on default channel is not yet flushed by source. From the source side, data is flushed in the above function I pointed to. 
Which blocks for multiFD channels to be set up with handshake, before flushing data. I mean data is sent only when the buffer is full or explicitly flushed; till then it is just in the buffer. But the multiFD handshake cannot complete until we return from migration_ioc_process_incoming for the default channel, which infinitely waits because nothing is sent yet on the channel. Thanks Manish Mishra
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 03/11/22 2:59 pm, Daniel P. Berrangé wrote: On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote: On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f4b6f278a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; -bool start_migration; QEMUFile *f; +bool default_channel = true; +uint32_t channel_magic = 0; +int ret = 0; -if (!mis->from_src_file) { -/* The first connection (multifd may have multiple) */ +if (migrate_use_multifd() && !migration_in_postcopy()) { +ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic, +sizeof(channel_magic), &local_err); + +if (ret != 1) { +error_propagate(errp, local_err); +return; +} and thus this will fail for TLS channels AFAICT. Yes, thanks for quick review Daniel. You pointed this earlier too, sorry missed it, will put another check !migrate_use_tls() in V2. But we need this problem fixed with TLS too, so just excluding it isn't right. IMHO we need to modify the migration code so we can read the magic earlier, instead of peeking. With regards, Daniel Hi Daniel, I was trying tls migrations. What i see is that tls session creation does handshake. So if we read ahead in ioc_process_incoming for default channel. Because client sends magic only after multiFD channels are setup, which too requires tls handshake. By the time we get to migrate_ioc_process_incoming, the TLS handshake has already been performed. 
migration_channel_process_incoming -> migration_ioc_process_incoming vs migration_channel_process_incoming -> migration_tls_channel_process_incoming -> migration_tls_incoming_handshake -> migration_channel_process_incoming -> migration_ioc_process_incoming Yes, sorry, I thought we block on source side till handshake is done but that is not true. I checked then why that deadlock is happening. So this is where the deadlock is happening. static int ram_save_setup(QEMUFile *f, void *opaque) { + + ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); qemu_fflush(f); return 0; } Now if we block in migration_ioc_process_incoming for reading magic value from channel, which is actually sent by client when this qemu_fflush is done. Before this qemu_fflush we wait for multifd_send_sync_main which actually requires that tls handshake is done for multiFD channels as it blocks on sem_sync which is posted by multifd_send_thread which is called after handshake. But then on destination side we are blocked in migration_ioc_process_incoming() waiting to read something from default channel hence handshake for multiFD channels cannot happen. This to me looks unresolvable whatever way we try to manipulate stream until we do some changes on source side. With regards, Daniel
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the default channel followed by the multifd or post-copy preemption channel. This may not always be true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out-of-order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, this patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains unaffected. 
Signed-off-by: manish.mishra --- include/io/channel.h | 25 + io/channel-socket.c | 27 +++ io/channel.c | 39 +++ migration/migration.c| 33 + migration/multifd.c | 12 migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 119 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_close; ioc_klass->io_shutdown = qio_channel_socket_shutdown; diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a2d9b96f3f 100644 --- a/io/channel.c +++ b/io/channel.c @@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc, return qio_channel_writev_all(ioc, &iov, 1, errp); } +int qio_channel_read_peek_all(QIOChannel *ioc, +
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 01/11/22 9:15 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote: On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the default channel followed by the multifd or post-copy preemption channel. This may not always be true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out-of-order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, this patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains unaffected. 
Signed-off-by: manish.mishra --- include/io/channel.h | 25 + io/channel-socket.c | 27 +++ io/channel.c | 39 +++ migration/migration.c| 33 + migration/multifd.c | 12 migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 119 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_close; ioc_klass->io_shutdown = qio_channel_socket_shutdown; diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a2d9b96f3f 100644 --- a/io/channel.c +++ b/io/channel.c @@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc, return qio_channel_writev_all(ioc, &iov, 1, errp); } +int qio_channel_read_peek_all(QIOChannel *ioc, +
Re: [PATCH] migration: check magic value for deciding the mapping of channels
On 01/11/22 8:21 pm, Daniel P. Berrangé wrote: On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the default channel followed by the multifd or post-copy preemption channel. This may not always be true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out-of-order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, this patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains unaffected. 
Signed-off-by: manish.mishra --- include/io/channel.h | 25 + io/channel-socket.c | 27 +++ io/channel.c | 39 +++ migration/migration.c| 33 + migration/multifd.c | 12 migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 119 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_close; ioc_klass->io_shutdown = qio_channel_socket_shutdown; diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a2d9b96f3f 100644 --- a/io/channel.c +++ b/io/channel.c @@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc, return qio_channel_writev_all(ioc, &iov, 1, errp); } +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error
Re: [PATCH] migration: check magic value for deciding the mapping of channels
Sorry for late patch on this. I mentioned i will send it last week itself, but later reliased it was festival week in India, so was mostly holidays. Thanks Manish Mishra On 01/11/22 8:00 pm, manish.mishra wrote: Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the default channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, this patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. 
Signed-off-by: manish.mishra --- include/io/channel.h | 25 + io/channel-socket.c | 27 +++ io/channel.c | 39 +++ migration/migration.c| 33 + migration/multifd.c | 12 migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 119 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. 
+ * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_close; ioc_klass->io_shutdown = qio_channel_socket_shutdown; diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a2d9b96f3f 100644 --- a/io/channel.c +++ b/io/channel.c @@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc, return qio_channel_writev_all(ioc, &iov, 1, errp); } +int qio_channel_read_peek_all(QIOChannel *ioc, +
[PATCH] migration: check magic value for deciding the mapping of channels
Current logic assumes that channel connections on the destination side are always established in the same order as the source and the first one will always be the default channel followed by the multifid or post-copy preemption channel. This may not be always true, as even if a channel has a connection established on the source side it can be in the pending state on the destination side and a newer connection can be established first. Basically causing out of order mapping of channels on the destination side. Currently, all channels except post-copy preempt send a magic number, this patch uses that magic number to decide the type of channel. This logic is applicable only for precopy(multifd) live migration, as mentioned, the post-copy preempt channel does not send any magic number. Also, this patch uses MSG_PEEK to check the magic number of channels so that current data/control stream management remains un-effected. Signed-off-by: manish.mishra --- include/io/channel.h | 25 + io/channel-socket.c | 27 +++ io/channel.c | 39 +++ migration/migration.c| 33 + migration/multifd.c | 12 migration/multifd.h | 2 +- migration/postcopy-ram.c | 5 + migration/postcopy-ram.h | 2 +- 8 files changed, 119 insertions(+), 26 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index c680ee7480..74177aeeea 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -115,6 +115,10 @@ struct QIOChannelClass { int **fds, size_t *nfds, Error **errp); +ssize_t (*io_read_peek)(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp); int (*io_close)(QIOChannel *ioc, Error **errp); GSource * (*io_create_watch)(QIOChannel *ioc, @@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc, size_t buflen, Error **errp); +/** + * qio_channel_read_peek_all: + * @ioc: the channel object + * @buf: the memory region to read in data + * @nbytes: the number of bytes to read + * @errp: pointer to a NULL-initialized error object + * + * Read given @nbytes data from peek of 
channel into + * memory region @buf. + * + * The function will be blocked until read size is + * equal to requested size. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp); + /** * qio_channel_set_blocking: * @ioc: the channel object diff --git a/io/channel-socket.c b/io/channel-socket.c index b76dca9cc1..b99f5dfda6 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, } #endif /* WIN32 */ +static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc, +void *buf, +size_t nbytes, +Error **errp) +{ +QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +ssize_t bytes = 0; + +retry: +bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK); + +if (bytes < 0) { +if (errno == EINTR) { +goto retry; +} +if (errno == EAGAIN) { +return QIO_CHANNEL_ERR_BLOCK; +} + +error_setg_errno(errp, errno, + "Unable to read from peek of socket"); +return -1; +} + +return bytes; +} #ifdef QEMU_MSG_ZEROCOPY static int qio_channel_socket_flush(QIOChannel *ioc, @@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_socket_writev; ioc_klass->io_readv = qio_channel_socket_readv; +ioc_klass->io_read_peek = qio_channel_socket_read_peek; ioc_klass->io_set_blocking = qio_channel_socket_set_blocking; ioc_klass->io_close = qio_channel_socket_close; ioc_klass->io_shutdown = qio_channel_socket_shutdown; diff --git a/io/channel.c b/io/channel.c index 0640941ac5..a2d9b96f3f 100644 --- a/io/channel.c +++ b/io/channel.c @@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc, return qio_channel_writev_all(ioc, &iov, 1, errp); } +int qio_channel_read_peek_all(QIOChannel *ioc, + void* buf, + size_t nbytes, + Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + ssize_t bytes = 0; + + if 
(!klass->io_read_peek) { + error_setg(
Re: MultiFD and default channel out of order mapping on receive side.
On 21/10/22 3:37 am, Daniel P. Berrangé wrote: On Thu, Oct 20, 2022 at 12:32:06PM -0400, Peter Xu wrote: On Thu, Oct 20, 2022 at 08:14:19PM +0530, manish.mishra wrote: I had one concern, during recover we do not send any magic. As of now we do not support multifd with postcopy so it should be fine, we can do explict checking for non-recovery case. But i remember from some discussion in future there may be support for multiFD with postcopy or have multiple postcopy preempt channels too, then proper handshake will be required? So at some point we want to take that path? For now i agree approach 1 will be good as suggested by Daniel it can be backported easily to older qemu's too. Yes for the long run I think we should provide a generic solution for all the channels to be established for migration purpose. Not to mention that as I replied previously to my original email, the trick won't easily work with dest QEMU where we need further change to allow qemu to accept new channels during loading of the VM. Considering the complexity that it'll take just to resolve the prempt channel ordering, I think maybe it's cleaner we just look for the long term goal. I think we should just ignore the preempt channel. Lets just do the easy bit and fix the main vs multifd channel mixup, as that's the one that is definitely actively hitting people today. We can solve that as a quick win in a way that is easy to backport to existing releases of QEMU for those affected. Yes, that works for now Daniel. I can send a patch as per your earlier suggestions on this, early next week if it fine? Separately from that, lets define a clean slate migration protocol to solve many of our historic problems and mistakes that can't be dealt with through retrofitting, not limited to just this ordering mistake. We had a significant discussion about it at the start of the year in this thread, which I think we should take forward and write into a formal protocol spec. 
https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.gnu.org_archive_html_qemu-2Ddevel_2022-2D03_msg03655.html&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=vjRKUpojJeAWFcJNi7YETWaLjjOcLWIcdk8KO-HMu8_A3veG2aaGmZoLxFVHgLt0&s=uv5cKdIMcT8Wo3erYQZJqnMXvRCbda4gNtxsy9EXrC8&e= This was a good read for me. :) With regards, Daniel Thanks Manish Mishra
Re: MultiFD and default channel out of order mapping on receive side.
On 19/10/22 2:30 am, Peter Xu wrote: On Tue, Oct 18, 2022 at 10:51:12AM -0400, Peter Xu wrote: On Tue, Oct 18, 2022 at 09:18:28AM +0100, Daniel P. Berrangé wrote: On Mon, Oct 17, 2022 at 05:15:35PM -0400, Peter Xu wrote: On Mon, Oct 17, 2022 at 12:38:30PM +0100, Daniel P. Berrangé wrote: On Mon, Oct 17, 2022 at 01:06:00PM +0530, manish.mishra wrote: Hi Daniel, I was thinking for some solutions for this so wanted to discuss that before going ahead. Also added Juan and Peter in loop. 1. Earlier i was thinking, on destination side as of now for default and multi-FD channel first data to be sent is MAGIC_NUMBER and VERSION so may be we can decide mapping based on that. But then that does not work for newly added post copy preempt channel as it does not send any MAGIC number. Also even for multiFD just MAGIC number does not tell which multifd channel number is it, even though as per my thinking it does not matter. So MAGIC number should be good for indentifying default vs multiFD channel? Yep, you don't need to know more than the MAGIC value. In migration_io_process_incoming, we need to use MSG_PEEK to look at the first 4 bytes pendingon the wire. If those bytes are 'QEVM' that's the primary channel, if those bytes are big endian 0x11223344, that's a multifd channel. Using MSG_PEEK aviods need to modify thue later code that actually reads this data. The challenge is how long to wait with the MSG_PEEK. If we do it in a blocking mode, its fine for main channel and multifd, but IIUC for the post-copy pre-empt channel we'd be waiting for something that will never arrive. Having suggested MSG_PEEK though, this may well not work if the channel has TLS present. In fact it almost definitely won't work. To cope with TLS migration_io_process_incoming would need to actually read the data off the wire, and later methods be taught to skip reading the magic. 2. For post-copy preempt may be we can initiate this channel only after we have received a request from remote e.g. 
remote page fault. This to me looks safest considering post-copy recorvery case too. I can not think of any depedency on post copy preempt channel which requires it to be initialised very early. May be Peter can confirm this. I guess that could work Currently all preempt code still assumes when postcopy activated it's in preempt mode. IIUC such a change will bring an extra phase of postcopy with no-preempt before preempt enabled. We may need to teach qemu to understand that if it's needed. Meanwhile the initial page requests will not be able to benefit from the new preempt channel too. 3. Another thing we can do is to have 2-way handshake on every channel creation with some additional metadata, this to me looks like cleanest approach and durable, i understand that can break migration to/from old qemu, but then that can come as migration capability? The benefit of (1) is that the fix can be deployed for all existing QEMU releases by backporting it. (3) will meanwhile need mgmt app updates to make it work, which is much more work to deploy. We really shoulud have had a more formal handshake, and I've described ways to achieve this in the past, but it is quite alot of work. I don't know whether (1) is a valid option if there are use cases that it cannot cover (on either tls or preempt). The handshake is definitely the clean approach. What's the outcome of such wrongly ordered connections? Will migration fail immediately and safely? For multifd, I think it should fail immediately after the connection established. For preempt, I'd also expect the same thing because the only wrong order to happen right now is having the preempt channel to be the migration channel, then it should also fail immediately on the first qemu_get_byte(). Hopefully that's still not too bad - I mean, if we can fail constantly and safely (never fail during postcopy), we can always retry and as long as connections created successfully we can start the migration safely. 
But please correct me if it's not the case. It should typically fail as the magic bytes are different, which will not pass validation. The exception being the postcopy pre-empt channel which may well cause migration to stall as nothing will be sent initially by the src. Hmm right.. Actually if preempt channel is special we can fix it alone. As both of you discussed, we can postpone the preempt channel setup, maybe not as late as when we receive the 1st page request, but: (1) For newly established migration, we can postpone preempt channel setup (postcopy_preempt_setup, resume=false) to the entrance of postcopy_start(). (2) For a postcopy recovery process, we can postpone preempt channel setup (postcopy_preempt_setup, resume=true) to postcopy_do_resume(), maybe between qemu_savevm_state_resume_prepare() and the final handshake of postcopy_resume_handshake(). Yes Peter
Re: MultiFD and default channel out of order mapping on receive side.
Hi Daniel, I was thinking for some solutions for this so wanted to discuss that before going ahead. Also added Juan and Peter in loop. 1. Earlier i was thinking, on destination side as of now for default and multi-FD channel first data to be sent is MAGIC_NUMBER and VERSION so may be we can decide mapping based on that. But then that does not work for newly added post copy preempt channel as it does not send any MAGIC number. Also even for multiFD just MAGIC number does not tell which multifd channel number is it, even though as per my thinking it does not matter. So MAGIC number should be good for indentifying default vs multiFD channel? 2. For post-copy preempt may be we can initiate this channel only after we have received a request from remote e.g. remote page fault. This to me looks safest considering post-copy recorvery case too. I can not think of any depedency on post copy preempt channel which requires it to be initialised very early. May be Peter can confirm this. 3. Another thing we can do is to have 2-way handshake on every channel creation with some additional metadata, this to me looks like cleanest approach and durable, i understand that can break migration to/from old qemu, but then that can come as migration capability? Please let me know if any of these works or if you have some other suggestions? Thanks Manish Mishra On 13/10/22 1:45 pm, Daniel P. Berrangé wrote: On Thu, Oct 13, 2022 at 01:23:40AM +0530, manish.mishra wrote: Hi Everyone, Hope everyone is doing great. I have seen some live migration issues with qemu-4.2 when using multiFD. Signature of issue is something like this. 2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd channel 0: multifd: received packet magic 5145564d expected 11223344 Basically default live migration channel packet is received on multiFD channel. I see a older patch explaining potential reason for this behavior. 
https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.gnu.org_archive_html_qemu-2Ddevel_2019-2D10_msg05920.html&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=LZBcU_C3HMbpUCFZgqxkS-pV8C2mHOjqUTzt45LlLwa26DA0pCAjJVDoamnX8vnC&s=B-b_HMnn_ee6JeA87-PVNBrBqxzdWYgo5PpaP91dqT8&e= [PATCH 3/3] migration/multifd: fix potential wrong acception order of IO. But I see this patch was not merged. By looking at qemu master code, I could not find any other patch too which can handle this issue. So as per my understanding this is still a potential issue even in qemu master. I mainly wanted to check why this patch was dropped? See my replies in that message - it broke compatibility of data on the wire, meaning old QEMU can't talk to new QEMU and vice versa. We need a fix for this issue, but it needs to take into account wire compatibility. With regards, Daniel
Re: MultiFD and default channel out of order mapping on receive side.
On 13/10/22 1:45 pm, Daniel P. Berrangé wrote: On Thu, Oct 13, 2022 at 01:23:40AM +0530, manish.mishra wrote: Hi Everyone, Hope everyone is doing great. I have seen some live migration issues with qemu-4.2 when using multiFD. Signature of issue is something like this. 2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd channel 0: multifd: received packet magic 5145564d expected 11223344 Basically default live migration channel packet is received on multiFD channel. I see an older patch explaining potential reason for this behavior. https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.gnu.org_archive_html_qemu-2Ddevel_2019-2D10_msg05920.html&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=LZBcU_C3HMbpUCFZgqxkS-pV8C2mHOjqUTzt45LlLwa26DA0pCAjJVDoamnX8vnC&s=B-b_HMnn_ee6JeA87-PVNBrBqxzdWYgo5PpaP91dqT8&e= [PATCH 3/3] migration/multifd: fix potential wrong acception order of IO. But I see this patch was not merged. By looking at qemu master code, I could not find any other patch too which can handle this issue. So as per my understanding this is still a potential issue even in qemu master. I mainly wanted to check why this patch was dropped? See my replies in that message - it broke compatibility of data on the wire, meaning old QEMU can't talk to new QEMU and vice versa. We need a fix for this issue, but it needs to take into account wire compatibility. With regards, Daniel OK, got it, thank you so much Daniel; in that case I will try to create some patch considering backward compatibility and send for review. Mainly I wanted to understand if it is handled somehow differently in upstream master, but manually looking at the code it did not look like that, so just wanted to confirm. Thanks Manish Mishra
MultiFD and default channel out of order mapping on receive side.
Hi Everyone, Hope everyone is doing great. I have seen some live migration issues with qemu-4.2 when using multiFD. Signature of issue is something like this. 2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd channel 0: multifd: received packet magic 5145564d expected 11223344 Basically default live migration channel packet is received on multiFD channel. I see an older patch explaining potential reason for this behavior. https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg05920.html > [PATCH 3/3] migration/multifd: fix potential wrong acception order of IO. But I see this patch was not merged. By looking at qemu master code, I could not find any other patch too which can handle this issue. So as per my understanding this is still a potential issue even in qemu master. I mainly wanted to check why this patch was dropped? Sorry if I misunderstood something. It will be great if someone can provide some pointers on this. Thanks Manish Mishra
Re: [PATCH 0/4] Multiple interface support on top of Multi-FD
On 16/06/22 9:20 pm, Dr. David Alan Gilbert wrote: * Daniel P. Berrangé (berra...@redhat.com) wrote: On Wed, Jun 15, 2022 at 05:43:28PM +0100, Daniel P. Berrangé wrote: On Fri, Jun 10, 2022 at 05:58:31PM +0530, manish.mishra wrote: On 09/06/22 9:17 pm, Daniel P. Berrangé wrote: On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote: As of now, the multi-FD feature supports connection over the default network only. This Patchset series is a Qemu side implementation of providing multiple interfaces support for multi-FD. This enables us to fully utilize dedicated or multiple NICs in case bonding of NICs is not possible. Introduction - Multi-FD Qemu implementation currently supports connection only on the default network. This forbids us from advantages like: - Separating VM live migration traffic from the default network. Hi Daniel, I totally understand your concern around this approach increasing compexity inside qemu, when similar things can be done with NIC teaming. But we thought this approach provides much more flexibility to user in few cases like. 1. We checked our customer data, almost all of the host had multiple NIC, but LACP support in their setups was very rare. So for those cases this approach can help in utilise multiple NICs as teaming is not possible there. AFAIK, LACP is not required in order to do link aggregation with Linux. Traditional Linux bonding has no special NIC hardware or switch requirements, so LACP is merely a "nice to have" in order to simplify some aspects. IOW, migration with traffic spread across multiple NICs is already possible AFAICT. I can understand that some people may not have actually configured bonding on their hosts, but it is not unreasonable to request that they do so, if they want to take advantage fo aggrated bandwidth. It has the further benefit that it will be fault tolerant. With this proposal if any single NIC has a problem, the whole migration will get stuck. 
With kernel level bonding, if any single NIC haus a problem, it'll get offlined by the kernel and migration will continue to work across remaining active NICs. 2. We have seen requests recently to separate out traffic of storage, VM netwrok, migration over different vswitch which can be backed by 1 or more NICs as this give better predictability and assurance. So host with multiple ips/vswitches can be very common environment. In this kind of enviroment this approach gives per vm or migration level flexibilty, like for critical VM we can still use bandwidth from all available vswitch/interface but for normal VM they can keep live migration only on dedicated NICs without changing complete host network topology. At final we want it to be something like this [, , ] to provide bandwidth_control per interface. Again, it is already possible to separate migration traffic from storage traffic, from other network traffic. The target IP given will influence which NIC is used based on routing table and I know this is already done widely with OpenStack deployments. Actually I should clarify this is only practical if the two NICs are using different IP subnets, otherwise routing rules are not viable. So needing to set source IP would be needed to select between a pair of NICs on the same IP subnet. Yeh so I think that's one reason that the idea in this series is OK (together with the idea for the NUMA stuff) and I suspect there are other cases as well. Dave yes, David multiFD per NUMA seems interesting idea, I was just curious how much throughput diff we can experience per multiFD channel with local vs remote NIC? thanks Manish Mishra Previous usage I've seen has always setup fully distinct IP subnets for generic vs storage vs migration network traffic. 
With regards, Daniel -- |: https://urldefense.proofpoint.com/v2/url?u=https-3A__berrange.com&d=DwIDAw&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=qfclRDP-GXttuWQ3knJS2RHXmg2XjmG7Pju002cBrHugZE8hpO3DRbKdHphItFr-&s=1RKIz6cO82_JwgkJ-QLP3SRWaG2Lo6J8w4O0Z2YVJ4Q&e= -o- https://urldefense.proofpoint.com/v2/url?u=https-3A__www.flickr.com_photos_dberrange&d=DwIDAw&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=qfclRDP-GXttuWQ3knJS2RHXmg2XjmG7Pju002cBrHugZE8hpO3DRbKdHphItFr-&s=BkGiCLXloxlYYBJeJ_0XGRUgkUraRPJdIu26ukR6erI&e= :| |: https://urldefense.proofpoint.com/v2/url?u=https-3A__libvirt.org&d=DwIDAw&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=qfclRDP-GXttuWQ3knJS2RHXmg2XjmG7Pju002cBrHugZE8hpO3DRbKdHphItFr-&s=KOz_zQXuQzFxwhNLINm-FrADPcBgnVjjmULmZ6iZTi4&e= -o- https://urldefense.proofpoint.com/v2/url?u=https-3A__fstop138.berrange.com&d=DwIDAw&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m
Re: [PATCH 4/4] Adding support for multi-FD connections dynamically
On 17/06/22 12:17 am, Dr. David Alan Gilbert wrote: * Het Gala (het.g...@nutanix.com) wrote: i) Dynamically decide appropriate source and destination ip pairs for the corresponding multi-FD channel to be connected. ii) Removed the support for setting the number of multi-fd channels from qmp commands. As now all multiFD parameters will be passed via qmp: migrate command or incoming flag itself. We can't do that, because it's part of the API already; what you'll need to do is check that the number of entries in your list corresponds to the value set there and error if it's different. Dave thanks for review David. Yes, we will make sure in V2 that nothing existing breaks. - Manish Mishra Suggested-by: Manish Mishra Signed-off-by: Het Gala --- migration/migration.c | 15 --- migration/migration.h | 1 - migration/multifd.c | 42 +- migration/socket.c| 42 +- migration/socket.h| 4 +++- monitor/hmp-cmds.c| 4 qapi/migration.json | 6 -- 7 files changed, 57 insertions(+), 57 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 9b0ad732e7..57dd4494b4 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1585,9 +1585,6 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_block_incremental) { dest->block_incremental = params->block_incremental; } -if (params->has_multifd_channels) { -dest->multifd_channels = params->multifd_channels; -} if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } @@ -1702,9 +1699,6 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_block_incremental) { s->parameters.block_incremental = params->block_incremental; } -if (params->has_multifd_channels) { -s->parameters.multifd_channels = params->multifd_channels; -} if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } @@ -2686,15 +2680,6 @@ bool migrate_pause_before_switchover(void) 
MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER]; } -int migrate_multifd_channels(void) -{ -MigrationState *s; - -s = migrate_get_current(); - -return s->parameters.multifd_channels; -} - MultiFDCompression migrate_multifd_compression(void) { MigrationState *s; diff --git a/migration/migration.h b/migration/migration.h index fa8717ec9e..9464de8ef7 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -372,7 +372,6 @@ bool migrate_validate_uuid(void); bool migrate_auto_converge(void); bool migrate_use_multifd(void); bool migrate_pause_before_switchover(void); -int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); int migrate_multifd_zstd_level(void); diff --git a/migration/multifd.c b/migration/multifd.c index 9282ab6aa4..ce017436fb 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -225,7 +225,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return -1; } -if (msg.id > migrate_multifd_channels()) { +if (msg.id > total_multifd_channels()) { error_setg(errp, "multifd: received channel version %u " "expected %u", msg.version, MULTIFD_VERSION); return -1; @@ -410,8 +410,8 @@ static int multifd_send_pages(QEMUFile *f) * using more channels, so ensure it doesn't overflow if the * limit is lower now. 
*/ -next_channel %= migrate_multifd_channels(); -for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { +next_channel %= total_multifd_channels(); +for (i = next_channel;; i = (i + 1) % total_multifd_channels()) { p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); @@ -422,7 +422,7 @@ static int multifd_send_pages(QEMUFile *f) } if (!p->pending_job) { p->pending_job++; -next_channel = (i + 1) % migrate_multifd_channels(); +next_channel = (i + 1) % total_multifd_channels(); break; } qemu_mutex_unlock(&p->mutex); @@ -500,7 +500,7 @@ static void multifd_send_terminate_threads(Error *err) return; } -for (i = 0; i < migrate_multifd_channels(); i++) { +for (i = 0; i < total_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); @@ -521,14 +521,14 @@ void multifd_save_cleanup(void) return; } multifd_send_terminate_threads(NULL); -for (i = 0; i < migrate_multifd_channels(); i++) { +for (i = 0; i < total_multifd_channels(); i++) { MultiFDSendPar
Re: [PATCH 3/4] Establishing connection between any non-default source and destination pair
Hi Daniel, David, Thank you so much for review on patches. I am posting this message on behalf of Het. We wanted to get a early feedback so sorry if code was not in best of shape. Het is currently on break intership break so does not have access to nutanix mail, he will join in first week of july and will definately post v2 on this by 2nd week of july. thanks Manish MIshra On 16/06/22 11:09 pm, Daniel P. Berrangé wrote: On Thu, Jun 09, 2022 at 07:33:04AM +, Het Gala wrote: i) Binding of the socket to source ip address and port on the non-default interface has been implemented for multi-FD connection, which was not necessary earlier because the binding was on the default interface itself. ii) Created an end to end connection between all multi-FD source and destination pairs. Suggested-by: Manish Mishra Signed-off-by: Het Gala --- chardev/char-socket.c | 4 +- include/io/channel-socket.h | 26 ++- include/qemu/sockets.h | 6 ++- io/channel-socket.c | 50 ++-- migration/socket.c | 15 +++--- nbd/client-connection.c | 2 +- qemu-nbd.c | 4 +- scsi/pr-manager-helper.c| 1 + tests/unit/test-char.c | 8 ++-- tests/unit/test-io-channel-socket.c | 4 +- tests/unit/test-util-sockets.c | 16 +++ ui/input-barrier.c | 2 +- ui/vnc.c| 3 +- util/qemu-sockets.c | 71 - 14 files changed, 135 insertions(+), 77 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index dc4e218eeb..f3725238c5 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -932,7 +932,7 @@ static int tcp_chr_connect_client_sync(Chardev *chr, Error **errp) QIOChannelSocket *sioc = qio_channel_socket_new(); tcp_chr_change_state(s, TCP_CHARDEV_STATE_CONNECTING); tcp_chr_set_client_ioc_name(chr, sioc); -if (qio_channel_socket_connect_sync(sioc, s->addr, errp) < 0) { +if (qio_channel_socket_connect_sync(sioc, s->addr, NULL, errp) < 0) { tcp_chr_change_state(s, TCP_CHARDEV_STATE_DISCONNECTED); object_unref(OBJECT(sioc)); return -1; @@ -1120,7 +1120,7 @@ static void tcp_chr_connect_client_task(QIOTask 
*task, SocketAddress *addr = opaque; Error *err = NULL; -qio_channel_socket_connect_sync(ioc, addr, &err); +qio_channel_socket_connect_sync(ioc, addr, NULL, &err); qio_task_set_error(task, err); } diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h index 513c428fe4..59d5b1b349 100644 --- a/include/io/channel-socket.h +++ b/include/io/channel-socket.h @@ -83,41 +83,45 @@ qio_channel_socket_new_fd(int fd, /** * qio_channel_socket_connect_sync: * @ioc: the socket channel object - * @addr: the address to connect to + * @dst_addr: the destination address to connect to + * @src_addr: the source address to be connected * @errp: pointer to a NULL-initialized error object * - * Attempt to connect to the address @addr. This method - * will run in the foreground so the caller will not regain - * execution control until the connection is established or + * Attempt to connect to the address @dst_addr with @src_addr. + * This method will run in the foreground so the caller will not + * regain execution control until the connection is established or * an error occurs. */ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc, -SocketAddress *addr, +SocketAddress *dst_addr, +SocketAddress *src_addr, Error **errp); /** * qio_channel_socket_connect_async: * @ioc: the socket channel object - * @addr: the address to connect to + * @dst_addr: the destination address to connect to * @callback: the function to invoke on completion * @opaque: user data to pass to @callback * @destroy: the function to free @opaque * @context: the context to run the async task. If %NULL, the default * context will be used. + * @src_addr: the source address to be connected * - * Attempt to connect to the address @addr. This method - * will run in the background so the caller will regain + * Attempt to connect to the address @dst_addr with the @src_addr. + * This method will run in the background so the caller will regain * execution control immediately. 
The function @callback - * will be invoked on completion or failure. The @addr + * will be invoked on completion or failure. The @dst_addr * parameter will be copied, so may be freed as soon * as this function returns without waiting for completion. */ void qio_channel_socket_connect_async(QIOChannelSocket *ioc, - SocketAddress *addr, +
Re: [PATCH 0/4] Multiple interface support on top of Multi-FD
On 16/06/22 1:46 pm, Daniel P. Berrangé wrote: On Wed, Jun 15, 2022 at 08:14:26PM +0100, Dr. David Alan Gilbert wrote: * Daniel P. Berrangé (berra...@redhat.com) wrote: On Fri, Jun 10, 2022 at 05:58:31PM +0530, manish.mishra wrote: On 09/06/22 9:17 pm, Daniel P. Berrangé wrote: On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote: As of now, the multi-FD feature supports connection over the default network only. This Patchset series is a Qemu side implementation of providing multiple interfaces support for multi-FD. This enables us to fully utilize dedicated or multiple NICs in case bonding of NICs is not possible. Introduction - Multi-FD Qemu implementation currently supports connection only on the default network. This forbids us from advantages like: - Separating VM live migration traffic from the default network. Hi Daniel, I totally understand your concern around this approach increasing compexity inside qemu, when similar things can be done with NIC teaming. But we thought this approach provides much more flexibility to user in few cases like. 1. We checked our customer data, almost all of the host had multiple NIC, but LACP support in their setups was very rare. So for those cases this approach can help in utilise multiple NICs as teaming is not possible there. AFAIK, LACP is not required in order to do link aggregation with Linux. Traditional Linux bonding has no special NIC hardware or switch requirements, so LACP is merely a "nice to have" in order to simplify some aspects. IOW, migration with traffic spread across multiple NICs is already possible AFAICT. Are we sure that works with multifd? I've seen a lot of bonding NIC setups which spread based on a hash of source/destination IP and port numbers; given that we use the same dest port and IP at the moment what happens in reality? That hashing can be quite delicate for high bandwidth single streams. 
The simplest Linux bonding mode does per-packet round-robin across NICs, so traffic from the collection of multifd connections should fill up all the NICs in the bond. There are of course other modes which may be sub-optimal for the reasons you describe. Which mode to pick depends on the type of service traffic patterns you're aiming to balance. My understanding on networking is not good enough so apologies in advance if something does not make sense. As per my understanding it is easy to do load balancing on sender side because we have full control where to send packet but complicated on receive side if we do not have LACP like support. I see there are some teaming technique which does load balancing of incoming traffic by possibly sending different slaves mac address on arp requests but that does not work for our use case and may require a complicated setup for proper usage. Our use case can be something like this e.g. both source and destination has 2-2 NICs of 10Gbps each and we want to get a throughput of 20Gbps for live migration. thanks Manish Mishra Multi-interface with Multi-FD - Multiple-interface support over basic multi-FD has been implemented in the patches. Advantages of this implementation are: - Able to separate live migration traffic from default network interface by creating multiFD channels on ip addresses of multiple non-default interfaces. - Can optimize the number of multi-FD channels on a particular interface depending upon the network bandwidth limit on a particular interface. Manually assigning individual channels to different NICs is a pretty inefficient way to optimizing traffic. Feels like you could easily get into a situation where one NIC ends up idle while the other is busy, especially if the traffic patterns are different. For example with post-copy there's an extra channel for OOB async page requests, and its far from clear that manually picking NICs per chanel upfront is going work for that. 
The kernel can continually dynamically balance load on the fly and so do much better than any static mapping QEMU tries to apply, especially if there are multiple distinct QEMU's competing for bandwidth. Yes, Daniel current solution is only for pre-copy. As with postcopy multiFD is not yet supported but in future we can extend it for postcopy I had been thinking about explicit selection of network device for NUMA use though; ideally I'd like to be able to associate a set of multifd threads to each NUMA node, and then associate a NIC with that set of threads; so that the migration happens down the NIC that's on the node the RAM is on. On a really good day you'd have one NIC per top level NUMA node. Now that's an interesting idea, and not one that can be dealt with by bonding, since the network layer won't be aware of the NUMA affinity constraints. With regards, Daniel
Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle
On 13/06/22 8:03 pm, Peter Xu wrote: On Mon, Jun 13, 2022 at 03:28:34PM +0530, manish.mishra wrote: On 26/05/22 8:21 am, Jason Wang wrote: On Wed, May 25, 2022 at 11:56 PM Peter Xu wrote: On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote: 2. Also this algorithm only control or limits dirty rate by guest writes. There can be some memory dirtying done by virtio based devices which is accounted only at qemu level so may not be accounted through dirty rings so do we have plan for that in future? Those are not issue for auto-converge as it slows full VM but dirty rate limit only slows guest writes. From the migration point of view, time spent on migrating memory is far greater than migrating devices emulated by qemu. I think we can do that when migrating device costs the same magnitude time as migrating memory. As to auto-converge, it throttle vcpu by kicking it and force it to sleep periodically. The two seems has no much difference from the perspective of internal method but the auto-converge is kind of "offensive" when doing restraint. I'll read the auto-converge implementation code and figure out the problem you point out. This seems to be not virtio-specific, but can be applied to any device DMA writting to guest mem (if not including vfio). But indeed virtio can be normally faster. I'm also curious how fast a device DMA could dirty memories. This could be a question to answer to all vcpu-based throttling approaches (including the quota based approach that was proposed on KVM list). Maybe for kernel virtio drivers we can have some easier estimation? As you said below, it really depends on the speed of the backend. My guess is it'll be much harder for DPDK-in-guest (aka userspace drivers) because IIUC that could use a large chunk of guest mem. Probably, for vhost-user backend, it could be ~20Mpps or even higher. Sorry for late response on this. We did experiment with IO on virtio-scsi based disk. Thanks for trying this and sharing it out. 
We could see dirty rate of ~500MBps on my system and most of that was not tracked as kvm_dirty_log. Also for reference i am attaching test we used to avoid tacking in KVM. (as attached file). The number looks sane as it seems to be the sequential bandwidth for a disk, though I'm not 100% sure it'll work as expected since you mmap()ed the region with private pages rather than shared, so after you did I'm wondering whether below will happen (also based on the fact that you mapped twice the size of guest mem as you mentioned in the comment): (1) Swap out will start to trigger after you read a lot of data into the mem already, then old-read pages will be swapped out to disk (and hopefully the swap device does not reside on the same virtio-scsi disk or it'll be even more complicated scenario of mixture IOs..), meanwhile when you finish reading a round and start to read from offset 0 swap-in will start to happen too. Swapping can slow down things already, and I'm wondering whether the 500MB/s was really caused by the swapout rather than backend disk reads. More below. (2) Another attribute of private pages AFAICT is after you read it once it does not need to be read again from the virtio-scsi disks. In other words, I'm thinking whether starting from the 2nd iteration your program won't trigger any DMA at all but purely torturing the swap device. Maybe changing MAP_PRIVATE to MAP_SHARED can emulate better on what we want to measure, but I'm also not 100% sure on whether it could be accurate.. Thanks, Thanks Peter, Yes agree MAP_SHARED should be used here, sorry i missed that 😁. Yes, my purpose of taking file size larger than RAM_SIZE was to cause frequent page cache flush and re-populating page-cache pages, not to trigger swaps. I checked on my VM i had swapping disabled, may be MAP_PRIVATE did not make difference because it was read-only. I tested again with MAP_SHARED it comes around ~500MBps. 
Thanks Manish Mishra Thanks [copy Jason too] -- Peter Xu #include #include #include #include #include #include #include #include #define PAGE_SIZE 4096 #define GB (1024 * 1024 * 1024) int main() { char *buff; size_t size; struct stat stat; // Take file of size atleast double of RAM size to // achieve max dirty rate possible. const char * file_name = "file_10_gb"; int fd; size_t i = 0, count = 0; struct timespec ts1, ts0; double time_diff; fd = open(file_name, O_RDONLY); if (fd == -1) { perror("Error opening file"); exit(1); } fstat (fd, &stat); size = stat.st_size; printf("File size %ld\n", (long)size); buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0); if (buff == MAP_FAILED) { perror("Mmap Error&qu
Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle
On 26/05/22 8:21 am, Jason Wang wrote: On Wed, May 25, 2022 at 11:56 PM Peter Xu wrote: On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote: 2. Also this algorithm only control or limits dirty rate by guest writes. There can be some memory dirtying done by virtio based devices which is accounted only at qemu level so may not be accounted through dirty rings so do we have plan for that in future? Those are not issue for auto-converge as it slows full VM but dirty rate limit only slows guest writes. From the migration point of view, time spent on migrating memory is far greater than migrating devices emulated by qemu. I think we can do that when migrating device costs the same magnitude time as migrating memory. As to auto-converge, it throttle vcpu by kicking it and force it to sleep periodically. The two seems has no much difference from the perspective of internal method but the auto-converge is kind of "offensive" when doing restraint. I'll read the auto-converge implementation code and figure out the problem you point out. This seems to be not virtio-specific, but can be applied to any device DMA writting to guest mem (if not including vfio). But indeed virtio can be normally faster. I'm also curious how fast a device DMA could dirty memories. This could be a question to answer to all vcpu-based throttling approaches (including the quota based approach that was proposed on KVM list). Maybe for kernel virtio drivers we can have some easier estimation? As you said below, it really depends on the speed of the backend. My guess is it'll be much harder for DPDK-in-guest (aka userspace drivers) because IIUC that could use a large chunk of guest mem. Probably, for vhost-user backend, it could be ~20Mpps or even higher. Sorry for late response on this. We did experiment with IO on virtio-scsi based disk. We could see dirty rate of ~500MBps on my system and most of that was not tracked as kvm_dirty_log. 
Also for reference i am attaching test we used to avoid tracking in KVM. (as attached file). Thanks [copy Jason too] -- Peter Xu #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <string.h> #include <sys/stat.h> #include <sys/mman.h> #include <time.h> #define PAGE_SIZE 4096 #define GB (1024 * 1024 * 1024) int main() { char *buff; size_t size; struct stat stat; // Take file of size at least double of RAM size to // achieve max dirty rate possible. const char * file_name = "file_10_gb"; int fd; size_t i = 0, count = 0; struct timespec ts1, ts0; double time_diff; fd = open(file_name, O_RDONLY); if (fd == -1) { perror("Error opening file"); exit(1); } fstat (fd, &stat); size = stat.st_size; printf("File size %ld\n", (long)size); buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0); if (buff == MAP_FAILED) { perror("Mmap Error"); exit(1); } (void)clock_gettime(CLOCK_MONOTONIC, &ts0); while(1) { char c; i = (i + PAGE_SIZE) % size; c = buff[i]; count++; // Check on every 10K pages for rate. if (count % 10000 == 0) { (void)clock_gettime(CLOCK_MONOTONIC, &ts1); time_diff = ((double)ts1.tv_sec + ts1.tv_nsec * 1.0e-9) -((double)ts0.tv_sec + ts0.tv_nsec * 1.0e-9); printf("Expected Dirty rate %f\n", (10000.0 * PAGE_SIZE) / GB / time_diff); ts0 = ts1; } } close(fd); return 0; }
Re: [PATCH 0/4] Multiple interface support on top of Multi-FD
On 09/06/22 9:17 pm, Daniel P. Berrangé wrote: On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote: As of now, the multi-FD feature supports connection over the default network only. This Patchset series is a Qemu side implementation of providing multiple interfaces support for multi-FD. This enables us to fully utilize dedicated or multiple NICs in case bonding of NICs is not possible. Introduction - Multi-FD Qemu implementation currently supports connection only on the default network. This forbids us from advantages like: - Separating VM live migration traffic from the default network. Hi Daniel, I totally understand your concern around this approach increasing compexity inside qemu, when similar things can be done with NIC teaming. But we thought this approach provides much more flexibility to user in few cases like. 1. We checked our customer data, almost all of the host had multiple NIC, but LACP support in their setups was very rare. So for those cases this approach can help in utilise multiple NICs as teaming is not possible there. 2. We have seen requests recently to separate out traffic of storage, VM netwrok, migration over different vswitch which can be backed by 1 or more NICs as this give better predictability and assurance. So host with multiple ips/vswitches can be very common environment. In this kind of enviroment this approach gives per vm or migration level flexibilty, like for critical VM we can still use bandwidth from all available vswitch/interface but for normal VM they can keep live migration only on dedicated NICs without changing complete host network topology. At final we want it to be something like this [, , ] to provide bandwidth_control per interface. 3. Dedicated NIC we mentioned as a use case, agree with you it can be done without this approach too. Perhaps I'm mis-understanding your intent here, but AFAIK it has been possible to separate VM migration traffic from general host network traffic essentially forever. 
If you have two NICs with IP addresses on different subnets, then the kernel will pick which NIC to use automatically based on the IP address of the target matching the kernel routing table entries. Management apps have long used this ability in order to control which NIC migration traffic flows over. - Fully utilize all NICs’ capacity in cases where creating a LACP bond (Link Aggregation Control Protocol) is not supported. Can you elaborate on scenarios in which it is impossible to use LACP bonding at the kernel level ? Yes, as mentioned above LACP support was rare in customer setups. Multi-interface with Multi-FD - Multiple-interface support over basic multi-FD has been implemented in the patches. Advantages of this implementation are: - Able to separate live migration traffic from default network interface by creating multiFD channels on ip addresses of multiple non-default interfaces. - Can optimize the number of multi-FD channels on a particular interface depending upon the network bandwidth limit on a particular interface. Manually assigning individual channels to different NICs is a pretty inefficient way to optimizing traffic. Feels like you could easily get into a situation where one NIC ends up idle while the other is busy, especially if the traffic patterns are different. For example with post-copy there's an extra channel for OOB async page requests, and its far from clear that manually picking NICs per chanel upfront is going work for that. The kernel can continually dynamically balance load on the fly and so do much better than any static mapping QEMU tries to apply, especially if there are multiple distinct QEMU's competing for bandwidth. Yes, Daniel current solution is only for pre-copy. As with postcopy multiFD is not yet supported but in future we can extend it for postcopy channels too. 
Implementation -- Earlier the 'migrate' qmp command: { "execute": "migrate", "arguments": { "uri": "tcp:0:4446" } } Modified qmp command: { "execute": "migrate", "arguments": { "uri": "tcp:0:4446", "multi-fd-uri-list": [ { "source-uri": "tcp::6900", "destination-uri": "tcp:0:4480", "multifd-channels": 4}, { "source-uri": "tcp:10.0.0.0: ", "destination-uri": "tcp:11.0.0.0:7789", "multifd-channels": 5} ] } } -- Earlier the 'migrate-incoming' qmp command: { "execute": "migrate-incoming", "arguments": { "uri": "tcp::4446" } } Modified 'migrate-incoming' qmp command: { "execute": "migrate-incoming", "arguments": {"uri": "tcp::6789", "multi-fd-uri-list" : [ {"destination-uri" : "tcp::6900", "multifd-channels": 4}, {"destination-uri" : "tcp:11.0.0.0:7789", "multifd-channels": 5} ] } }
Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle
On 26/05/22 8:21 am, Jason Wang wrote: On Wed, May 25, 2022 at 11:56 PM Peter Xu wrote: On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote: 2. Also this algorithm only control or limits dirty rate by guest writes. There can be some memory dirtying done by virtio based devices which is accounted only at qemu level so may not be accounted through dirty rings so do we have plan for that in future? Those are not issue for auto-converge as it slows full VM but dirty rate limit only slows guest writes. From the migration point of view, time spent on migrating memory is far greater than migrating devices emulated by qemu. I think we can do that when migrating device costs the same magnitude time as migrating memory. As to auto-converge, it throttle vcpu by kicking it and force it to sleep periodically. The two seems has no much difference from the perspective of internal method but the auto-converge is kind of "offensive" when doing restraint. I'll read the auto-converge implementation code and figure out the problem you point out. This seems to be not virtio-specific, but can be applied to any device DMA writting to guest mem (if not including vfio). But indeed virtio can be normally faster. I'm also curious how fast a device DMA could dirty memories. This could be a question to answer to all vcpu-based throttling approaches (including the quota based approach that was proposed on KVM list). Maybe for kernel virtio drivers we can have some easier estimation? As you said below, it really depends on the speed of the backend. My guess is it'll be much harder for DPDK-in-guest (aka userspace drivers) because IIUC that could use a large chunk of guest mem. Probably, for vhost-user backend, it could be ~20Mpps or even higher. Thanks [copy Jason too] I will try to get some numbers of this by next week. Jason Just wanted to get more context on why it should be ~20Mpps — can it be as much as the throughput limit of storage/network in the worst case? 
Also we were internally discussing to keep this kind of throttling not as an alternative to auto-converge but somehow to run orthogonal to auto-converge with some modifications. In cases where most dirty is by guest writes auto-converge anyway will not be active as it decides throttle based on ratio of dirty/2*transferred which, if forced correctly by e.g. dirty rate limit, will be ~1. This is the easiest approach we could think of for a start but can definitely be improved in future. Maybe something similar can be done for this dirty limit approach too? Surely not for this patch series but in future. thanks Manish Mishra -- Peter Xu
Re: [PATCH v6 10/13] migration: Respect postcopy request order in preemption mode
On 23/05/22 4:26 pm, Dr. David Alan Gilbert wrote: * Peter Xu (pet...@redhat.com) wrote: With preemption mode on, when we see a postcopy request that was requesting for exactly the page that we have preempted before (so we've partially sent the page already via PRECOPY channel and it got preempted by another postcopy request), currently we drop the request so that after all the other postcopy requests are serviced then we'll go back to precopy stream and start to handle that. We dropped the request because we can't send it via postcopy channel since the precopy channel already contains partial of the data, and we can only send a huge page via one channel as a whole. We can't split a huge page into two channels. That's a very corner case and that works, but there's a change on the order of postcopy requests that we handle since we're postponing this (unlucky) postcopy request to be later than the other queued postcopy requests. The problem is there's a possibility that when the guest was very busy, the postcopy queue can be always non-empty, it means this dropped request will never be handled until the end of postcopy migration. So, there's a chance that there's one dest QEMU vcpu thread waiting for a page fault for an extremely long time just because it's unluckily accessing the specific page that was preempted before. The worst case time it needs can be as long as the whole postcopy migration procedure. It's extremely unlikely to happen, but when it happens it's not good. The root cause of this problem is because we treat pss->postcopy_requested variable as with two meanings bound together, as the variable shows: 1. Whether this page request is urgent, and, 2. Which channel we should use for this page request. With the old code, when we set postcopy_requested it means either both (1) and (2) are true, or both (1) and (2) are false. We can never have (1) and (2) to have different values. However it doesn't necessarily need to be like that. 
It's very legal that there's one request that has (1) very high urgency, but (2) we'd like to use the precopy channel. Just like the corner case we were discussing above. To differenciate the two meanings better, introduce a new field called postcopy_target_channel, showing which channel we should use for this page request, so as to cover the old meaning (2) only. Then we leave the postcopy_requested variable to stand only for meaning (1), which is the urgency of this page request. With this change, we can easily boost priority of a preempted precopy page as long as we know that page is also requested as a postcopy page. So with the new approach in get_queued_page() instead of dropping that request, we send it right away with the precopy channel so we get back the ordering of the page faults just like how they're requested on dest. Alongside, I touched up find_dirty_block() to only set the postcopy fields in the pss section if we're going through a postcopy migration. That's a very light optimization and shouldn't affect much. Reported-by: manish.mis...@nutanix.com Signed-off-by: Peter Xu So I think this is OK; getting a bit complicated! Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Manish Mishra --- migration/ram.c | 69 +++-- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 9d76db8491..fdcd9984fa 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -441,8 +441,28 @@ struct PageSearchStatus { unsigned long page; /* Set once we wrap around */ bool complete_round; -/* Whether current page is explicitly requested by postcopy */ +/* + * [POSTCOPY-ONLY] Whether current page is explicitly requested by + * postcopy. When set, the request is "urgent" because the dest QEMU + * threads are waiting for us. + */ bool postcopy_requested; +/* + * [POSTCOPY-ONLY] The target channel to use to send current page. + * + * Note: This may _not_ match with the value in postcopy_requested + * above. 
Let's imagine the case where the postcopy request is exactly + * the page that we're sending in progress during precopy. In this case + * we'll have postcopy_requested set to true but the target channel + * will be the precopy channel (so that we don't split brain on that + * specific page since the precopy channel already contains partial of + * that page data). + * + * Besides that specific use case, postcopy_target_channel should + * always be equal to postcopy_requested, because by default we send + * postcopy pages via postcopy preempt channel. + */ +bool postcopy_target_channel; }; typedef struct PageSearchStatus PageSearchStatus; @@ -496,6 +516,9 @@ static QemuCond decomp_done_cond; static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_add
Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle
On 17/05/22 1:49 pm, Hyman Huang wrote: Thanks Manish for the comment, i'll give my explanation and any supplement are welcomed. Really sorry for such late reply Hyman, this slipped my mind. 在 2022/5/17 1:13, manish.mishra 写道: Hi Hyman Huang, I had few doubts regarding this patch series. For the first point, m'm rudely guessing that you want to figure out how should we set the vcpu dirty limit correctly during live migration to make it convergent. This can be achieved by set a single dirtylimit value on all vcpus, the value need not be equivalent of the half of available bandwidth so precisely since the dirtylimit is sufficient conditions of migration sucess, but not necessary condition. We can set the dirtylimit as the minimum of what user can tolerate, in most case, migration can achieve convergent in advance and do the switchover with the real dirtyrate greater than dirtylimit. This can be implemented because Qemu will check the criteria every iteration, once it meet the condition, Qemu will do the switch over no matter what convergent algo is. Yes got it Hyman, my question was in direction that if we control dirty rate per vcpu, total dirty of VM become very unpredictable. For example if we have set dirty rate limit of each vcpu 50MBps for 10vcpu VM. Then total dirty rate of VM can be anywhere from 0-500MBps based on how many vcpu are active and how much. So if we had dirty rate limit control per VM it would have been much more predictable for user to use. I mean we can keep account of total dirty rate of VM and individual dirty rate and then assign throttle_sleep according to their weights to keep total dirty rate within limit of per vm dirty rate limit. But definately it can be targetted in future and should not be a blocker for now. 1. Why we choose for dirty rate limit per vcpu. I mean it becomes very hard for user to decide per vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 vcpu vm. 
Now if someone wants to keep criteria for convergence as total dirty rate of VM should be lesser than half of available bandwidth. For this case to ensure convergence user has to give dirty rate limit per vcpu as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 thread which is actively dirtying memory, in that case so much of available quota will be wasted. This is a good and frequent question about dirtylimit, as mentioned above, throttle occurs only when dirty ring full and exit to user space. A vcpu is set up with dirtylimit during live migration, but it does not dirty memory, it may never get throttled. The dirtylimit only throttle those vcpu who dirty memory and dirtyrate greater then dirtylimit. So would not it be better to use dirty rate limit control per VM instead of vcpu? 2. Also Here we are adaptively trying to adjust sleep time based on current obsered dirty rate and dirty rate limit. Can it be more forceful like. Assume we have dirty rate limit of 10pages per sec and auto-converge/ dirty rate limit was triggered at time 0. Now at any point of time assume at time 10 sec if number of pages dirtyed are more than 100pages we sleep for interpolated amount of time. Basically at every dirty ring exit we can check if current number of pages dirtied are more than what should be allowed by this time? Yes, indeed, but as memtioned above, if dirty ring exit, it give Qemu a hint that vcpu is dirting memory, we should check it. I post the series of dirtylimit capability for RFC, may be it can help me to explain the usage of vcpu dirty limit, it can be found here: https://urldefense.proofpoint.com/v2/url?u=https-3A__lore.kernel.org_qemu-2Ddevel_cover.1652762652.git.huangy81-40chinatelecom.cn_&d=DwIDaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=eTtzbPA0FcwY1xwq3KPGhj-Nk5zT41MwAjVGH8a-yeQokG7j3pJxtGsFVCzMDH2X&s=iitKUTNXv8Xkvs-n-K1Aow8MxLEP64RdTXw532_oLIY&e= Thanks, Yong thanks I read this. 
Also i had few additional things in mind. 1. I see there is no limit on cpu->throttle_us_per_full. I see below line but then ring_full_time_us can be very high value so in some rare cases cpu->throttle_us_per_full can be very high. I know few database applications which can not tolerate continous sleep of more than 2 secs. I agree user should not configure very low dirty rate limit to avoid such situation but then user may not have enough idea of algorithm so better we keep out internal limits? cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full, ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX); 2. Also this algorithm only control or limits dirty rate by guest writes. There can be some memory dirtying done by virtio based devices which is accounted only at qemu level so may not be accounted through dirty rings so do we have plan for that in future? Those are not issue for auto-converge
Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle
Hi Hyman Huang, I had few doubts regarding this patch series. 1. Why we choose for dirty rate limit per vcpu. I mean it becomes very hard for user to decide per vcpu dirty rate limit. For e.g. we have 1Gbps network and 10 vcpu vm. Now if someone wants to keep criteria for convergence as total dirty rate of VM should be lesser than half of available bandwidth. For this case to ensure convergence user has to give dirty rate limit per vcpu as 1Gbps/ 2 / 10 = 50Mbps. But assume then that VM has only 1 thread which is actively dirtying memory, in that case so much of available quota will be wasted. So would not it be better to use dirty rate limit control per VM instead of vcpu? 2. Also Here we are adaptively trying to adjust sleep time based on current obsered dirty rate and dirty rate limit. Can it be more forceful like. Assume we have dirty rate limit of 10pages per sec and auto-converge/ dirty rate limit was triggered at time 0. Now at any point of time assume at time 10 sec if number of pages dirtyed are more than 100pages we sleep for interpolated amount of time. Basically at every dirty ring exit we can check if current number of pages dirtied are more than what should be allowed by this time? thanks Manish Mishra On 02/03/22 11:25 pm, huang...@chinatelecom.cn wrote: From: Hyman Huang(黄勇) Setup a negative feedback system when vCPU thread handling KVM_EXIT_DIRTY_RING_FULL exit by introducing throttle_us_per_full field in struct CPUState. Sleep throttle_us_per_full microseconds to throttle vCPU if dirtylimit is in service. 
Signed-off-by: Hyman Huang(黄勇) Reviewed-by: Peter Xu --- accel/kvm/kvm-all.c | 19 ++- include/hw/core/cpu.h | 6 + include/sysemu/dirtylimit.h | 15 +++ softmmu/dirtylimit.c| 291 softmmu/trace-events| 7 ++ 5 files changed, 337 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 8821d80..98e43e6 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -45,6 +45,7 @@ #include "qemu/guest-random.h" #include "sysemu/hw_accel.h" #include "kvm-cpus.h" +#include "sysemu/dirtylimit.h" #include "hw/boards.h" @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) cpu->kvm_state = s; cpu->vcpu_dirty = true; cpu->dirty_pages = 0; +cpu->throttle_us_per_full = 0; mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { @@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data) */ sleep(1); +/* keep sleeping so that dirtylimit not be interfered by reaper */ +if (dirtylimit_in_service()) { +continue; +} + trace_kvm_dirty_ring_reaper("wakeup"); r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING; @@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu) */ trace_kvm_dirty_ring_full(cpu->cpu_index); qemu_mutex_lock_iothread(); -kvm_dirty_ring_reap(kvm_state, NULL); +/* We throttle vCPU by making it sleep once it exit from kernel + * due to dirty ring full. In the dirtylimit scenario, reaping + * all vCPUs after a single vCPU dirty ring get full result in + * the miss of sleep, so just reap the ring-fulled vCPU. 
+ */ +if (dirtylimit_in_service()) { +kvm_dirty_ring_reap(kvm_state, cpu); +} else { +kvm_dirty_ring_reap(kvm_state, NULL); +} qemu_mutex_unlock_iothread(); +dirtylimit_vcpu_execute(cpu); ret = 0; break; case KVM_EXIT_SYSTEM_EVENT: diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 76ab3b8..dbeb31a 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -411,6 +411,12 @@ struct CPUState { */ bool throttle_thread_scheduled; +/* + * Sleep throttle_us_per_full microseconds once dirty ring is full + * if dirty page rate limit is enabled. + */ +int64_t throttle_us_per_full; + bool ignore_memory_transaction_failures; /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */ diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h index da459f0..8d2c1f3 100644 --- a/include/sysemu/dirtylimit.h +++ b/include/sysemu/dirtylimit.h @@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void); void vcpu_dirty_rate_stat_stop(void); void vcpu_dirty_rate_stat_initialize(void); void vcpu_dirty_rate_stat_finalize(void); + +void dirtylimit_state_lock(void); +void dirtylimit_state_unlock(void); +void dirtylimit_state_initialize(void); +void dirtylimit_state_finalize(void); +bool dirtylimit_in_service(void); +bool dirtylimit_vcpu_index_valid(int cpu_index); +void dirtylimit_process(void); +void dirtyl
Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled
On 16/05/22 8:21 pm, manish.mishra wrote: On 16/05/22 7:41 pm, Peter Xu wrote: Hi, Manish, On Mon, May 16, 2022 at 07:01:35PM +0530, manish.mishra wrote: On 26/04/22 5:08 am, Peter Xu wrote: LGTM, Peter, I wanted to give review-tag for this and ealier patch too. I am new to qemu review process so not sure how give review-tag, did not find any reference on google too. So if you please let me know how to do it. It's here: https://urldefense.proofpoint.com/v2/url?u=https-3A__git.qemu.org_-3Fp-3Dqemu.git-3Ba-3Dblob-3Bf-3Ddocs_devel_submitting-2Da-2Dpatch.rst-3Bhb-3DHEAD-23l492&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=8LU6rphEJ5GMSXEpSxe8JZ_hpn6TQDUXfjWM6Vt7DdShxnU3X5zYXbAMBLPYchdK&s=TUNUCtdl7LWhrdlfnIx1F08kC0d9IMvArl6cNMpfXkc&e= Since afaict QEMU is mostly following what Linux does, you can also reference to this one with more context: https://urldefense.proofpoint.com/v2/url?u=https-3A__www.kernel.org_doc_html_v4.17_process_submitting-2Dpatches.html-23using-2Dreported-2Dby-2Dtested-2Dby-2Dreviewed-2Dby-2Dsuggested-2Dby-2Dand-2Dfixes&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=8LU6rphEJ5GMSXEpSxe8JZ_hpn6TQDUXfjWM6Vt7DdShxnU3X5zYXbAMBLPYchdK&s=TJmr_eC4LAccVY1EqgkLleXfJhUgtIjTJmLc3cedYr0&e= But since you're still having question regarding this patch, no rush on providing your R-bs; let's finish the discussion first. [...] +static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis) +{ +trace_postcopy_pause_fast_load(); +qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex); I may have misunderstood synchronisation here but very very rare chance, as both threads are working independently is it possible qemu_sem_post on postcopy_pause_sem_fast_load is called by main thread even before we go to qemu_sem_wait in next line, causing some kind of deadlock. That's should not be possible as i understand it requires manually calling qmp_migration_recover so chances are almost impossible. 
Just wanted to confirm it. Sorry I don't quite get the question, could you elaborate? E.g., when the described deadlock happened, what is both main thread and preempt load thread doing? What are they waiting at? Note: we have already released mutex before waiting on sem. What I meant here is that the deadlock could be due to the reason that we infinitely wait on qemu_sem_wait(&mis->postcopy_pause_sem_fast_load), because the main thread already called post on postcopy_pause_sem_fast_load after recovery even before we moved to qemu_sem_wait(&mis->postcopy_pause_sem_fast_load) in the next line. Basically if we miss a post on postcopy_pause_sem_fast_load. This is a nearly impossible case because it requires the full recovery path to be completed before this thread executes just the next line. Also as recovery needs to be called manually, so please ignore this. Basically I wanted to check if we should use something like int pthread_cond_wait(pthread_cond_t *restrict cond, pthread_mutex_t *restrict mutex); so that there is no race between releasing the mutex and calling wait. Really sorry, please ignore this; it is sem_post and sem_wait, so that is not possible. +qemu_sem_wait(&mis->postcopy_pause_sem_fast_load); Just wanted to confirm why postcopy_pause_incoming is not called here itself. postcopy_pause_incoming() is only used in the main ram load thread, while this function (postcopy_pause_ram_fast_load) is only called by the preempt load thread. ok got it, thanks Peter, I meant whether we should close both the channels as soon as we realize there is some failure instead of the main thread waiting for an error event and then closing and pausing post-copy. But agree the current approach is good. Is it based on the assumption that if there is an error in any of the channels it will eventually be paused on the source side, closing both channels, resulting in postcopy_pause_incoming being called from the main thread on the destination? Yes. Usually it should be good to call as early as possible. 
It is left to main thread default path so that we do not have any synchronisation overhead? What's the sync overhead you mentioned? What we want to do here is simply to put all the dest QEMU migration threads into a halted state rather than quitting, so that they can be continued when necessary. Also Peter, i was trying to understand postcopy recovery model so is use case of qmp_migrate_pause just for debugging purpose? Yes. It's also a way to cleanly stop using the network (comparing to force unplug the nic ports?) for whatever reason with a shutdown() syscall upon the socket. I just don't know whether there's any real use case of that in reality. Thanks,
Re: [PATCH v5 00/21] migration: Postcopy Preemption
On 26/04/22 5:08 am, Peter Xu wrote: This is v5 of postcopy preempt series. It can also be found here: https://github.com/xzpeter/qemu/tree/postcopy-preempt RFC: https://lore.kernel.org/qemu-devel/20220119080929.39485-1-pet...@redhat.com V1: https://lore.kernel.org/qemu-devel/20220216062809.57179-1-pet...@redhat.com V2: https://lore.kernel.org/qemu-devel/20220301083925.33483-1-pet...@redhat.com V3: https://lore.kernel.org/qemu-devel/20220330213908.26608-1-pet...@redhat.com V4: https://lore.kernel.org/qemu-devel/20220331150857.74406-1-pet...@redhat.com v4->v5 changelog: - Fixed all checkpatch.pl warnings - Picked up leftover patches from Dan's tls test case series: https://lore.kernel.org/qemu-devel/20220310171821.3724080-1-berra...@redhat.com/ - Rebased to v7.0.0 tag, collected more R-bs from Dave/Dan - In migrate_fd_cleanup(), use g_clear_pointer() for s->hostname [Dan] - Mark postcopy-preempt capability for 7.1 not 7.0 [Dan] - Moved migrate_channel_requires_tls() into tls.[ch] [Dan] - Mention the bug-fixing side effect of patch "migration: Export tls-[creds|hostname|authz] params to cmdline too" on tls_authz [Dan] - Use g_autoptr where proper [Dan] - Drop a few (probably over-cautious) asserts on local_err being set [Dan] - Quite a few renamings in the qtest in the last few test patches [Dan] Abstract This series contains two parts now: (1) Leftover patches from Dan's tls unit tests v2, which is first half (2) Leftover patches from my postcopy preempt v4, which is second half This series added a new migration capability called "postcopy-preempt". It can be enabled when postcopy is enabled, and it'll simply (but greatly) speed up postcopy page requests handling process. Below are some initial postcopy page request latency measurements after the new series applied. 
For each page size, I measured page request latency for three cases: (a) Vanilla:the old postcopy (b) Preempt no-break-huge: preempt enabled, x-postcopy-preempt-break-huge=off (c) Preempt full: preempt enabled, x-postcopy-preempt-break-huge=on (this is the default option when preempt enabled) Here x-postcopy-preempt-break-huge parameter is just added in v2 so as to conditionally disable the behavior to break sending a precopy huge page for debugging purpose. So when it's off, postcopy will not preempt precopy sending a huge page, but still postcopy will use its own channel. I tested it separately to give a rough idea on which part of the change helped how much of it. The overall benefit should be the comparison between case (a) and (c). |---+-+---+--| | Page size | Vanilla | Preempt no-break-huge | Preempt full | |---+-+---+--| | 4K| 10.68 | N/A [*] | 0.57 | | 2M| 10.58 | 5.49 | 5.02 | | 1G| 2046.65 | 933.185 | 649.445 | |---+-+---+--| [*]: This case is N/A because 4K page does not contain huge page at all [1] https://github.com/xzpeter/small-stuffs/blob/master/tools/huge_vm/uffd-latency.bpf Hi Peter, Just wanted understand what setup was used for these experiments like number of vcpu, workload, network bandwidth so that i can make sense of these numbers. Also i could not understand reason for so much difference between preempt full and Preempt no-break-huge especially for 1G case, so if you please share little more details on this. TODO List = Avoid precopy write() blocks postcopy - I didn't prove this, but I always think the write() syscalls being blocked for precopy pages can affect postcopy services. If we can solve this problem then my wild guess is we can further reduce the average page latency. 
Two solutions at least in mind: (1) we could have made the write side of the migration channel NON_BLOCK too, or (2) multi-threads on send side, just like multifd, but we may use lock to protect which page to send too (e.g., the core idea is we should _never_ rely anything on the main thread, multifd has that dependency on queuing pages only on main thread). That can definitely be done and thought about later. Multi-channel for preemption threads Currently the postcopy preempt feature use only one extra channel and one extra thread on dest (no new thread on src QEMU). It should be mostly good enough for major use cases, but when the postcopy queue is long enough (e.g. hundreds of vCPUs faulted on different pages) logically we could still observe more delays in average. Whether growing threads/channels can solve it is debatable, but sounds worthwhile a try. That's yet another thing we can think about after this patchset lands. Logically the design provides space for that -
Re: [PATCH v5 15/21] migration: Parameter x-postcopy-preempt-break-huge
On 26/04/22 5:08 am, Peter Xu wrote: Add a parameter that can conditionally disable the "break sending huge page" behavior in postcopy preemption. By default it's enabled. It should only be used for debugging purposes, and we should never remove the "x-" prefix. Signed-off-by: Peter Xu Reviewed-by: Manish Mishra --- migration/migration.c | 2 ++ migration/migration.h | 7 +++ migration/ram.c | 7 +++ 3 files changed, 16 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index cce741e20e..cd9650f2e2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -4329,6 +4329,8 @@ static Property migration_properties[] = { DEFINE_PROP_SIZE("announce-step", MigrationState, parameters.announce_step, DEFAULT_MIGRATE_ANNOUNCE_STEP), +DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState, + postcopy_preempt_break_huge, true), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), diff --git a/migration/migration.h b/migration/migration.h index f898b8547a..6ee520642f 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -340,6 +340,13 @@ struct MigrationState { bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; +/* + * Whether we allow break sending huge pages when postcopy preempt is + * enabled. When disabled, we won't interrupt precopy within sending a + * host huge page, which is the old behavior of vanilla postcopy. + * NOTE: this parameter is ignored if postcopy preempt is not enabled. 
+ */ +bool postcopy_preempt_break_huge; /* Needed by postcopy-pause state */ QemuSemaphore postcopy_pause_sem; diff --git a/migration/ram.c b/migration/ram.c index a4b39e3675..f3a79c8556 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2266,11 +2266,18 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss) { +MigrationState *ms = migrate_get_current(); + /* Not enabled eager preempt? Then never do that. */ if (!migrate_postcopy_preempt()) { return false; } +/* If the user explicitly disabled breaking of huge page, skip */ +if (!ms->postcopy_preempt_break_huge) { +return false; +} + /* If the ramblock we're sending is a small page? Never bother. */ if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) { return false;
Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled
On 16/05/22 7:41 pm, Peter Xu wrote: Hi, Manish, On Mon, May 16, 2022 at 07:01:35PM +0530, manish.mishra wrote: On 26/04/22 5:08 am, Peter Xu wrote: LGTM, Peter, I wanted to give review-tag for this and ealier patch too. I am new to qemu review process so not sure how give review-tag, did not find any reference on google too. So if you please let me know how to do it. It's here: https://urldefense.proofpoint.com/v2/url?u=https-3A__git.qemu.org_-3Fp-3Dqemu.git-3Ba-3Dblob-3Bf-3Ddocs_devel_submitting-2Da-2Dpatch.rst-3Bhb-3DHEAD-23l492&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=8LU6rphEJ5GMSXEpSxe8JZ_hpn6TQDUXfjWM6Vt7DdShxnU3X5zYXbAMBLPYchdK&s=TUNUCtdl7LWhrdlfnIx1F08kC0d9IMvArl6cNMpfXkc&e= Since afaict QEMU is mostly following what Linux does, you can also reference to this one with more context: https://urldefense.proofpoint.com/v2/url?u=https-3A__www.kernel.org_doc_html_v4.17_process_submitting-2Dpatches.html-23using-2Dreported-2Dby-2Dtested-2Dby-2Dreviewed-2Dby-2Dsuggested-2Dby-2Dand-2Dfixes&d=DwIBaQ&c=s883GpUCOChKOHiocYtGcg&r=c4KON2DiMd-szjwjggQcuUvTsPWblztAL0gVzaHnNmc&m=8LU6rphEJ5GMSXEpSxe8JZ_hpn6TQDUXfjWM6Vt7DdShxnU3X5zYXbAMBLPYchdK&s=TJmr_eC4LAccVY1EqgkLleXfJhUgtIjTJmLc3cedYr0&e= But since you're still having question regarding this patch, no rush on providing your R-bs; let's finish the discussion first. [...] +static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis) +{ +trace_postcopy_pause_fast_load(); +qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex); I may have misunderstood synchronisation here but very very rare chance, as both threads are working independently is it possible qemu_sem_post on postcopy_pause_sem_fast_load is called by main thread even before we go to qemu_sem_wait in next line, causing some kind of deadlock. That's should not be possible as i understand it requires manually calling qmp_migration_recover so chances are almost impossible. Just wanted to confirm it. 
Sorry I don't quite get the question, could you elaborate? E.g., when the described deadlock happened, what is both main thread and preempt load thread doing? What are they waiting at? Note: we have already released mutex before waiting on sem. What i meant here is deadlock could be due the reason that we infinately wait on qemu_sem_wait(&mis->postcopy_pause_sem_fast_load), because main thread already called post on postcopy_pause_sem_fast_load after recovery even before we moved to qemu_sem_wait(&mis->postcopy_pause_sem_fast_load) in next line. Basically if we miss a post on postcopy_pause_sem_fast_load. This is nearly impossibily case becuase it requires full recovery path to be completed before this thread executes just next line. Also as recovery needs to be called manually, So please ignore this. Basically i wanted to check if we should use something like int pthread_cond_wait(pthread_cond_t *restrict cond, pthread_mutex_t *restrict mutex); so that there is no race between releasing mutex and calling wait. +qemu_sem_wait(&mis->postcopy_pause_sem_fast_load); Just wanted to confirm why postcopy_pause_incoming is not called here itself. postcopy_pause_incoming() is only used in the main ram load thread, while this function (postcopy_pause_ram_fast_load) is only called by the preempt load thread. ok got it, thanks Peter, i meant if we should close both the channels as soon as we relise there is some failure instead of main thread waiting for error event and then closing and pausing post-copy. But agree current approach is good. Is it based on assumption that if there is error in any of the channel it will eventually be paused on source side, closing both channels, resulting postcopy_pause_incoming will be called from main thread on destination? Yes. Usually it should be good to call as early as possible. It is left to main thread default path so that we do not have any synchronisation overhead? What's the sync overhead you mentioned? 
What we want to do here is simply to put all the dest QEMU migration threads into a halted state rather than quitting, so that they can be continued when necessary. Also Peter, i was trying to understand postcopy recovery model so is use case of qmp_migrate_pause just for debugging purpose? Yes. It's also a way to cleanly stop using the network (comparing to force unplug the nic ports?) for whatever reason with a shutdown() syscall upon the socket. I just don't know whether there's any real use case of that in reality. Thanks,
Re: [PATCH v5 14/21] migration: Create the postcopy preempt channel asynchronously
On 26/04/22 5:08 am, Peter Xu wrote: This patch allows the postcopy preempt channel to be created asynchronously. The benefit is that when the connection is slow, we won't take the BQL (and potentially block all things like QMP) for a long time without releasing. A function postcopy_preempt_wait_channel() is introduced, allowing the migration thread to be able to wait on the channel creation. The channel is always created by the main thread, in which we'll kick a new semaphore to tell the migration thread that the channel has created. We'll need to wait for the new channel in two places: (1) when there's a new postcopy migration that is starting, or (2) when there's a postcopy migration to resume. For the start of migration, we don't need to wait for this channel until when we want to start postcopy, aka, postcopy_start(). We'll fail the migration if we found that the channel creation failed (which should probably not happen at all in 99% of the cases, because the main channel is using the same network topology). For a postcopy recovery, we'll need to wait in postcopy_pause(). In that case if the channel creation failed, we can't fail the migration or we'll crash the VM, instead we keep in PAUSED state, waiting for yet another recovery. Signed-off-by: Peter Xu Reviewed-by: Manish Mishra This also looks good to me, sorry if i mis-understood and this is not correct way to add review tag. 
--- migration/migration.c| 16 migration/migration.h| 7 + migration/postcopy-ram.c | 56 +++- migration/postcopy-ram.h | 1 + 4 files changed, 68 insertions(+), 12 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index a0db5de685..cce741e20e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3020,6 +3020,12 @@ static int postcopy_start(MigrationState *ms) int64_t bandwidth = migrate_max_postcopy_bandwidth(); bool restart_block = false; int cur_state = MIGRATION_STATUS_ACTIVE; + +if (postcopy_preempt_wait_channel(ms)) { +migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); +return -1; +} + if (!migrate_pause_before_switchover()) { migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); @@ -3501,6 +3507,14 @@ static MigThrError postcopy_pause(MigrationState *s) if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { /* Woken up by a recover procedure. Give it a shot */ +if (postcopy_preempt_wait_channel(s)) { +/* + * Preempt enabled, and new channel create failed; loop + * back to wait for another recovery. + */ +continue; +} + /* * Firstly, let's wake up the return path now, with a new * return path channel. 
@@ -4360,6 +4374,7 @@ static void migration_instance_finalize(Object *obj) qemu_sem_destroy(&ms->postcopy_pause_sem); qemu_sem_destroy(&ms->postcopy_pause_rp_sem); qemu_sem_destroy(&ms->rp_state.rp_sem); +qemu_sem_destroy(&ms->postcopy_qemufile_src_sem); error_free(ms->error); } @@ -4406,6 +4421,7 @@ static void migration_instance_init(Object *obj) qemu_sem_init(&ms->rp_state.rp_sem, 0); qemu_sem_init(&ms->rate_limit_sem, 0); qemu_sem_init(&ms->wait_unplug_sem, 0); +qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0); qemu_mutex_init(&ms->qemu_file_lock); } diff --git a/migration/migration.h b/migration/migration.h index 91f845e9e4..f898b8547a 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -219,6 +219,13 @@ struct MigrationState { QEMUFile *to_dst_file; /* Postcopy specific transfer channel */ QEMUFile *postcopy_qemufile_src; +/* + * It is posted when the preempt channel is established. Note: this is + * used for both the start or recover of a postcopy migration. We'll + * post to this sem every time a new preempt channel is created in the + * main thread, and we keep post() and wait() in pair. + */ +QemuSemaphore postcopy_qemufile_src_sem; QIOChannelBuffer *bioc; /* * Protects to_dst_file/from_dst_file pointers. We need to make sure we diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index b3c81b46f6..1bb603051a 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -1552,10 +1552,50 @@ bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) return true; } -int postcopy_preempt_setup(MigrationState *s, Error **errp) +static void +postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque) { -QIOChannel *ioc; +MigrationState *s = opaque; +QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); +Error *local_err = NULL; + +if (qio_task_propagate_err
Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled
On 26/04/22 5:08 am, Peter Xu wrote: LGTM, Peter, I wanted to give review-tag for this and ealier patch too. I am new to qemu review process so not sure how give review-tag, did not find any reference on google too. So if you please let me know how to do it. To allow postcopy recovery, the ram fast load (preempt-only) dest QEMU thread needs similar handling on fault tolerance. When ram_load_postcopy() fails, instead of stopping the thread it halts with a semaphore, preparing to be kicked again when recovery is detected. A mutex is introduced to make sure there's no concurrent operation upon the socket. To make it simple, the fast ram load thread will take the mutex during its whole procedure, and only release it if it's paused. The fast-path socket will be properly released by the main loading thread safely when there's network failures during postcopy with that mutex held. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu --- migration/migration.c| 27 +++ migration/migration.h| 19 +++ migration/postcopy-ram.c | 25 +++-- migration/qemu-file.c| 27 +++ migration/qemu-file.h| 1 + migration/savevm.c | 26 -- migration/trace-events | 2 ++ 7 files changed, 119 insertions(+), 8 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 8264b03d4d..a0db5de685 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -215,9 +215,11 @@ void migration_object_init(void) current_incoming->postcopy_remote_fds = g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD)); qemu_mutex_init(¤t_incoming->rp_mutex); +qemu_mutex_init(¤t_incoming->postcopy_prio_thread_mutex); qemu_event_init(¤t_incoming->main_thread_load_event, false); qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0); qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0); +qemu_sem_init(¤t_incoming->postcopy_pause_sem_fast_load, 0); qemu_mutex_init(¤t_incoming->page_request_mutex); current_incoming->page_requested = g_tree_new(page_request_addr_cmp); @@ -697,9 +699,9 @@ static bool 
postcopy_try_recover(void) /* * Here, we only wake up the main loading thread (while the - * fault thread will still be waiting), so that we can receive + * rest threads will still be waiting), so that we can receive * commands from source now, and answer it if needed. The - * fault thread will be woken up afterwards until we are sure + * rest threads will be woken up afterwards until we are sure * that source is ready to reply to page requests. */ qemu_sem_post(&mis->postcopy_pause_sem_dst); @@ -3470,6 +3472,18 @@ static MigThrError postcopy_pause(MigrationState *s) qemu_file_shutdown(file); qemu_fclose(file); +/* + * Do the same to postcopy fast path socket too if there is. No + * locking needed because no racer as long as we do this before setting + * status to paused. + */ +if (s->postcopy_qemufile_src) { +migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src); +qemu_file_shutdown(s->postcopy_qemufile_src); +qemu_fclose(s->postcopy_qemufile_src); +s->postcopy_qemufile_src = NULL; +} + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_POSTCOPY_PAUSED); @@ -3525,8 +3539,13 @@ static MigThrError migration_detect_error(MigrationState *s) return MIG_THR_ERR_FATAL; } -/* Try to detect any file errors */ -ret = qemu_file_get_error_obj(s->to_dst_file, &local_error); +/* + * Try to detect any file errors. Note that postcopy_qemufile_src will + * be NULL when postcopy preempt is not enabled. 
+ */ +ret = qemu_file_get_error_obj_any(s->to_dst_file, + s->postcopy_qemufile_src, + &local_error); if (!ret) { /* Everything is fine */ assert(!local_error); diff --git a/migration/migration.h b/migration/migration.h index b8aacfe3af..91f845e9e4 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -118,6 +118,18 @@ struct MigrationIncomingState { /* Postcopy priority thread is used to receive postcopy requested pages */ QemuThread postcopy_prio_thread; bool postcopy_prio_thread_created; +/* + * Used to sync between the ram load main thread and the fast ram load + * thread. It protects postcopy_qemufile_dst, which is the postcopy + * fast channel. + * + * The ram fast load thread will take it mostly for the whole lifecycle + * because it needs to continuously read data from the channel, and + * it'll only release this mutex if postcopy is interrupted,
Re: [PATCH v4 10/19] migration: Postcopy preemption enablement
On 12/05/22 9:52 pm, Peter Xu wrote: Hi, Manish, On Wed, May 11, 2022 at 09:24:28PM +0530, manish.mishra wrote: @@ -1962,9 +2038,17 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) RAMBlock *block; ram_addr_t offset; +again: block = unqueue_page(rs, &offset); -if (!block) { +if (block) { +/* See comment above postcopy_preempted_contains() */ +if (postcopy_preempted_contains(rs, block, offset)) { +trace_postcopy_preempt_hit(block->idstr, offset); +/* This request is dropped */ +goto again; +} If we continuosly keep on getting new post-copy request. Is it possible this case can starve post-copy request which is in precopy preemtion? I didn't fully get your thoughts, could you elaborate? Here we're checking against the case where the postcopy requested page is exactly the one that we have preempted in previous precopy sessions. If true, we drop this postcopy page and continue with the rest. When there'll be no postcopy requests pending then we'll continue with the precopy page, which is exactly the request we've dropped. Why we did this is actually in comment above postcopy_preempted_contains(), and quotting from there: /* * This should really happen very rarely, because it means when we were sending * during background migration for postcopy we're sending exactly the page that * some vcpu got faulted on on dest node. When it happens, we probably don't * need to do much but drop the request, because we know right after we restore * the precopy stream it'll be serviced. It'll slightly affect the order of * postcopy requests to be serviced (e.g. it'll be the same as we move current * request to the end of the queue) but it shouldn't be a big deal. The most * imporant thing is we can _never_ try to send a partial-sent huge page on the * POSTCOPY channel again, otherwise that huge page will got "split brain" on * two channels (PRECOPY, POSTCOPY). */ [...] 
Hi Peter, what I meant here is that as we go to precopy sending only when there is no post-copy request left, so if there is some workload which is continuously generating new post-copy fault requests, it may take very long before we resume on the precopy channel. So basically the precopy channel may have a post-copy request pending for very long in this case? Earlier as it was FCFS there was a guarantee a post-copy request will be served after a fixed amount of time. @@ -2211,7 +2406,16 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } +if (migrate_postcopy_preempt() && migration_in_postcopy()) { I see why there is only one extra channel, multiFD is not supported for postcopy. Peter, any particular reason for that? We used one channel not because multifd is not enabled - if you read into the series the channels are separately managed because they're servicing different goals. It's because I don't really know whether multiple channels would be necessary, because postcopy requests should not be the major channel that pages will be sent, kind of a fast-path. One of the major goals of this series is to avoid interruptions made to postcopy urgent pages due to sending of precopy pages. One extra channel already serviced it well, so I just stopped there as the initial version. I actually raised that question myself too in the cover letter in the todo section, I think we can always evaluate the possibility of that in the future without major reworks (but we may need another parameter to specify the num of threads just like multifd). >because postcopy requests should not be the major channel that pages will be sent, kind of a fast-path. Yes, agree Peter, but in the worst case scenario it is possible we may have to transfer the full memory of the VM by post-copy requests? So in that case we may require a higher number of threads. 
But agree there cannot be a binding with the number of multiFD channels, as multiFD uses a 256KB buffer size but here we may have to use 4k in the small page case, so there can be a big difference in throughput limits. Also, the smaller the buffer size, the much higher the CPU usage will be, so it needs to be decided carefully. As it must be very bad without multiFD — we have seen we cannot utilise a NIC higher than 10 Gbps without multiFD. If it is something in TODO can we help with that? Yes, that should be on Juan's todo list (in the cc list as well), and AFAICT he'll be happy if anyone would like to take items out of the list. We can further discuss it somewhere. One thing to mention is that I suspect the thread models will still need to be separate even if multifd joins the equation. I mean, IMHO multifd threads take chunks of pages and send things in bulk, while if you read into this series postcopy preempt threads send page one by one and asap. T
Re: [PATCH v5 11/21] migration: Postcopy preemption preparation on channel creation
LGTM On 26/04/22 5:08 am, Peter Xu wrote: Create a new socket for postcopy to be prepared to send postcopy requested pages via this specific channel, so as to not get blocked by precopy pages. A new thread is also created on dest qemu to receive data from this new channel based on the ram_load_postcopy() routine. The ram_load_postcopy(POSTCOPY) branch and the thread has not started to function, and that'll be done in follow up patches. Cleanup the new sockets on both src/dst QEMUs, meanwhile look after the new thread too to make sure it'll be recycled properly. Signed-off-by: Peter Xu Reviewed-by: Daniel P. Berrangé --- migration/migration.c| 62 +++ migration/migration.h| 8 migration/postcopy-ram.c | 92 ++-- migration/postcopy-ram.h | 10 + migration/ram.c | 25 --- migration/ram.h | 4 +- migration/savevm.c | 20 - migration/socket.c | 22 +- migration/socket.h | 1 + migration/trace-events | 5 ++- 10 files changed, 218 insertions(+), 31 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 75d9185c3a..e27aa306bc 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -321,6 +321,12 @@ void migration_incoming_state_destroy(void) mis->page_requested = NULL; } +if (mis->postcopy_qemufile_dst) { +migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst); +qemu_fclose(mis->postcopy_qemufile_dst); +mis->postcopy_qemufile_dst = NULL; +} + yank_unregister_instance(MIGRATION_YANK_INSTANCE); } @@ -714,15 +720,21 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp) migration_incoming_process(); } +static bool migration_needs_multiple_sockets(void) +{ +return migrate_use_multifd() || migrate_postcopy_preempt(); +} + void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; bool start_migration; +QEMUFile *f; if (!mis->from_src_file) { /* The first connection (multifd may have multiple) */ -QEMUFile *f = 
qemu_fopen_channel_input(ioc); +f = qemu_fopen_channel_input(ioc); if (!migration_incoming_setup(f, errp)) { return; @@ -730,13 +742,18 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) /* * Common migration only needs one channel, so we can start - * right now. Multifd needs more than one channel, we wait. + * right now. Some features need more than one channel, we wait. */ -start_migration = !migrate_use_multifd(); +start_migration = !migration_needs_multiple_sockets(); } else { /* Multiple connections */ -assert(migrate_use_multifd()); -start_migration = multifd_recv_new_channel(ioc, &local_err); +assert(migration_needs_multiple_sockets()); +if (migrate_use_multifd()) { +start_migration = multifd_recv_new_channel(ioc, &local_err); +} else if (migrate_postcopy_preempt()) { +f = qemu_fopen_channel_input(ioc); +start_migration = postcopy_preempt_new_channel(mis, f); +} if (local_err) { error_propagate(errp, local_err); return; @@ -761,11 +778,20 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) bool migration_has_all_channels(void) { MigrationIncomingState *mis = migration_incoming_get_current(); -bool all_channels; -all_channels = multifd_recv_all_channels_created(); +if (!mis->from_src_file) { +return false; +} + +if (migrate_use_multifd()) { +return multifd_recv_all_channels_created(); +} + +if (migrate_postcopy_preempt()) { +return mis->postcopy_qemufile_dst != NULL; +} -return all_channels && mis->from_src_file != NULL; +return true; } /* @@ -1862,6 +1888,12 @@ static void migrate_fd_cleanup(MigrationState *s) qemu_fclose(tmp); } +if (s->postcopy_qemufile_src) { +migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src); +qemu_fclose(s->postcopy_qemufile_src); +s->postcopy_qemufile_src = NULL; +} + assert(!migration_is_active(s)); if (s->state == MIGRATION_STATUS_CANCELLING) { @@ -3237,6 +3269,11 @@ static void migration_completion(MigrationState *s) qemu_savevm_state_complete_postcopy(s->to_dst_file); 
qemu_mutex_unlock_iothread(); +/* Shutdown the postcopy fast path thread */ +if (migrate_postcopy_preempt()) { +postcopy_preempt_shutdown_file(s); +} + trace_migration_completion_postcopy_end_after_complete(); } else { goto fail; @@ -41
Re: [PATCH v4 10/19] migration: Postcopy preemption enablement
On 31/03/22 8:38 pm, Peter Xu wrote: LGTM This patch enables postcopy-preempt feature. It contains two major changes to the migration logic: (1) Postcopy requests are now sent via a different socket from precopy background migration stream, so as to be isolated from very high page request delays. (2) For huge page enabled hosts: when there's postcopy requests, they can now intercept a partial sending of huge host pages on src QEMU. After this patch, we'll live migrate a VM with two channels for postcopy: (1) PRECOPY channel, which is the default channel that transfers background pages; and (2) POSTCOPY channel, which only transfers requested pages. There's no strict rule of which channel to use, e.g., if a requested page is already being transferred on precopy channel, then we will keep using the same precopy channel to transfer the page even if it's explicitly requested. In 99% of the cases we'll prioritize the channels so we send requested page via the postcopy channel as long as possible. On the source QEMU, when we found a postcopy request, we'll interrupt the PRECOPY channel sending process and quickly switch to the POSTCOPY channel. After we serviced all the high priority postcopy pages, we'll switch back to PRECOPY channel so that we'll continue to send the interrupted huge page again. There's no new thread introduced on src QEMU. On the destination QEMU, one new thread is introduced to receive page data from the postcopy specific socket (done in the preparation patch). This patch has a side effect: after sending postcopy pages, previously we'll assume the guest will access follow up pages so we'll keep sending from there. Now it's changed. Instead of going on with a postcopy requested page, we'll go back and continue sending the precopy huge page (which can be intercepted by a postcopy request so the huge page can be sent partially before). 
Whether that's a problem is debatable, because "assuming the guest will continue to access the next page" may not really suite when huge pages are used, especially if the huge page is large (e.g. 1GB pages). So that locality hint is much meaningless if huge pages are used. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu --- migration/migration.c | 2 + migration/migration.h | 2 +- migration/ram.c| 250 +++-- migration/trace-events | 7 ++ 4 files changed, 252 insertions(+), 9 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 01b882494d..56d54c186b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3158,6 +3158,8 @@ static int postcopy_start(MigrationState *ms) MIGRATION_STATUS_FAILED); } +trace_postcopy_preempt_enabled(migrate_postcopy_preempt()); + return ret; fail_closefb: diff --git a/migration/migration.h b/migration/migration.h index caa910d956..b8aacfe3af 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -68,7 +68,7 @@ typedef struct { struct MigrationIncomingState { QEMUFile *from_src_file; /* Previously received RAM's RAMBlock pointer */ -RAMBlock *last_recv_block; +RAMBlock *last_recv_block[RAM_CHANNEL_MAX]; /* A hook to allow cleanup at the end of incoming migration */ void *transport_data; void (*transport_cleanup)(void *data); diff --git a/migration/ram.c b/migration/ram.c index c7ea1d9215..518d511874 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -295,6 +295,20 @@ struct RAMSrcPageRequest { QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; }; +typedef struct { +/* + * Cached ramblock/offset values if preempted. They're only meaningful if + * preempted==true below. + */ +RAMBlock *ram_block; +unsigned long ram_page; +/* + * Whether a postcopy preemption just happened. Will be reset after + * precopy recovered to background migration. 
+ */ +bool preempted; +} PostcopyPreemptState; + /* State of RAM for migration */ struct RAMState { /* QEMUFile used for this migration */ @@ -349,6 +363,14 @@ struct RAMState { /* Queue of outstanding page requests from the destination */ QemuMutex src_page_req_mutex; QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; + +/* Postcopy preemption informations */ +PostcopyPreemptState postcopy_preempt_state; +/* + * Current channel we're using on src VM. Only valid if postcopy-preempt + * is enabled. + */ +unsigned int postcopy_channel; }; typedef struct RAMState RAMState; @@ -356,6 +378,11 @@ static RAMState *ram_state; static NotifierWithReturnList precopy_notifier_list; +static void postcopy_preempt_reset(RAMState *rs) +{ +memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState)); +} + /* Whether postcopy has queued requests? */ static bool postcopy_has_request(RAMState *rs) { @@ -1947,6 +1974,55 @@ void ram_wr
[PATCH] migration/ram: Avoid throttling VM in the first iteration
In some cases for large size VM, memory_global_dirty_log_sync() and ramblock_sync_dirty_bitmap() combined can take more than 1 sec. As a result diff of end_time and rs->time_last_bitmap_sync can be more than 1 sec even when migration_bitmap_sync is called the first time, setting up the throttle in the first iteration itself. When migration_bitmap_sync is called the first-time num_dirty_pages_period is equal to VM total pages and ram_counters.transferred is zero which forces VM to throttle to 99 from migration start. So we should always check if dirty_sync_count is greater than 1 before trying throttling. Signed-off-by: manish.mishra --- migration/ram.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 7a43bfd7af..9ba1c8b235 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1006,8 +1006,12 @@ static void migration_bitmap_sync(RAMState *rs) end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); -/* more than 1 second = 1000 millisecons */ -if (end_time > rs->time_last_bitmap_sync + 1000) { +/* + * more than 1 second = 1000 millisecons + * Avoid throttling VM in the first iteration of live migration. + */ +if (end_time > rs->time_last_bitmap_sync + 1000 && +ram_counters.dirty_sync_count > 1) { migration_trigger_throttle(rs); migration_update_rates(rs, end_time); -- 2.22.3