[PATCH v1] target/i386: Always set leaf 0x1f

2024-07-24 Thread manish.mishra
From: Manish Mishra 

QEMU does not set leaf 0x1f when the VM does not have an extended CPU
topology and expects guests to fall back to leaf 0xb. Some versions of
Windows do not like this behavior and expect this leaf to be populated;
as a result, the Windows VM fails with a blue screen.

Leaf 0x1f is a superset of 0xb, so it makes sense to set 0x1f equivalent
to 0xb by default and work around the Windows issue. This change adds a
new property 'cpuid-0x1f-enforce' which sets leaf 0x1f equivalent to 0xb
when no extended CPU topology is configured, and behaves as before otherwise.
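
For example, where this new default is not desired, the workaround can
presumably be turned back off per CPU (illustrative command line only,
using the property added below):

  -cpu SapphireRapids-v1,cpuid-0x1f-enforce=off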

Steps to reproduce this issue:
This requires a Windows 10 or 11 VM with Credential Guard and the Hyper-V
role enabled. The issue also only shows up starting with SapphireRapids,
which raised the CPUID level to 0x20 and hence exposes leaf 0x1f to guests.
./qemu-system-x86_64
-drive 
file=/usr/share/OVMF/OVMF_CODE_4MB.secboot.fd,if=pflash,format=raw,unit=0,readonly=on
-drive file=/usr/share/OVMF/OVMF_VARS_4MB.fd,if=pflash,format=raw
-machine pc-q35,smm=on
-global mch.extended-tseg-mbytes=80
-accel kvm
-m 2G
-cpu SapphireRapids-v1,invtsc=on,vmx=on
-smp 2,maxcpus=240,sockets=120,dies=1,cores=2,threads=1
-hda systest_ahv_win10_cg.qcow2

Signed-off-by: Manish Mishra 
---
 hw/i386/pc.c  | 1 +
 target/i386/cpu.c | 7 +--
 target/i386/cpu.h | 5 +
 target/i386/kvm/kvm.c | 4 +++-
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c74931d577..4cab04e443 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -85,6 +85,7 @@ GlobalProperty pc_compat_9_0[] = {
 { TYPE_X86_CPU, "guest-phys-bits", "0" },
 { "sev-guest", "legacy-vm-type", "on" },
 { TYPE_X86_CPU, "legacy-multi-node", "on" },
+{ TYPE_X86_CPU, "cpuid-0x1f-enforce", "false" },
 };
 const size_t pc_compat_9_0_len = G_N_ELEMENTS(pc_compat_9_0);
 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 4688d140c2..7b71083bc9 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6637,7 +6637,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 break;
 case 0x1F:
 /* V2 Extended Topology Enumeration Leaf */
-if (!x86_has_extended_topo(env->avail_cpu_topo)) {
+if (!x86_has_extended_topo(env->avail_cpu_topo) &&
+!cpu->enable_cpuid_0x1f_enforce) {
 *eax = *ebx = *ecx = *edx = 0;
 break;
 }
@@ -7450,7 +7451,8 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
  * cpu->vendor_cpuid_only has been unset for compatibility with older
  * machine types.
  */
-if (x86_has_extended_topo(env->avail_cpu_topo) &&
+if ((x86_has_extended_topo(env->avail_cpu_topo) ||
+ cpu->enable_cpuid_0x1f_enforce) &&
 (IS_INTEL_CPU(env) || !cpu->vendor_cpuid_only)) {
 x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x1F);
 }
@@ -8316,6 +8318,7 @@ static Property x86_cpu_properties[] = {
 DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, 
true),
 DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor),
 DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
+DEFINE_PROP_BOOL("cpuid-0x1f-enforce", X86CPU, enable_cpuid_0x1f_enforce, 
true),
 DEFINE_PROP_BOOL("x-vendor-cpuid-only", X86CPU, vendor_cpuid_only, true),
 DEFINE_PROP_BOOL("x-amd-topoext-features-only", X86CPU, 
amd_topoext_features_only, true),
 DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1e121acef5..49c5641ba8 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2102,6 +2102,11 @@ struct ArchCPU {
 /* Compatibility bits for old machine types: */
 bool enable_cpuid_0xb;
 
+/* Always return values for leaf 0x1f. In cases where extended CPU topology
+ * is not configured, return values equivalent to leaf 0xb.
+ */
+bool enable_cpuid_0x1f_enforce;
+
 /* Enable auto level-increase for all CPUID leaves */
 bool full_cpuid_auto_level;
 
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index becca2efa5..a9c6f02900 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -1799,6 +1799,7 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
 uint32_t limit, i, j;
 uint32_t unused;
 struct kvm_cpuid_entry2 *c;
+X86CPU *cpu = env_archcpu(env);
 
 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
 
@@ -1831,7 +1832,8 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
 break;
 }
 case 0x1f:
-if (!x86_has_extended_topo(env->avail_cpu_topo)) {
+if (!x86_has_extended_topo(env->avail_cpu_topo) &&
+!cpu->enable_cpuid_0x1f_enforce) {
 cpuid_i--;
 break;
 }
-- 
2.43.0




[PATCH] target/i386: Always set leaf 0x1f

2024-07-22 Thread manish.mishra
QEMU does not set leaf 0x1f when the VM does not have an extended CPU topology
and expects guests to fall back to leaf 0xb. Some versions of Windows (i.e.
Windows 10 and 11) do not like this behavior and expect this leaf to be
populated. This is observed with Windows VMs with secure boot, UEFI
and the Hyper-V role enabled.

Leaf 0x1f is a superset of 0xb, so it makes sense to set 0x1f equivalent
to 0xb by default and work around the Windows issue. This change adds a
new property 'cpuid-0x1f-enforce' which sets leaf 0x1f equivalent to 0xb
when no extended CPU topology is present, and behaves as before otherwise.
---
 hw/i386/pc.c  |  1 +
 target/i386/cpu.c | 71 +++
 target/i386/cpu.h |  5 +++
 target/i386/kvm/kvm.c |  4 ++-
 4 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c74931d577..4cab04e443 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -85,6 +85,7 @@ GlobalProperty pc_compat_9_0[] = {
 { TYPE_X86_CPU, "guest-phys-bits", "0" },
 { "sev-guest", "legacy-vm-type", "on" },
 { TYPE_X86_CPU, "legacy-multi-node", "on" },
+{ TYPE_X86_CPU, "cpuid-0x1f-enforce", "false" },
 };
 const size_t pc_compat_9_0_len = G_N_ELEMENTS(pc_compat_9_0);
 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 4688d140c2..f89b2ef335 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -416,6 +416,43 @@ static void encode_topo_cpuid1f(CPUX86State *env, uint32_t 
count,
 assert(!(*eax & ~0x1f));
 }
 
+static void encode_topo_cpuid_b(CPUX86State *env, uint32_t count,
+X86CPUTopoInfo *topo_info,
+uint32_t threads_per_pkg,
+uint32_t *eax, uint32_t *ebx,
+uint32_t *ecx, uint32_t *edx)
+{
+X86CPU *cpu = env_archcpu(env);
+
+if (!cpu->enable_cpuid_0xb) {
+*eax = *ebx = *ecx = *edx = 0;
+return;
+}
+
+*ecx = count & 0xff;
+*edx = cpu->apic_id;
+
+switch (count) {
+case 0:
+*eax = apicid_core_offset(topo_info);
+*ebx = topo_info->threads_per_core;
+*ecx |= CPUID_B_ECX_TOPO_LEVEL_SMT << 8;
+break;
+case 1:
+*eax = apicid_pkg_offset(topo_info);
+*ebx = threads_per_pkg;
+*ecx |= CPUID_B_ECX_TOPO_LEVEL_CORE << 8;
+break;
+default:
+*eax = 0;
+*ebx = 0;
+*ecx |= CPUID_B_ECX_TOPO_LEVEL_INVALID << 8;
+}
+
+assert(!(*eax & ~0x1f));
+*ebx &= 0xffff; /* The count doesn't need to be reliable. */
+}
+
 /* Encode cache info for CPUID[0x8005].ECX or CPUID[0x8005].EDX */
 static uint32_t encode_cache_cpuid8005(CPUCacheInfo *cache)
 {
@@ -6601,33 +6638,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 break;
 case 0xB:
 /* Extended Topology Enumeration Leaf */
-if (!cpu->enable_cpuid_0xb) {
-*eax = *ebx = *ecx = *edx = 0;
-break;
-}
-
-*ecx = count & 0xff;
-*edx = cpu->apic_id;
-
-switch (count) {
-case 0:
-*eax = apicid_core_offset(&topo_info);
-*ebx = topo_info.threads_per_core;
-*ecx |= CPUID_B_ECX_TOPO_LEVEL_SMT << 8;
-break;
-case 1:
-*eax = apicid_pkg_offset(&topo_info);
-*ebx = threads_per_pkg;
-*ecx |= CPUID_B_ECX_TOPO_LEVEL_CORE << 8;
-break;
-default:
-*eax = 0;
-*ebx = 0;
-*ecx |= CPUID_B_ECX_TOPO_LEVEL_INVALID << 8;
-}
-
-assert(!(*eax & ~0x1f));
-*ebx &= 0xffff; /* The count doesn't need to be reliable. */
+encode_topo_cpuid_b(env, count, &topo_info, threads_per_pkg,
+eax, ebx, ecx, edx);
 break;
 case 0x1C:
 if (cpu->enable_pmu && (env->features[FEAT_7_0_EDX] & 
CPUID_7_0_EDX_ARCH_LBR)) {
@@ -6639,6 +6651,10 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 /* V2 Extended Topology Enumeration Leaf */
 if (!x86_has_extended_topo(env->avail_cpu_topo)) {
 *eax = *ebx = *ecx = *edx = 0;
+if (cpu->enable_cpuid_0x1f_enforce) {
+encode_topo_cpuid_b(env, count, &topo_info, threads_per_pkg,
+eax, ebx, ecx, edx);
+}
 break;
 }
 
@@ -8316,6 +8332,7 @@ static Property x86_cpu_properties[] = {
 DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, 
true),
 DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor),
 DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
+DEFINE_PROP_BOOL("cpuid-0x1f-enforce", X86CPU, enable_cpuid_0x1f_enforce, 
true),
 DEFINE_PROP_BOOL("x-vendor-cpuid-only", X86CPU, vendor_cpuid_only, true),
 DEFINE_PROP_BOOL("x-amd-topoext-features-on

Re: [Qemu-devel] [PATCH] ahci: enable pci bus master MemoryRegion before loading ahci engines

2023-10-04 Thread manish.mishra


On 21/08/23 5:31 pm, manish.mishra wrote:


Hi Everyone,

We are facing this issue. I see this conversation never reached a conclusion, and the 
discussed issue is still present on QEMU master. Just to summarize, the solution mentioned 
in this thread ("temporarily enable the bus master memory region") was not taken, with the 
following justification:

"Poking at PCI device internals like this seems fragile.  And force

enabling bus master can lead to unpleasantness like corrupting guest

memory, unhandled interrupts, etc.  E.g. it's quite reasonable,

spec-wise, for the guest to move thing in memory around while bus

mastering is off."


There was an alternate solution mentioned in the thread: forcefully mark 
PORT_CMD_FIS_RX and PORT_CMD_START as disabled if bus mastering for 
the device is disabled. But there was no further conclusive discussion on this.
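
To make that alternative concrete, here is a minimal standalone sketch of
the idea (a toy model, not the actual QEMU AHCI/PCI code; the struct and
function names are illustrative, only the register bits follow the PCI and
AHCI specs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PCI_COMMAND_MASTER  0x4        /* PCI spec: bus master enable */
#define PORT_CMD_START      (1u << 0)  /* AHCI PxCMD.ST  */
#define PORT_CMD_FIS_RX     (1u << 4)  /* AHCI PxCMD.FRE */

struct toy_ahci_port {
    uint32_t cmd;          /* PxCMD register */
    bool     fis_mapped;   /* do we hold a guest-memory mapping? */
};

/* Forcefully stop the engines and drop the mapping, so a later
 * post_load never tries to re-map guest memory while DMA is off. */
static void port_disable_engines(struct toy_ahci_port *port)
{
    port->cmd &= ~(PORT_CMD_START | PORT_CMD_FIS_RX);
    port->fis_mapped = false;
}

/* Would be called from the PCI config-write path of the device model. */
static void on_pci_command_write(uint16_t pci_command,
                                 struct toy_ahci_port *ports, int nports)
{
    if (!(pci_command & PCI_COMMAND_MASTER)) {
        for (int i = 0; i < nports; i++) {
            port_disable_engines(&ports[i]);
        }
    }
}

int main(void)
{
    struct toy_ahci_port port = {
        .cmd = PORT_CMD_START | PORT_CMD_FIS_RX,
        .fis_mapped = true,
    };

    on_pci_command_write(0 /* bus master cleared */, &port, 1);
    printf("cmd=0x%x fis_mapped=%d\n", (unsigned)port.cmd, port.fis_mapped);
    return 0;
}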


I wanted to start this conversation again to hopefully get a conclusion for 
this.

Thanks

Manish Mishra


On 10/09/19 7:38 pm, John Snow wrote:

On 9/10/19 9:58 AM, Michael S. Tsirkin wrote:

On Tue, Sep 10, 2019 at 09:50:41AM -0400, John Snow wrote:

On 9/10/19 3:04 AM, Michael S. Tsirkin wrote:

On Tue, Sep 10, 2019 at 01:18:37AM +0800, andychiu wrote:

If Windows 10 guests have enabled 'turn off hard disk after idle'
option in power settings, and the guest has a SATA disk plugged in,
the SATA disk will be turned off after a specified idle time.
If the guest is live migrated or saved/loaded with its SATA disk
turned off, the following error will occur:

qemu-system-x86_64: AHCI: Failed to start FIS receive engine: bad FIS receive 
buffer address
qemu-system-x86_64: Failed to load ich9_ahci:ahci
qemu-system-x86_64: error while loading state for instance 0x0 of device 
'0000:00:1a.0/ich9_ahci'
qemu-system-x86_64: load of migration failed: Operation not permitted

Observation from trace logs shows that a while after Windows 10 turns off
a SATA disk (IDE disks don't have the following behavior),
it will disable the PCI_COMMAND_MASTER flag of the pci device containing
the ahci device. When the disk is turning back on,
the PCI_COMMAND_MASTER flag will be restored first.
But if the guest is migrated or saved/loaded while the disk is off,
the post_load callback of ahci device, ahci_state_post_load(), will fail
at ahci_cond_start_engines() if the MemoryRegion
pci_dev->bus_master_enable_region is not enabled, with pci_dev pointing
to the PCIDevice struct containing the ahci device.

This patch enables pci_dev->bus_master_enable_region before calling
ahci_cond_start_engines() in ahci_state_post_load(), and restore the
MemoryRegion to its original state afterwards.

Signed-off-by: andychiu

Poking at PCI device internals like this seems fragile.  And force
enabling bus master can lead to unpleasantness like corrupting guest
memory, unhandled interrupts, etc.  E.g. it's quite reasonable,
spec-wise, for the guest to move thing in memory around while bus
mastering is off.

Can you teach ahci that region being disabled
during migration is ok, and recover from it?

That's what I'm wondering.

I could try to just disable the FIS RX engine if the mapping fails, but
that will require a change to guest visible state.

My hunch, though, is that when windows re-enables the device it will
need to re-program the address registers anyway, so it might cope well
with the FIS RX bit getting switched off.

(I'm wondering if it isn't a mistake that QEMU is trying to re-map this
address in the first place. Is it legal that the PCI device has pci bus
master disabled but we've held on to a mapping?

If you are poking at guest memory when bus master is off, then most likely yes.


Should there be some
callback where AHCI knows to invalidate mappings at that point...?)

ATM the callback is the config write, you check
proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER
and if disabled invalidate the mapping.

virtio at least has code that pokes at
proxy->pci_dev.config[PCI_COMMAND] too, I'm quite
open to a function along the lines of
pci_is_bus_master_enabled()
that will do this.


Well, that's not a callback. I don't think it's right to check the
PCI_COMMAND register *every* time AHCI does anything at all to see if
its mappings are still valid.

AHCI makes a mapping *once* when FIS RX is turned on, and it unmaps it
when it's turned off. It assumes it remains valid that whole time. When
we migrate, it checks to see if it was running, and performs the
mappings again to re-boot the state machine.

What I'm asking is; what are the implications of a guest disabling
PCI_COMMAND_MASTER? (I don't know PCI as well as you do.)

What should that mean for the AHCI state machine?

Does this *necessarily* invalidate the mappings?
(In which case -- it's an error that AHCI held on to them after Windows
disabled the card, even if AHCI isn't being engaged by the gue

Re: [Qemu-devel] [PATCH] ahci: enable pci bus master MemoryRegion before loading ahci engines

2023-08-21 Thread manish.mishra

Hi Everyone,

We are facing this issue. I see this conversation never reached a conclusion, and the 
discussed issue is still present on QEMU master. Just to summarize, the solution mentioned 
in this thread ("temporarily enable the bus master memory region") was not taken, with the 
following justification:

"Poking at PCI device internals like this seems fragile.  And force

enabling bus master can lead to unpleasantness like corrupting guest

memory, unhandled interrupts, etc.  E.g. it's quite reasonable,

spec-wise, for the guest to move thing in memory around while bus

mastering is off."


There was an alternate solution mentioned in the thread: forcefully mark 
PORT_CMD_FIS_RX and PORT_CMD_START as disabled if bus mastering for 
the device is disabled. But there was no further conclusive discussion on this.


I wanted to start this conversation again to hopefully get a conclusion for 
this.

Thanks

Manish Mishra


On 10/09/19 7:38 pm, John Snow wrote:


On 9/10/19 9:58 AM, Michael S. Tsirkin wrote:

On Tue, Sep 10, 2019 at 09:50:41AM -0400, John Snow wrote:


On 9/10/19 3:04 AM, Michael S. Tsirkin wrote:

On Tue, Sep 10, 2019 at 01:18:37AM +0800, andychiu wrote:

If Windows 10 guests have enabled 'turn off hard disk after idle'
option in power settings, and the guest has a SATA disk plugged in,
the SATA disk will be turned off after a specified idle time.
If the guest is live migrated or saved/loaded with its SATA disk
turned off, the following error will occur:

qemu-system-x86_64: AHCI: Failed to start FIS receive engine: bad FIS receive 
buffer address
qemu-system-x86_64: Failed to load ich9_ahci:ahci
qemu-system-x86_64: error while loading state for instance 0x0 of device 
'0000:00:1a.0/ich9_ahci'
qemu-system-x86_64: load of migration failed: Operation not permitted

Observation from trace logs shows that a while after Windows 10 turns off
a SATA disk (IDE disks don't have the following behavior),
it will disable the PCI_COMMAND_MASTER flag of the pci device containing
the ahci device. When the disk is turning back on,
the PCI_COMMAND_MASTER flag will be restored first.
But if the guest is migrated or saved/loaded while the disk is off,
the post_load callback of ahci device, ahci_state_post_load(), will fail
at ahci_cond_start_engines() if the MemoryRegion
pci_dev->bus_master_enable_region is not enabled, with pci_dev pointing
to the PCIDevice struct containing the ahci device.

This patch enables pci_dev->bus_master_enable_region before calling
ahci_cond_start_engines() in ahci_state_post_load(), and restore the
MemoryRegion to its original state afterwards.

Signed-off-by: andychiu

Poking at PCI device internals like this seems fragile.  And force
enabling bus master can lead to unpleasantness like corrupting guest
memory, unhandled interrupts, etc.  E.g. it's quite reasonable,
spec-wise, for the guest to move thing in memory around while bus
mastering is off.

Can you teach ahci that region being disabled
during migration is ok, and recover from it?

That's what I'm wondering.

I could try to just disable the FIS RX engine if the mapping fails, but
that will require a change to guest visible state.

My hunch, though, is that when windows re-enables the device it will
need to re-program the address registers anyway, so it might cope well
with the FIS RX bit getting switched off.

(I'm wondering if it isn't a mistake that QEMU is trying to re-map this
address in the first place. Is it legal that the PCI device has pci bus
master disabled but we've held on to a mapping?

If you are poking at guest memory when bus master is off, then most likely yes.


Should there be some
callback where AHCI knows to invalidate mappings at that point...?)

ATM the callback is the config write, you check
proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER
and if disabled invalidate the mapping.

virtio at least has code that pokes at
proxy->pci_dev.config[PCI_COMMAND] too, I'm quite
open to a function along the lines of
pci_is_bus_master_enabled()
that will do this.


Well, that's not a callback. I don't think it's right to check the
PCI_COMMAND register *every* time AHCI does anything at all to see if
its mappings are still valid.

AHCI makes a mapping *once* when FIS RX is turned on, and it unmaps it
when it's turned off. It assumes it remains valid that whole time. When
we migrate, it checks to see if it was running, and performs the
mappings again to re-boot the state machine.

What I'm asking is; what are the implications of a guest disabling
PCI_COMMAND_MASTER? (I don't know PCI as well as you do.)

What should that mean for the AHCI state machine?

Does this *necessarily* invalidate the mappings?
(In which case -- it's an error that AHCI held on to them after Windows
disabled the card, even if AHCI isn't being engaged by the guest
anymore. Essentially, we were turned off but didn't clean up a dangling
pointer, but we need the event that tells us to clean the dangling mapping.)


Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()

2023-04-26 Thread manish.mishra



On 26/04/23 5:35 pm, Juan Quintela wrote:

"manish.mishra"  wrote:

On 26/04/23 4:35 pm, Juan Quintela wrote:

"manish.mishra"  wrote:

On 26/04/23 3:58 pm, Juan Quintela wrote:

Before:

while (true) {
  
  sem_post(channels_ready)
}

And you want to add to the initialization a counter equal to the number
of channels.

Now:

while (true) {
  sem_post(channels_ready)
  
}

It is semantically the same, but when we setup it ready it means that
when we set it to 1, we now that the channel and thread are ready for
action.


May be we can do one thing let the sem_post in while loop at same
position itself. But we can do another post just before start

I can't see how this can make any difference.



of this while loop, as that will be called only once it should do work
of initialising count equal to multiFD channels?

Yeap.  But I can't see what difference we have here.

Later, Juan.


Thanks Juan,
Just confirming if i misunderstood something :)

I meant your approach makes sense, i was just suggesting a small change. To do 
something like this.

qemu_sem_init(&multifd_send_state->channels_ready, 0);

static void *multifd_send_thread(void *opaque) {
  ...

  sem_post(channels_ready); // Post once at start of thread and let one in 
loop as it is.

  while (true) {
     
     sem_post(channels_ready)
  }
}


Something like below has issue that we are marking channel_ready even
before channel is actually ready,

I think it is exactly the same.


i meant if network is slow it may
take some time to update pending_job and hence we can busy loop in
send_multifd_page().

No difference from send_multifd_page() point of view.
Notice that I mank that the channel is ready before I do any work.

send_multifd_page() does a sem_wait() before doing anything related to
this channel, so I can't see how it can be a differnce.


static void *multifd_send_thread(void *opaque) {
  ...

  while (true) {
     sem_post(channels_ready);
     
  }
}


Not sure if we are already in agreement :) just confirming.
  sem_post(channels_ready); // Post once at start of thread and let one in 
loop as it is.
  while (true) {
     ...
     sem_post(channels_ready)
  }

and


  while (true) {
     sem_post(channels_ready)
     ...
  }

When "..." is exactly the same don't make any difference, the only
difference is that in one case we "write" to semposts, and in the other
we write just one.

Or I am missing something.

Later, Juan.



Yes sorry, I misunderstood; technically it does not make any difference. LGTM
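
As a self-contained illustration of the agreed scheme (plain POSIX
semaphores and a toy dispatcher, not the QEMU multifd code; NCHANNELS,
NJOBS and send_thread are illustrative names): the semaphore starts at 0
and every worker posts at the top of its loop, so the dispatcher's
sem_wait() only succeeds once a worker is really up and idle.

#include <pthread.h>
#include <semaphore.h>
#include <stdint.h>
#include <stdio.h>

#define NCHANNELS 3
#define NJOBS 6

static sem_t channels_ready;        /* counts idle, running workers */
static sem_t work[NCHANNELS];       /* per-worker "you have a job"  */
static volatile int quit;

static void *send_thread(void *opaque)
{
    int id = (int)(intptr_t)opaque;

    while (1) {
        /* Posting here, with the semaphore initialized to 0, replaces
         * initializing it to NCHANNELS: the first post only happens
         * once this thread actually exists and is ready for work. */
        sem_post(&channels_ready);
        sem_wait(&work[id]);
        if (quit) {
            break;
        }
        printf("channel %d sent one page\n", id);
    }
    return NULL;
}

int main(void)
{
    pthread_t th[NCHANNELS];

    sem_init(&channels_ready, 0, 0);
    for (int i = 0; i < NCHANNELS; i++) {
        sem_init(&work[i], 0, 0);
        pthread_create(&th[i], NULL, send_thread, (void *)(intptr_t)i);
    }

    /* This wait can only succeed as many times as workers have passed
     * the top of their loop, so the dispatcher never spins looking for
     * a free channel. */
    for (int j = 0; j < NJOBS; j++) {
        sem_wait(&channels_ready);
        sem_post(&work[j % NCHANNELS]);
    }

    quit = 1;
    for (int i = 0; i < NCHANNELS; i++) {
        sem_post(&work[i]);
    }
    for (int i = 0; i < NCHANNELS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}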

Thanks

Manish Mishra




Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()

2023-04-26 Thread manish.mishra



On 26/04/23 4:35 pm, Juan Quintela wrote:

"manish.mishra"  wrote:

On 26/04/23 3:58 pm, Juan Quintela wrote:

"manish.mishra"  wrote:

multifd_send_sync_main() posts request on the multifd channel
but does not call sem_wait() on channels_ready semaphore, making
the channels_ready semaphore count keep increasing.
As a result, sem_wait() on channels_ready in multifd_send_pages()
is always non-blocking hence multifd_send_pages() keeps searching
for a free channel in a busy loop until a channel is freed.

Signed-off-by: manish.mishra 
---
   migration/multifd.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index cce3ad6988..43d26e7012 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f)
 trace_multifd_send_sync_main_signal(p->id);
   +qemu_sem_wait(&multifd_send_state->channels_ready);
   qemu_mutex_lock(&p->mutex);
 if (p->quit) {

We need this, but I think it is better to put it on the second loop.


@@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp)
   multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
   multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
   multifd_send_state->pages = multifd_pages_init(page_count);
-qemu_sem_init(&multifd_send_state->channels_ready, 0);
+qemu_sem_init(&multifd_send_state->channels_ready, thread_count);
   qatomic_set(&multifd_send_state->exiting, 0);
   multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];

I think this bit is wrong.
We should not set the channels ready until the thread is ready and
channel is created.

What do you think about this patch:

  From bcb0ef9b97b858458c403d2e4dc9e0dbd96721b3 Mon Sep 17 00:00:00 2001
From: Juan Quintela 
Date: Wed, 26 Apr 2023 12:20:36 +0200
Subject: [PATCH] multifd: Fix the number of channels ready

We don't wait in the sem when we are doing a sync_main.  Make it wait
there.  To make things clearer, we mark the channel ready at the
beginning of the thread loop.

This causes a busy loop in multifd_send_page().
Found-by: manish.mishra 

Signed-off-by: Juan Quintela 
---
   migration/multifd.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 903df2117b..e625e8725e 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -635,6 +635,7 @@ int multifd_send_sync_main(QEMUFile *f)
   for (i = 0; i < migrate_multifd_channels(); i++) {
   MultiFDSendParams *p = &multifd_send_state->params[i];
   +qemu_sem_wait(&multifd_send_state->channels_ready);
   trace_multifd_send_sync_main_wait(p->id);
   qemu_sem_wait(&p->sem_sync);
   @@ -668,6 +669,7 @@ static void *multifd_send_thread(void *opaque)
   p->num_packets = 1;
 while (true) {
+qemu_sem_post(&multifd_send_state->channels_ready);


This has one issue though, if we mark channel_ready here itself, channel is 
actually not ready so we can still busy loop?

Before:

while (true) {
 
 sem_post(channels_ready)
}

And you want to add to the initialization a counter equal to the number
of channels.

Now:

while (true) {
 sem_post(channels_ready)
 
}

It is semantically the same, but when we setup it ready it means that
when we set it to 1, we now that the channel and thread are ready for
action.


May be we can do one thing let the sem_post in while loop at same
position itself. But we can do another post just before start

I can't see how this can make any difference.



of this while loop, as that will be called only once it should do work
of initialising count equal to multiFD channels?

Yeap.  But I can't see what difference we have here.

Later, Juan.



Thanks Juan,
Just confirming if i misunderstood something :)

I meant your approach makes sense, i was just suggesting a small change. To do 
something like this.

qemu_sem_init(&multifd_send_state->channels_ready, 0);

static void *multifd_send_thread(void *opaque) {
 ...

 sem_post(channels_ready); // Post once at start of thread and let one in 
loop as it is.

 while (true) {
    
    sem_post(channels_ready)
 }
}


Something like the below has the issue that we are marking channel_ready even before 
the channel is actually ready; I meant that if the network is slow it may take some time 
to update pending_job and hence we can busy-loop in send_multifd_page().

static void *multifd_send_thread(void *opaque) {
 ...

 while (true) {
    sem_post(channels_ready);
    
 }
}


Not sure if we are already in agreement :) just confirming.


Thanks

Manish Mishra




Re: [PATCH] multifd: Avoid busy-wait in multifd_send_pages()

2023-04-26 Thread manish.mishra



On 26/04/23 3:58 pm, Juan Quintela wrote:

"manish.mishra"  wrote:

multifd_send_sync_main() posts request on the multifd channel
but does not call sem_wait() on channels_ready semaphore, making
the channels_ready semaphore count keep increasing.
As a result, sem_wait() on channels_ready in multifd_send_pages()
is always non-blocking hence multifd_send_pages() keeps searching
for a free channel in a busy loop until a channel is freed.

Signed-off-by: manish.mishra 
---
  migration/multifd.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index cce3ad6988..43d26e7012 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f)
  
  trace_multifd_send_sync_main_signal(p->id);
  
+qemu_sem_wait(&multifd_send_state->channels_ready);

  qemu_mutex_lock(&p->mutex);
  
  if (p->quit) {

We need this, but I think it is better to put it on the second loop.


@@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp)
  multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
  multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
  multifd_send_state->pages = multifd_pages_init(page_count);
-qemu_sem_init(&multifd_send_state->channels_ready, 0);
+qemu_sem_init(&multifd_send_state->channels_ready, thread_count);
  qatomic_set(&multifd_send_state->exiting, 0);
  multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];

I think this bit is wrong.
We should not set the channels ready until the thread is ready and
channel is created.

What do you think about this patch:

 From bcb0ef9b97b858458c403d2e4dc9e0dbd96721b3 Mon Sep 17 00:00:00 2001
From: Juan Quintela 
Date: Wed, 26 Apr 2023 12:20:36 +0200
Subject: [PATCH] multifd: Fix the number of channels ready

We don't wait in the sem when we are doing a sync_main.  Make it wait
there.  To make things clearer, we mark the channel ready at the
beginning of the thread loop.

This causes a busy loop in multifd_send_page().
Found-by: manish.mishra 

Signed-off-by: Juan Quintela 
---
  migration/multifd.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 903df2117b..e625e8725e 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -635,6 +635,7 @@ int multifd_send_sync_main(QEMUFile *f)
  for (i = 0; i < migrate_multifd_channels(); i++) {
  MultiFDSendParams *p = &multifd_send_state->params[i];
  
+qemu_sem_wait(&multifd_send_state->channels_ready);

  trace_multifd_send_sync_main_wait(p->id);
  qemu_sem_wait(&p->sem_sync);
  
@@ -668,6 +669,7 @@ static void *multifd_send_thread(void *opaque)

  p->num_packets = 1;
  
  while (true) {

+qemu_sem_post(&multifd_send_state->channels_ready);



This has one issue though, if we mark channel_ready here itself, channel is 
actually not ready so we can still busy loop?

May be we can do one thing let the sem_post in while loop at same position 
itself. But we can do another post just before start

of this while loop, as that will be called only once it should do work of 
initialising count equal to multiFD channels?



  qemu_sem_wait(&p->sem);
  
  if (qatomic_read(&multifd_send_state->exiting)) {

@@ -736,7 +738,6 @@ static void *multifd_send_thread(void *opaque)
  if (flags & MULTIFD_FLAG_SYNC) {
  qemu_sem_post(&p->sem_sync);
  }
-qemu_sem_post(&multifd_send_state->channels_ready);
  } else if (p->quit) {
  qemu_mutex_unlock(&p->mutex);
  break;




[PATCH] multifd: Avoid busy-wait in multifd_send_pages()

2023-04-25 Thread manish.mishra
multifd_send_sync_main() posts a request on the multifd channel
but does not call sem_wait() on the channels_ready semaphore, so
the channels_ready semaphore count keeps increasing.
As a result, sem_wait() on channels_ready in multifd_send_pages()
is always non-blocking, hence multifd_send_pages() keeps searching
for a free channel in a busy loop until a channel is freed.

Signed-off-by: manish.mishra 
---
 migration/multifd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index cce3ad6988..43d26e7012 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -615,6 +615,7 @@ int multifd_send_sync_main(QEMUFile *f)
 
 trace_multifd_send_sync_main_signal(p->id);
 
+qemu_sem_wait(&multifd_send_state->channels_ready);
 qemu_mutex_lock(&p->mutex);
 
 if (p->quit) {
@@ -919,7 +920,7 @@ int multifd_save_setup(Error **errp)
 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 multifd_send_state->pages = multifd_pages_init(page_count);
-qemu_sem_init(&multifd_send_state->channels_ready, 0);
+qemu_sem_init(&multifd_send_state->channels_ready, thread_count);
 qatomic_set(&multifd_send_state->exiting, 0);
 multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];
 
-- 
2.22.3




Re: Expose support for HyperV features via QMP

2023-02-10 Thread manish.mishra



On 09/02/23 7:47 pm, Vitaly Kuznetsov wrote:

Alex Bennée  writes:


"manish.mishra"  writes:


Hi Everyone,

Checking if there is any feedback on this.

I've expanded the CC list to some relevant maintainers and people who
have touched that code in case this was missed.


Thanks

Manish Mishra

On 31/01/23 8:17 pm, manish.mishra wrote:

  Hi Everyone,
  I hope everyone is doing great. We wanted to check why we do not expose 
support for HyperV features in
  Qemu similar to what we do for normal CPU features via query-cpu-defs or 
cpu-model-expansion QMP
  commands. This support is required for live migration with HyperV features as 
hyperv passthrough is not
  an option. If users had knowledge of what features are supported by source 
and destination, VM can be
  started with an intersection of features supported by both source and 
destination.
  If there is no specific reason for not doing this, does it make sense to add 
a new QMP which expose
  support (internally also validating with KVM or KVM_GET_SUPPORTED_HV_CPUID 
ioctl) for HyperV
  features.
  Apologies in advance if i misunderstood something.


Thanks for Ccing me.

Hyper-V features should appear in QMP since

commit 071ce4b03becf9e2df6b758fde9609be8ddf56f1
Author: Vitaly Kuznetsov 
Date:   Tue Jun 8 14:08:13 2021 +0200

 i386: expand Hyper-V features during CPU feature expansion time

also, the support for Hypre-V feature discovery was just added to
libvirt:

903ea9370d qemu_capabilities: Report Hyper-V Enlightenments in domcapabilities
10f4784864 qemu_capabilities: Query for Hyper-V Enlightenments
ff8731680b qemuMonitorJSONGetCPUModelExpansion: Introduce @hv_passthrough 
argument
7c12eb2397 qemuMonitorJSONMakeCPUModel: Introduce @hv_passthrough argument
7c1ecfd512 domain_capabilities: Expose Hyper-V Enlightenments
179e45d237 virDomainCapsEnumFormat: Retrun void
a7789d9324 virDomainCapsEnumFormat: Switch to virXMLFormatElement()

in case this is not enough, could you please elaborate on the use-case
you have in mind?



Thanks Vitaly, Alex,

Yes, this should work. Sorry, I had checked the qemu master code but I am not sure 
how I missed this :).
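
For reference, if I am reading the libvirt flow correctly, the discovery is
roughly a full model expansion with hv-passthrough set, with the individual
hv-* flags then visible in the returned props (the exact set in the reply
depends on what the host KVM supports; this is only an illustrative example):

{ "execute": "query-cpu-model-expansion",
  "arguments": { "type": "full",
                 "model": { "name": "host",
                            "props": { "hv-passthrough": true } } } }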

Thanks

Manish Mishra




Re: Expose support for HyperV features via QMP

2023-02-09 Thread manish.mishra

Hi Everyone,

Checking if there is any feedback on this.

Thanks

Manish Mishra


On 31/01/23 8:17 pm, manish.mishra wrote:


Hi Everyone,
I hope everyone is doing great. We wanted to check why we do not expose support 
for HyperV features in Qemu similar to what we do for normal CPU features via 
query-cpu-defs or cpu-model-expansion QMP commands. This support is required 
for live migration with HyperV features as hyperv passthrough is not an option. 
If users had knowledge of what features are supported by source and 
destination, VM can be started with an intersection of features supported by 
both source and destination.
If there is no specific reason for not doing this, does it make sense to add a 
new QMP which expose support (internally also validating with KVM or 
KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features.
Apologies in advance if i misunderstood something.

Thanks

Manish Mishra


Re: Expose support for HyperV features via QMP

2023-02-01 Thread manish.mishra


On 31/01/23 8:17 pm, manish.mishra wrote:


Hi Everyone,
I hope everyone is doing great. We wanted to check why we do not expose support 
for HyperV features in Qemu similar to what we do for normal CPU features via 
query-cpu-defs or cpu-model-expansion QMP commands. This support is required 
for live migration with HyperV features as hyperv passthrough is not an option. 
If users had knowledge of what features are supported by source and 
destination, VM can be started with an intersection of features supported by 
both source and destination.
If there is no specific reason for not doing this, does it make sense to add a 
new QMP which expose support (internally also validating with KVM or 
KVM_GET_SUPPORTED_HV_CPUID ioctl) for HyperV features.
Apologies in advance if i misunderstood something.

Thanks

Manish Mishra



Hi Everyone,

Checking if there is any feedback on this.

Thanks

Manish Mishra


Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels

2023-01-31 Thread manish.mishra



On 31/01/23 8:47 pm, Peter Xu wrote:

On Tue, Jan 31, 2023 at 08:29:08PM +0530, manish.mishra wrote:

Hi Peter, Daniel,

Just a gentle reminder on this patch if it can be merged, and really
sorry i see now earlier reminders i sent were on v6[0/2] and somehow you
were not CCed on that earlier. You were CCed just on v6[1/2] and v6[2,2]
so that's why probably missed it.

Yes I think so.  For some reason I guess Juan missed this set when sending
the most recent PR. We should pick them up soon.



Thanks Peter :)




Expose support for HyperV features via QMP

2023-01-31 Thread manish.mishra

Hi Everyone,
I hope everyone is doing great. We wanted to check why we do not expose support 
for Hyper-V features in QEMU similar to what we do for normal CPU features via the 
query-cpu-definitions or query-cpu-model-expansion QMP commands. This support is 
required for live migration with Hyper-V features, as hyperv passthrough is not an 
option. If users knew which features are supported by the source and the 
destination, the VM could be started with an intersection of the features supported 
by both.
If there is no specific reason for not doing this, does it make sense to add a new 
QMP command which exposes support for Hyper-V features (internally also validating 
against KVM via the KVM_GET_SUPPORTED_HV_CPUID ioctl)?
Apologies in advance if I misunderstood something.

Thanks

Manish Mishra


Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels

2023-01-31 Thread manish.mishra

Hi Peter, Daniel,

Just a gentle reminder on this patch, in case it can be merged. Really sorry, I see 
now that the earlier reminders I sent were on v6 [0/2] and somehow you were not CCed 
on those; you were CCed only on v6 [1/2] and v6 [2/2], so that is probably why you 
missed it.

Thanks

Manish Mishra

On 21/12/22 12:06 am, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.

v3:
   1. Split change in two patches, io patch for read_peek routines,
  migration patch for migration related changes.
   2. Add flags to io_readv calls to get extra read flags like
  MSG_PEEK.
   3. Some other minor fixes.

v4:
   1. Removed common *all_eof routines for read peek and added one
  specific to live migration.
   2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
   3. Some other minor fixes.

v5:
   1. Handle busy-wait in migration_channel_read_peek due partial reads.

v6:
   With earlier patch, multifd_load_setup was done only in
   migration_incoming_setup but if multifd channel is received before
   default channel, multifd channels will be uninitialized. Moved
   multifd_load_setup to migration_ioc_process_incoming.
   


manish.mishra (2):
   io: Add support for MSG_PEEK for socket channel
   migration: check magic value for deciding the mapping of channels

  chardev/char-socket.c   |  4 +--
  include/io/channel.h|  6 
  io/channel-buffer.c |  1 +
  io/channel-command.c|  1 +
  io/channel-file.c   |  1 +
  io/channel-null.c   |  1 +
  io/channel-socket.c | 17 -
  io/channel-tls.c|  1 +
  io/channel-websock.c|  1 +
  io/channel.c| 16 ++---
  migration/channel-block.c   |  1 +
  migration/channel.c | 45 
  migration/channel.h |  5 +++
  migration/migration.c   | 54 -
  migration/multifd.c | 19 +-
  migration/multifd.h |  2 +-
  migration/postcopy-ram.c|  5 +--
  migration/postcopy-ram.h|  2 +-
  scsi/qemu-pr-helper.c   |  2 +-
  tests/qtest/tpm-emu.c   |  2 +-
  tests/unit/test-io-channel-socket.c |  1 +
  util/vhost-user-server.c|  2 +-
  22 files changed, 148 insertions(+), 41 deletions(-)


Re: [PATCH v6 0/2] check magic value for deciding the mapping of channels

2023-01-04 Thread manish.mishra

Hi Everyone,

I was just checking that it was not missed during the holidays and was received. :)

Thanks

Manish Mishra

On 21/12/22 12:14 am, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.

v3:
   1. Split change in two patches, io patch for read_peek routines,
  migration patch for migration related changes.
   2. Add flags to io_readv calls to get extra read flags like
  MSG_PEEK.
   3. Some other minor fixes.

v4:
   1. Removed common *all_eof routines for read peek and added one
  specific to live migration.
   2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
   3. Some other minor fixes.

v5:
   1. Handle busy-wait in migration_channel_read_peek due partial reads.

v6:
   With earlier patch, multifd_load_setup was done only in
   migration_incoming_setup but if multifd channel is received before
   default channel, multifd channels will be uninitialized. Moved
   multifd_load_setup to migration_ioc_process_incoming.
   


manish.mishra (2):
   io: Add support for MSG_PEEK for socket channel
   migration: check magic value for deciding the mapping of channels

  chardev/char-socket.c   |  4 +--
  include/io/channel.h|  6 
  io/channel-buffer.c |  1 +
  io/channel-command.c|  1 +
  io/channel-file.c   |  1 +
  io/channel-null.c   |  1 +
  io/channel-socket.c | 17 -
  io/channel-tls.c|  1 +
  io/channel-websock.c|  1 +
  io/channel.c| 16 ++---
  migration/channel-block.c   |  1 +
  migration/channel.c | 45 
  migration/channel.h |  5 +++
  migration/migration.c   | 54 -
  migration/multifd.c | 19 +-
  migration/multifd.h |  2 +-
  migration/postcopy-ram.c|  5 +--
  migration/postcopy-ram.h|  2 +-
  scsi/qemu-pr-helper.c   |  2 +-
  tests/qtest/tpm-emu.c   |  2 +-
  tests/unit/test-io-channel-socket.c |  1 +
  util/vhost-user-server.c|  2 +-
  22 files changed, 148 insertions(+), 41 deletions(-)


[PATCH v6 2/2] migration: check magic value for deciding the mapping of channels

2022-12-20 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic is applicable only for precopy (multifd) live migration because,
as mentioned, the post-copy preempt channel does not send any magic number.
Also, TLS live migration already does the TLS handshake before creating other
channels, so this issue is not possible with TLS and this logic is skipped
for TLS live migrations. This patch uses read peek to check the magic number
of channels so that the current data/control stream management remains
unaffected.

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrange 
Suggested-by: Daniel P. Berrange 
Signed-off-by: manish.mishra 
---
 migration/channel.c  | 45 +
 migration/channel.h  |  5 
 migration/migration.c| 54 
 migration/multifd.c  | 19 +++---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +---
 migration/postcopy-ram.h |  2 +-
 7 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/migration/channel.c b/migration/channel.c
index 1b0815039f..fc0228a198 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s,
 migrate_fd_connect(s, error);
 error_free(error);
 }
+
+
+/**
+ * @migration_channel_read_peek - Peek at migration channel, without
+ * actually removing it from channel buffer.
+ *
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read in @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns 0 if successful, returns -1 and sets @errp if fails.
+ */
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+ssize_t len = 0;
+struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+while (true) {
+len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+ NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, 
errp);
+
+if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
+error_setg(errp,
+   "Failed to peek at channel");
+return -1;
+}
+
+if (len == buflen) {
+break;
+}
+
+/* 1ms sleep. */
+if (qemu_in_coroutine()) {
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+g_usleep(1000);
+}
+}
+
+return 0;
+}
diff --git a/migration/channel.h b/migration/channel.h
index 67a461c28a..5bdb8208a7 100644
--- a/migration/channel.h
+++ b/migration/channel.h
@@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error_in);
+
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 52b5d39244..bbc9ce3ca6 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -31,6 +31,7 @@
 #include "migration.h"
 #include "savevm.h"
 #include "qemu-file.h"
+#include "channel.h"
 #include "migration/vmstate.h"
 #include "block/block.h"
 #include "qapi/error.h"
@@ -663,10 +664,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error 
**errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 
-if (multifd_load_setup(errp) != 0) {
-return false;
-}
-
 if (!mis->from_src_file) {
 mis->from_src_file = f;
 }
@@ -733,31 +730,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
 
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migrate_postcopy_ram() &

[PATCH v6 1/2] io: Add support for MSG_PEEK for socket channel

2022-12-20 Thread manish.mishra
MSG_PEEK peeks at the channel; the data is treated as unread and
the next read shall still return this data. This support is
currently added only for the socket class. An extra parameter 'flags'
is added to the io_readv calls to pass extra read flags like MSG_PEEK.
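
For illustration, the underlying socket semantics being wrapped here, shown
with plain POSIX calls (a standalone example, independent of the QIOChannel
API; the "QEVM" payload is just sample data):

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int sv[2];
    char peeked[8] = "", readout[8] = "";

    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
        return 1;
    }
    if (write(sv[0], "QEVM", 4) != 4) {
        return 1;
    }

    /* MSG_PEEK copies the data out but leaves it queued... */
    recv(sv[1], peeked, 4, MSG_PEEK);
    /* ...so the next normal read still returns the same bytes. */
    recv(sv[1], readout, 4, 0);

    printf("peeked='%s' read='%s'\n", peeked, readout); /* both "QEVM" */
    close(sv[0]);
    close(sv[1]);
    return 0;
}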

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrange 
Suggested-by: Daniel P. Berrange 
Signed-off-by: manish.mishra 
---
 chardev/char-socket.c   |  4 ++--
 include/io/channel.h|  6 ++
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 -
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 
 migration/channel-block.c   |  1 +
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 15 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 29ffe5075e..c2265436ac 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, 
size_t len)
 if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
 } else {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  NULL, NULL,
- NULL);
+ 0, NULL);
 }
 
 if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index f1b7e05f81..5b41d02b2b 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
 QIO_CHANNEL_FEATURE_SHUTDOWN,
 QIO_CHANNEL_FEATURE_LISTEN,
 QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@ struct QIOChannelClass {
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
+   int flags,
Error **errp);
 
 
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index bf52011be2..8096180f85 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp)
 {
 QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index 74516252ba..e7edd091af 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
  size_t niov,
  int **fds,
  size_t *nfds,
+ int flags,
  Error **errp)
 {
 QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index b67687c2aa..d76663e6ae 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
   size_t niov,
   int **fds,
   size_t *nfds,
+  

[PATCH v6 0/2] check magic value for deciding the mapping of channels

2022-12-20 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic is applicable only for precopy (multifd) live migration because,
as mentioned, the post-copy preempt channel does not send any magic number.
Also, TLS live migration already does the TLS handshake before creating other
channels, so this issue is not possible with TLS and this logic is skipped
for TLS live migrations. This patch uses MSG_PEEK to check the magic number
of channels so that the current data/control stream management remains
unaffected.
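
Roughly, the destination side then decides like this (a simplified sketch of
the logic added in patch 2/2; QEMU_VM_FILE_MAGIC and cpu_to_be32() are the
existing definitions in the tree, variable names follow the patch):

    uint32_t channel_magic;
    bool default_channel = true;

    if (migration_channel_read_peek(ioc, (char *)&channel_magic,
                                    sizeof(channel_magic), &local_err) == 0) {
        /* The main channel starts with the vmstate magic ("QEVM");
         * with multifd enabled, anything else is a multifd channel. */
        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
    }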

v2:
  TLS does not support MSG_PEEK, so V1 was broken for tls live
  migrations. For tls live migration, while initializing main channel
  tls handshake is done before we can create other channels, so this
  issue is not possible for tls live migrations. In V2 added a check
  to avoid checking magic number for tls live migration and fallback
  to older method to decide mapping of channels on destination side.

v3:
  1. Split change in two patches, io patch for read_peek routines,
 migration patch for migration related changes.
  2. Add flags to io_readv calls to get extra read flags like
 MSG_PEEK.
  3. Some other minor fixes.

v4:
  1. Removed common *all_eof routines for read peek and added one
 specific to live migration.
  2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
  3. Some other minor fixes.

v5:
  1. Handle busy-wait in migration_channel_read_peek due partial reads.

v6:
  With earlier patch, multifd_load_setup was done only in
  migration_incoming_setup but if multifd channel is received before
  default channel, multifd channels will be uninitialized. Moved
  multifd_load_setup to migration_ioc_process_incoming.
  

manish.mishra (2):
  io: Add support for MSG_PEEK for socket channel
  migration: check magic value for deciding the mapping of channels

 chardev/char-socket.c   |  4 +--
 include/io/channel.h|  6 
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 -
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 ++---
 migration/channel-block.c   |  1 +
 migration/channel.c | 45 
 migration/channel.h |  5 +++
 migration/migration.c   | 54 -
 migration/multifd.c | 19 +-
 migration/multifd.h |  2 +-
 migration/postcopy-ram.c|  5 +--
 migration/postcopy-ram.h|  2 +-
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 22 files changed, 148 insertions(+), 41 deletions(-)

-- 
2.22.3




[PATCH v6 0/2] check magic value for deciding the mapping of channels

2022-12-20 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

v2:
  TLS does not support MSG_PEEK, so V1 was broken for tls live
  migrations. For tls live migration, while initializing main channel
  tls handshake is done before we can create other channels, so this
  issue is not possible for tls live migrations. In V2 added a check
  to avoid checking magic number for tls live migration and fallback
  to older method to decide mapping of channels on destination side.

v3:
  1. Split change in two patches, io patch for read_peek routines,
 migration patch for migration related changes.
  2. Add flags to io_readv calls to get extra read flags like
 MSG_PEEK.
  3. Some other minor fixes.

v4:
  1. Removed common *all_eof routines for read peek and added one
 specific to live migration.
  2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
  3. Some other minor fixes.

v5:
  1. Handle busy-wait in migration_channel_read_peek due to partial reads.

v6:
  With the earlier patch, multifd_load_setup was done only in
  migration_incoming_setup, but if a multifd channel is received before
  the default channel, the multifd channels will be uninitialized. Moved
  multifd_load_setup to migration_ioc_process_incoming.
  

manish.mishra (2):
  io: Add support for MSG_PEEK for socket channel
  migration: check magic value for deciding the mapping of channels

 chardev/char-socket.c   |  4 +--
 include/io/channel.h|  6 
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 -
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 ++---
 migration/channel-block.c   |  1 +
 migration/channel.c | 45 
 migration/channel.h |  5 +++
 migration/migration.c   | 54 -
 migration/multifd.c | 19 +-
 migration/multifd.h |  2 +-
 migration/postcopy-ram.c|  5 +--
 migration/postcopy-ram.h|  2 +-
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 22 files changed, 148 insertions(+), 41 deletions(-)

-- 
2.22.3




Re: [PATCH 1/5] io: Add support for MSG_PEEK for socket channel

2022-12-16 Thread manish.mishra



On 15/12/22 11:05 pm, Peter Xu wrote:

On Thu, Dec 15, 2022 at 09:40:41AM +, Daniel P. Berrangé wrote:

On Wed, Dec 14, 2022 at 04:30:48PM -0500, Peter Xu wrote:

On Wed, Dec 14, 2022 at 09:14:09AM +, Daniel P. Berrangé wrote:

On Tue, Dec 13, 2022 at 04:38:46PM -0500, Peter Xu wrote:

From: "manish.mishra" 

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Reviewed-by: Daniel P. Berrang?? 
The last letter of my name has been mangled - whatever tools used
to pull in manish's patches seem to not be UTF-8 clean.

Also the email addr isn't terminated, but that was pre-existing
in manish's previous posting.



Yes, sorry, I noticed it now. Should I send another patch to update it? Also, in
our internal testing we noticed multifd_init was skipped in one case, so if this
patch series is not yet submitted, should I send another version of it early
Monday morning?

Thanks

Manish Mishra


I'll fix at least the latter in my next post, sorry.

For the 1st one - I am still looking at what went wrong.

Here from the web interfaces it all looks good (besides the wrong
ending..), e.g. on lore or patchew:

https://lore.kernel.org/all/20221213213850.1481858-2-peterx@redhat.com/
https://patchew.org/QEMU/20221213213850.1481858-1-peterx@redhat.com/20221213213850.1481858-2-peterx@redhat.com/

It also looks good with e.g. Gmail webclient.

Then I dug into the email headers and found that, compared to Manish's
original message, the patches I posted have one extra "Content-type" line:

   Content-Type: text/plain; charset="utf-8"
   Content-type: text/plain
   
https://patchew.org/QEMU/20221213213850.1481858-2-peterx@redhat.com/mbox

While Manish's patch only has one line:

   Content-Type: text/plain; charset="utf-8"
   
https://patchew.org/QEMU/20221123172735.25181-2-manish.mishra@nutanix.com/mbox

Don't trust what is shown by patchew, as that's been through many
hops.

The copy I received came directly to me via CC, so didn't hit mailman,
nor patchew, and that *only* has  "Content-type: text/plain".  So the
extra Content-type line with utf8 must have been added either by
mailman or patchew.

So it probably looks like a config problem in the tool you use to send
the patches originally.

Ouch... for mysterious reasons I had one line in .gitconfig:

177 [format]
178 headers = "Content-type: text/plain"

And that'll also affect git-publish too...  I have it dropped now.

Thanks,





Re: [PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-29 Thread manish.mishra



On 29/11/22 7:57 pm, Peter Xu wrote:

On Tue, Nov 29, 2022 at 04:24:58PM +0530, manish.mishra wrote:

On 23/11/22 11:34 pm, Peter Xu wrote:

On Wed, Nov 23, 2022 at 05:27:34PM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Reviewed-by: Daniel P. Berrangé 

Reviewed-by: Peter Xu 


Hi Peter, Daniel

Just wanted to check if both patches in this series are approved or if I
need to update a few more things. Also, Peter, for your Reviewed-by to be
included, do I need to send another patch version, or will it be picked up
by whoever merges it?

No need to resend it for now.

https://wiki.qemu.org/Planning/7.2

Today should tag rc3, maybe it's too late for such a change to go into this
release.  It's not a regression of 7.2 so may not be justified either.

Maybe it'll need to wait until the next dev cycle opens.

Thanks,


ok got it, thank you Peter.

Thanks

Manish Mishra




Re: [PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-29 Thread manish.mishra



On 23/11/22 11:34 pm, Peter Xu wrote:

On Wed, Nov 23, 2022 at 05:27:34PM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Reviewed-by: Daniel P. Berrangé 

Reviewed-by: Peter Xu 


Hi Peter, Daniel

Just wanted to check if both patches in this series are approved or if I need to
update a few more things. Also, Peter, for your Reviewed-by to be included, do I
need to send another patch version, or will it be picked up by whoever merges it?

Thanks

Manish Mishra





[PATCH v5 0/2] check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first
one will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can still be pending on the
destination side while a newer connection is established first, causing an
out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic applies only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number.
TLS live migrations already perform a TLS handshake before creating other
channels, so this issue cannot occur with TLS; hence this logic is skipped
for TLS live migrations. This patch uses MSG_PEEK to check the magic number
of channels so that the current data/control stream management remains
unaffected.

v2:
  TLS does not support MSG_PEEK, so v1 was broken for TLS live
  migrations. For TLS live migration, the TLS handshake is done while
  initializing the main channel, before other channels can be created,
  so this issue cannot occur with TLS. v2 adds a check to skip the
  magic-number check for TLS live migration and fall back to the older
  method of deciding the mapping of channels on the destination side.

v3:
  1. Split change in two patches, io patch for read_peek routines,
 migration patch for migration related changes.
  2. Add flags to io_readv calls to get extra read flags like
 MSG_PEEK.
  3. Some other minor fixes.

v4:
  1. Removed common *all_eof routines for read peek and added one
 specific to live migration.
  2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
  3. Some other minor fixes.

v5:
  1. Handle busy-wait in migration_channel_read_peek due to partial reads.

manish.mishra (2):
  io: Add support for MSG_PEEK for socket channel
  migration: check magic value for deciding the mapping of channels

 chardev/char-socket.c   |  4 +--
 include/io/channel.h|  6 
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 ++-
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 +++---
 migration/channel-block.c   |  1 +
 migration/channel.c | 45 +
 migration/channel.h |  5 
 migration/migration.c   | 45 +
 migration/multifd.c | 12 +++-
 migration/multifd.h |  2 +-
 migration/postcopy-ram.c|  5 +---
 migration/postcopy-ram.h|  2 +-
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 22 files changed, 137 insertions(+), 36 deletions(-)

-- 
2.22.3




[PATCH v5 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-23 Thread manish.mishra
MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Reviewed-by: Daniel P. Berrangé 
---
 chardev/char-socket.c   |  4 ++--
 include/io/channel.h|  6 ++
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 -
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 
 migration/channel-block.c   |  1 +
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 15 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
 if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
 } else {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  NULL, NULL,
- NULL);
+ 0, NULL);
 }
 
 if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..716235d496 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
 QIO_CHANNEL_FEATURE_SHUTDOWN,
 QIO_CHANNEL_FEATURE_LISTEN,
 QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@ struct QIOChannelClass {
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
+   int flags,
Error **errp);
 
 
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index bf52011be2..8096180f85 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp)
 {
 QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index 74516252ba..e7edd091af 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
  size_t niov,
  int **fds,
  size_t *nfds,
+ int flags,
  Error **errp)
 {
 QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index b67687c2aa..d76663e6ae 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
   size_t niov,
   int **fds,
   size_t *nfds,
+  int flags,
   Error **errp)
 {
 QIOChannelFil

[PATCH v5 2/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first
one will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can still be pending on the
destination side while a newer connection is established first, causing an
out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic applies only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number.
TLS live migrations already perform a TLS handshake before creating other
channels, so this issue cannot occur with TLS; hence this logic is skipped
for TLS live migrations. This patch uses a read peek to check the magic
number of channels so that the current data/control stream management
remains unaffected.

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
---
 migration/channel.c  | 45 
 migration/channel.h  |  5 +
 migration/migration.c| 45 +---
 migration/multifd.c  | 12 ---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 7 files changed, 90 insertions(+), 26 deletions(-)

diff --git a/migration/channel.c b/migration/channel.c
index 1b0815039f..270b7acca2 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s,
 migrate_fd_connect(s, error);
 error_free(error);
 }
+
+
+/**
+ * @migration_channel_read_peek - Peek at the head of the migration channel,
+ *without actually removing the data from the channel buffer.
+ *
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read in @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns 0 if successful, returns -1 and sets @errp if fails.
+ */
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+ssize_t len = 0;
+struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+while (true) {
+len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+ NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
+error_setg(errp,
+   "Failed to read peek of channel");
+return -1;
+}
+
+if (len == buflen) {
+break;
+}
+
+/* 1ms sleep. */
+if (qemu_in_coroutine()) {
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+g_usleep(1000);
+}
+}
+
+return 0;
+}
diff --git a/migration/channel.h b/migration/channel.h
index 67a461c28a..5bdb8208a7 100644
--- a/migration/channel.h
+++ b/migration/channel.h
@@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error_in);
+
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index f485eea5fb..8509f20a80 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -31,6 +31,7 @@
 #include "migration.h"
 #include "savevm.h"
 #include "qemu-file.h"
+#include "channel.h"
 #include "migration/vmstate.h"
 #include "block/block.h"
 #include "qapi/error.h"
@@ -733,31 +734,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
 
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migrate_postcopy_ram() &&
+qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+/*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort, postcopy
+ * preempt channel does not send any magic number so avoid it for
+ 

Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra



On 23/11/22 9:57 pm, Peter Xu wrote:

On Wed, Nov 23, 2022 at 09:28:14PM +0530, manish.mishra wrote:

On 23/11/22 9:22 pm, Peter Xu wrote:

On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote:

+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+   ssize_t len = 0;
+   struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+   while (len < buflen) {
+   len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {

This needs to take care of partial len too?


Sorry Peter, I did not quite understand it. Can you please give some more
details?

As Daniel pointed out, I think if we peek partially it'll go into a loop.
Since we shouldn't read 0 anyway here, maybe you can directly write it as:

while (true) {
   len = read();
   if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
   error_setg(...);
   return -1;
   }
   if (len == buflen) {
   break;
   }
   /* For either partial peek or QIO_CHANNEL_ERR_BLOCK, retry with timeout */
   sleep();
}
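
For reference, a self-contained sketch of that retry pattern outside QEMU
(plain POSIX sockets; the helper name, the non-blocking flag, and the 1ms
sleep are illustrative assumptions, not the actual series code):

    #include <errno.h>
    #include <stddef.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Keep peeking until exactly buflen bytes are available (or fail). */
    static int peek_until_full(int fd, void *buf, size_t buflen)
    {
        for (;;) {
            ssize_t n = recv(fd, buf, buflen, MSG_PEEK | MSG_DONTWAIT);

            if (n < 0 && errno != EAGAIN && errno != EWOULDBLOCK) {
                return -1;              /* real error */
            }
            if (n == 0) {
                return -1;              /* unexpected EOF */
            }
            if (n == (ssize_t)buflen) {
                return 0;               /* full peek available */
            }
            usleep(1000);               /* partial peek or would-block: retry */
        }
    }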



Yes, got it, sorry, will update it.

Thanks

Manish Mishra


+if (qemu_in_coroutine()) {
+/* 1ms sleep. */
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   error_setg(errp,
+  "Unexpected end-of-file on channel");
+   return -1;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }
+
+   return 0;
+}

Thanks

Manish Mishra





Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra



On 23/11/22 9:28 pm, Daniel P. Berrangé wrote:

On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first
one will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can still be pending on the
destination side while a newer connection is established first, causing an
out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic applies only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number.
TLS live migrations already perform a TLS handshake before creating other
channels, so this issue cannot occur with TLS; hence this logic is skipped
for TLS live migrations. This patch uses a read peek to check the magic
number of channels so that the current data/control stream management
remains unaffected.

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
---
  migration/channel.c  | 46 
  migration/channel.h  |  5 +
  migration/migration.c| 45 ---
  migration/multifd.c  | 12 ---
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  7 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/migration/channel.c b/migration/channel.c
index 1b0815039f..a4600f52c5 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -92,3 +92,49 @@ void migration_channel_connect(MigrationState *s,
  migrate_fd_connect(s, error);
  error_free(error);
  }
+
+
+/**
+ * @migration_channel_read_peek - Peek at the head of the migration channel,
+ *without actually removing the data from the channel buffer.
+ *
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read in @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns 0 if successful, returns -1 and sets @errp if fails.
+ */
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+   ssize_t len = 0;
+   struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+   while (len < buflen) {
+   len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+/* 1ms sleep. */
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   error_setg(errp,
+  "Unexpected end-of-file on channel");
+   return -1;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This busy waits when len > 0 and < buflen


With regards,
Daniel


Sorry, Daniel, maybe I misunderstood something from the earlier discussions. I
thought we discussed that we may not prevent it from looping multiple times, but
we can qemu_co_sleep_ns() after every retry to deal with the busy wait?

Thanks

Manish Mishra





Re: [PATCH v4 2/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra



On 23/11/22 9:22 pm, Peter Xu wrote:

On Wed, Nov 23, 2022 at 03:05:27PM +, manish.mishra wrote:

+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+   ssize_t len = 0;
+   struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+   while (len < buflen) {
+   len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {

This needs to take care of partial len too?



Sorry Peter, I did not quite understand it. Can you please give some more
details?




+if (qemu_in_coroutine()) {
+/* 1ms sleep. */
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   error_setg(errp,
+  "Unexpected end-of-file on channel");
+   return -1;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }
+
+   return 0;
+}


Thanks

Manish Mishra




[PATCH v4 0/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first
one will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can still be pending on the
destination side while a newer connection is established first, causing an
out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic applies only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number.
TLS live migrations already perform a TLS handshake before creating other
channels, so this issue cannot occur with TLS; hence this logic is skipped
for TLS live migrations. This patch uses MSG_PEEK to check the magic number
of channels so that the current data/control stream management remains
unaffected.

v2:
  TLS does not support MSG_PEEK, so v1 was broken for TLS live
  migrations. For TLS live migration, the TLS handshake is done while
  initializing the main channel, before other channels can be created,
  so this issue cannot occur with TLS. v2 adds a check to skip the
  magic-number check for TLS live migration and fall back to the older
  method of deciding the mapping of channels on the destination side.

v3:
  1. Split change in two patches, io patch for read_peek routines,
 migration patch for migration related changes.
  2. Add flags to io_readv calls to get extra read flags like
 MSG_PEEK.
  3. Some other minor fixes.

v4:
  1. Removed common *all_eof routines for read peek and added one
 specific to live migration.
  2. Updated to use qemu_co_sleep_ns instead of qio_channel_yield.
  3. Some other minor fixes.

manish.mishra (2):
  io: Add support for MSG_PEEK for socket channel
  migration: check magic value for deciding the mapping of channels

 chardev/char-socket.c   |  4 +--
 include/io/channel.h|  6 
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 ++-
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 +++---
 migration/channel-block.c   |  1 +
 migration/channel.c | 46 +
 migration/channel.h |  5 
 migration/migration.c   | 45 
 migration/multifd.c | 12 +++-
 migration/multifd.h |  2 +-
 migration/postcopy-ram.c|  5 +---
 migration/postcopy-ram.h|  2 +-
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 22 files changed, 138 insertions(+), 36 deletions(-)

-- 
2.22.3




[PATCH v4 2/2] migration: check magic value for deciding the mapping of channels

2022-11-23 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first
one will always be the main channel, followed by the multifd or post-copy
preemption channels. This may not always be true: even if a channel has a
connection established on the source side, it can still be pending on the
destination side while a newer connection is established first, causing an
out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number, and this patch uses that magic number to decide the type of channel.
This logic applies only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number.
TLS live migrations already perform a TLS handshake before creating other
channels, so this issue cannot occur with TLS; hence this logic is skipped
for TLS live migrations. This patch uses a read peek to check the magic
number of channels so that the current data/control stream management
remains unaffected.

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
---
 migration/channel.c  | 46 
 migration/channel.h  |  5 +
 migration/migration.c| 45 ---
 migration/multifd.c  | 12 ---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 7 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/migration/channel.c b/migration/channel.c
index 1b0815039f..a4600f52c5 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -92,3 +92,49 @@ void migration_channel_connect(MigrationState *s,
 migrate_fd_connect(s, error);
 error_free(error);
 }
+
+
+/**
+ * @migration_channel_read_peek - Peek at the head of the migration channel,
+ *without actually removing the data from the channel buffer.
+ *
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read in @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns 0 if successful, returns -1 and sets @errp if fails.
+ */
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp)
+{
+   ssize_t len = 0;
+   struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+
+   while (len < buflen) {
+   len = qio_channel_readv_full(ioc, &iov, 1, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+/* 1ms sleep. */
+qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   error_setg(errp,
+  "Unexpected end-of-file on channel");
+   return -1;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }
+
+   return 0;
+}
diff --git a/migration/channel.h b/migration/channel.h
index 67a461c28a..5bdb8208a7 100644
--- a/migration/channel.h
+++ b/migration/channel.h
@@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error_in);
+
+int migration_channel_read_peek(QIOChannel *ioc,
+const char *buf,
+const size_t buflen,
+Error **errp);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index f485eea5fb..8509f20a80 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -31,6 +31,7 @@
 #include "migration.h"
 #include "savevm.h"
 #include "qemu-file.h"
+#include "channel.h"
 #include "migration/vmstate.h"
 #include "block/block.h"
 #include "qapi/error.h"
@@ -733,31 +734,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
 
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migrate_postcopy_ram() &&
+qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+/*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort, postcopy

[PATCH v4 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-23 Thread manish.mishra
MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
 chardev/char-socket.c   |  4 ++--
 include/io/channel.h|  6 ++
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 17 -
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 16 
 migration/channel-block.c   |  1 +
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 15 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
 if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
 } else {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  NULL, NULL,
- NULL);
+ 0, NULL);
 }
 
 if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..716235d496 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
 QIO_CHANNEL_FEATURE_SHUTDOWN,
 QIO_CHANNEL_FEATURE_LISTEN,
 QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@ struct QIOChannelClass {
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
+   int flags,
Error **errp);
 
 
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index bf52011be2..8096180f85 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp)
 {
 QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index 74516252ba..e7edd091af 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -203,6 +203,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
  size_t niov,
  int **fds,
  size_t *nfds,
+ int flags,
  Error **errp)
 {
 QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index b67687c2aa..d76663e6ae 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
   size_t niov,
   int **fds,
   size_t *nfds,
+  int flags,
   Error **errp)
 {
 QIOChannelFi

Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-22 Thread manish.mishra



On 22/11/22 10:03 pm, Peter Xu wrote:

On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote:

On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:

On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:

On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:

On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:

On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:

On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:

On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
chardev/char-socket.c   |  4 +-
include/io/channel.h| 83 +
io/channel-buffer.c |  1 +
io/channel-command.c|  1 +
io/channel-file.c   |  1 +
io/channel-null.c   |  1 +
io/channel-socket.c | 16 +-
io/channel-tls.c|  1 +
io/channel-websock.c|  1 +
io/channel.c| 73 +++--
migration/channel-block.c   |  1 +
scsi/qemu-pr-helper.c   |  2 +-
tests/qtest/tpm-emu.c   |  2 +-
tests/unit/test-io-channel-socket.c |  1 +
util/vhost-user-server.c|  2 +-
15 files changed, 179 insertions(+), 11 deletions(-)
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
}
#endif /* WIN32 */
+qio_channel_set_feature(QIO_CHANNEL(cioc), 
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async

Yes, sorry, I considered only the current use case, but as it is a generic one
both should be there. Thanks, will update it.


@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
}
#endif /* WIN32 */
-
#ifdef QEMU_MSG_ZEROCOPY
static int qio_channel_socket_flush(QIOChannel *ioc,
Error **errp)

Please remove this unrelated whitespace change.



@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
}
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+   len = qio_channel_readv_full(ioc, iov, niov, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+qio_channel_yield(ioc, G_IO_IN);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   return 0;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This will busy wait burning CPU where there is a read > 0 and < total.


Daniel, I could use MSG_WAITALL too if that works, but then we would lose
the opportunity to yield. Or do you have some other idea?

How easy would this happen?

Another alternative is we could just return the partial len to caller then
we fallback to the original channel orders if it happens.  And then if it
mostly will never happen it'll behave merely the same as what we want.

Well we're trying to deal with a bug where the slow and/or unreliable
network causes channels to arrive in unexpected order. Given we know
we're having network trouble, I wouldn't want to make more assumptions
about things happening correctly.


With regards,
Daniel


Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but it looks
like, even though the chances are lower, it can still return partial data even
with multiple retries in the signal case, so it is not foolproof.

*MSG_WAITALL *(since Linux 2.2)
   This flag requests that the operation block until the full
   request is satisfied.  However, the call may still return
   less data than requested if a signal is caught, an error
   or disconnect occurs, or the next data to be received is
   of a different type than that returned.  This flag has no
   effect for datagram sockets.

Actual read ahead will be little hackish, so just confirming we all are in 
a

Re: [PATCH] multifd: Updated QAPI format for 'migrate' qemu monitor command

2022-11-22 Thread manish.mishra



On 22/11/22 2:53 pm, Daniel P. Berrangé wrote:

On Mon, Nov 21, 2022 at 01:40:27PM +0100, Juan Quintela wrote:

Het Gala  wrote:

To prevent double encoding of URIs, instead of passing the transport
mechanism, host address and port all together in the form of a single string
and writing different parsing functions, we intend the user to explicitly
specify most of the parameters through the QMP command itself.

The current patch is a continuation of the patchset series
https://www.mail-archive.com/qemu-devel@nongnu.org/msg901274.html
and a reply to the ongoing discussion about a better QAPI design here:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg903753.html

Suggested-by: Daniel P. Berrange 
Suggested-by: Aravind Retnakaran 
Suggested-by: Manish Mishra 
Signed-off-by: Het Gala 





+{ 'struct': 'MigrateChannel',
+  'data' : {
+'channeltype' : 'MigrateChannelType',
+'*src-addr' : 'MigrateAddress',
+'dest-addr' : 'MigrateAddress',

Why do we want *both* addresses?

This is part of their goal to have source based routing, so that
traffic exits the src host via a particular NIC.

I think this patch would be better if we split it into two parts.

First introduce "MigrateChannel" struct with *only* the 'dest-addr'
field, and only allow one element in the list, making 'uri' optional.

Basically the first patch would *only* be about switching the
'migrate' command from using a plain string to a MigrateAddress
structure.

A second patch would then extend it to allow multiple MigrateChannel
elements, to support different destinations for each channel.

A third patch would then  extend it to add src-addr to attain the
source based routing.

A fourth patch can then deprecate the existing 'uri' field.


+'*multifd-count' : 'int' } }

And if we are passing a list, why do we want to pass the real number?

Yeah, technically I think this field is redundant, as you can just
add many entries to the 'channels' list, using the same addresses.
All this field does is allow a more compact JSON arg list. I'm
not sure this is neccessary, unless we're expecting huge numbers
of 'channels', and even then this isn't likely to be a performance
issue.


  # -> { "execute": "migrate", "arguments": { "uri": "tcp:0:4446" } }
  # <- { "return": {} }
  #
+# -> { "execute": "migrate",
+#  "arguments": {
+#  "channels": [ { 'channeltype': 'control',
+#  'dest-addr': {'transport': 'socket',
+#'type': 'inet',
+#'host': '10.12.34.9', 'port': 
'1050'}},
+#{ 'channeltype': 'data',
+#  'src-addr': {'transport': 'socket',
+#'type': 'inet',
+#'host': '10.12.34.9',
+#'port': '4000', 'ipv4': 'true'},
+#  'dest-addr': { 'transport': 'socket',
+#  'type': 'inet',
+#  'host': '10.12.34.92',
+#  'port': '1234', 'ipv4': 'true'},
+#  'multifd-count': 5 },
+#{ 'channeltype': 'data',
+#  'src-addr': {'transport': 'socket',
+#'type': 'inet',
+#'host': '10.2.3.4', 'port': '1000'},
+#  'dest-addr': {'transport': 'socket',
+# 'type': 'inet',
+# 'host': '0.0.0.4', 'port': '3000'},
+#  'multifd-count': 3 } ] } }
+# <- { "return": {} }
+#
  ##
  { 'command': 'migrate',
-  'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool',
-   '*detach': 'bool', '*resume': 'bool' } }
+  'data': {'*uri': 'str', '*channels': ['MigrateChannel'], '*blk': 'bool',

I think that "uri" bit should be dropped, right?

No, it is required for back compatibility with existing clients.
It should be marked deprecated, and removed several releases later,
at which point 'chanels' can become mandatory instead of optional.


+   '*inc': 'bool', '*detach': 'bool', '*resume': 'bool' } }
  
  ##

  # @migrate-incoming:

I can't see how to make the old one work on top of this one (i.e. we
would have to create strings from lists in QAPI; I think that is just
too much).

All we need

Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-22 Thread manish.mishra


On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:

On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:

On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:

On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:

On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
   chardev/char-socket.c   |  4 +-
   include/io/channel.h| 83 +
   io/channel-buffer.c |  1 +
   io/channel-command.c|  1 +
   io/channel-file.c   |  1 +
   io/channel-null.c   |  1 +
   io/channel-socket.c | 16 +-
   io/channel-tls.c|  1 +
   io/channel-websock.c|  1 +
   io/channel.c| 73 +++--
   migration/channel-block.c   |  1 +
   scsi/qemu-pr-helper.c   |  2 +-
   tests/qtest/tpm-emu.c   |  2 +-
   tests/unit/test-io-channel-socket.c |  1 +
   util/vhost-user-server.c|  2 +-
   15 files changed, 179 insertions(+), 11 deletions(-)



diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
   }
   #endif /* WIN32 */
+qio_channel_set_feature(QIO_CHANNEL(cioc), 
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async


Yes, sorry, I considered only the current use case, but as it is a generic one
both should be there. Thanks, will update it.


@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
   }
   #endif /* WIN32 */
-
   #ifdef QEMU_MSG_ZEROCOPY
   static int qio_channel_socket_flush(QIOChannel *ioc,
   Error **errp)

Please remove this unrelated whitespace change.



@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
   return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
   }
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+   len = qio_channel_readv_full(ioc, iov, niov, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+qio_channel_yield(ioc, G_IO_IN);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   return 0;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This will busy wait burning CPU where there is a read > 0 and < total.


Daniel, I could use MSG_WAITALL too if that works, but then we would lose
the opportunity to yield. Or do you have some other idea?

How easy would this happen?

Another alternative is we could just return the partial len to caller then
we fallback to the original channel orders if it happens.  And then if it
mostly will never happen it'll behave merely the same as what we want.

Well we're trying to deal with a bug where the slow and/or unreliable
network causes channels to arrive in unexpected order. Given we know
we're having network trouble, I wouldn't want to make more assumptions
about things happening correctly.


With regards,
Daniel



Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but it looks
like, even though the chances are lower, it can still return partial data even
with multiple retries in the signal case, so it is not foolproof.

*MSG_WAITALL *(since Linux 2.2)
  This flag requests that the operation block until the full
  request is satisfied.  However, the call may still return
  less data than requested if a signal is caught, an error
  or disconnect occurs, or the next data to be received is
  of a different type than that returned.  This flag has no
  effect for datagram sockets.
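
A minimal illustrative sketch (plain POSIX sockets, not QEMU API) of the
MSG_PEEK | MSG_WAITALL combination being discussed; as the man-page excerpt
above notes, even this can return fewer bytes than requested, so the caller
still has to check the length and retry:

    #include <stddef.h>
    #include <sys/socket.h>
    #include <sys/types.h>

    /* Hypothetical helper, not part of QEMU: peek buflen bytes, asking the
     * kernel to block until they are all available. A short return is still
     * possible (signal, disconnect), so the result length must be checked. */
    static ssize_t peek_exact(int fd, void *buf, size_t buflen)
    {
        return recv(fd, buf, buflen, MSG_PEEK | MSG_WAITALL);
    }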

An actual read-ahead will be a little hackish, so just confirming that we are
all in agreement to do the actual read-ahead, and then I can send a patch? :)

Thanks
Manish Mishra


Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-22 Thread manish.mishra



On 22/11/22 3:23 pm, Daniel P. Berrangé wrote:

On Tue, Nov 22, 2022 at 03:10:53PM +0530, manish.mishra wrote:

On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:

On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:

On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
chardev/char-socket.c   |  4 +-
include/io/channel.h| 83 +
io/channel-buffer.c |  1 +
io/channel-command.c|  1 +
io/channel-file.c   |  1 +
io/channel-null.c   |  1 +
io/channel-socket.c | 16 +-
io/channel-tls.c|  1 +
io/channel-websock.c|  1 +
io/channel.c| 73 +++--
migration/channel-block.c   |  1 +
scsi/qemu-pr-helper.c   |  2 +-
tests/qtest/tpm-emu.c   |  2 +-
tests/unit/test-io-channel-socket.c |  1 +
util/vhost-user-server.c|  2 +-
15 files changed, 179 insertions(+), 11 deletions(-)
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
}
#endif /* WIN32 */
+qio_channel_set_feature(QIO_CHANNEL(cioc), 
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async

Yes, sorry, I considered only the current use case, but as it is a generic one
both should be there. Thanks, will update it.


@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
}
#endif /* WIN32 */
-
#ifdef QEMU_MSG_ZEROCOPY
static int qio_channel_socket_flush(QIOChannel *ioc,
Error **errp)

Please remove this unrelated whitespace change.



@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
}
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+   len = qio_channel_readv_full(ioc, iov, niov, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+qio_channel_yield(ioc, G_IO_IN);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   return 0;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This will busy wait burning CPU where there is a read > 0 and < total.


Daniel, I could use MSG_WAITALL too if that works, but then we would lose
the opportunity to yield. Or do you have some other idea?

I fear this is an inherent problem with the idea of using PEEK to
look at the magic data.

If we actually read the magic bytes off the wire, then we could have
the same code path for TLS and non-TLS. We would have to modify the
existing later code paths though to take account of fact that the
magic was already read by an earlier codepath.

With regards,
Daniel
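
For illustration, a rough sketch of the read-ahead alternative described
above: consume the magic bytes once, up front, and record them so that later
code paths know they have already been read. The structure and function names
are made up for the example; this is not the actual QEMU implementation.

    #include <stdbool.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Hypothetical state kept per incoming channel. */
    typedef struct IncomingChannelState {
        uint32_t magic;           /* magic consumed up front */
        bool     magic_consumed;  /* later readers must honour this */
    } IncomingChannelState;

    /* Read (and consume) the 4-byte magic; for TLS the read would have to
     * go through the TLS channel layer instead of the raw fd. */
    static int read_channel_magic(int fd, IncomingChannelState *st)
    {
        uint8_t buf[4];
        size_t got = 0;

        while (got < sizeof(buf)) {
            ssize_t n = read(fd, buf + got, sizeof(buf) - got);
            if (n <= 0) {
                return -1;  /* EOF or error */
            }
            got += n;
        }
        st->magic = (uint32_t)buf[0] << 24 | (uint32_t)buf[1] << 16 |
                    (uint32_t)buf[2] << 8  | (uint32_t)buf[3];
        st->magic_consumed = true;
        return 0;
    }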


Sure Daniel, I am happy to drop the use of MSG_PEEK, but that way we also
have an issue with TLS for the reason we discussed in v2. Is it okay to send
a patch with an actual read-ahead but not for the TLS case? TLS anyway does
not have this bug as it does a handshake.

I've re-read the previous threads, but I don't see what the problem
with TLS is.  We already decided that TLS is not affected by the
race condition. So there should be no problem in reading the magic
bytes early on the TLS channels, while reading the bytes early on
a non-TLS channel will fix the race condition.



Actually, with TLS a channel is considered established only after the
handshake completes, and on the source side we do the initial qemu_flush
only once all channels are established. But on the destination side we
would be stuck reading the magic on the main channel itself, which never
arrives because the source has not flushed any data, so no new connections
can be established (e.g. multifd). So basically the destination cannot
accept any new channel until we read from the main channel, and the source
is not putting any data on the mai

Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-22 Thread manish.mishra



On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:

On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:

On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
   chardev/char-socket.c   |  4 +-
   include/io/channel.h| 83 +
   io/channel-buffer.c |  1 +
   io/channel-command.c|  1 +
   io/channel-file.c   |  1 +
   io/channel-null.c   |  1 +
   io/channel-socket.c | 16 +-
   io/channel-tls.c|  1 +
   io/channel-websock.c|  1 +
   io/channel.c| 73 +++--
   migration/channel-block.c   |  1 +
   scsi/qemu-pr-helper.c   |  2 +-
   tests/qtest/tpm-emu.c   |  2 +-
   tests/unit/test-io-channel-socket.c |  1 +
   util/vhost-user-server.c|  2 +-
   15 files changed, 179 insertions(+), 11 deletions(-)



diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
   }
   #endif /* WIN32 */
+qio_channel_set_feature(QIO_CHANNEL(cioc), 
QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async


Yes, sorry, I considered only the current use case, but as it is a generic one
both should be there. Thanks, will update it.


@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
   }
   #endif /* WIN32 */
-
   #ifdef QEMU_MSG_ZEROCOPY
   static int qio_channel_socket_flush(QIOChannel *ioc,
   Error **errp)

Please remove this unrelated whitespace change.



@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
   return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
   }
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+   len = qio_channel_readv_full(ioc, iov, niov, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+qio_channel_yield(ioc, G_IO_IN);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   return 0;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This will busy wait burning CPU where there is a read > 0 and < total.


Daniel, I could use MSG_WAITALL too if that works, but then we would lose
the opportunity to yield. Or do you have some other idea?

I fear this is an inherent problem with the idea of using PEEK to
look at the magic data.

If we actually read the magic bytes off the wire, then we could have
the same code path for TLS and non-TLS. We would have to modify the
existing later code paths though to take account of fact that the
magic was already read by an earlier codepath.

With regards,
Daniel



Sure Daniel, I am happy to drop the use of MSG_PEEK, but that way we also
have an issue with TLS for the reason we discussed in v2. Is it okay to send
a patch with an actual read-ahead but not for the TLS case? TLS anyway does
not have this bug as it does a handshake.

Thanks

Manish Mishra




Re: [PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-22 Thread manish.mishra



On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:

On Sat, Nov 19, 2022 at 09:36:14AM +, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return this data. This
support is currently added only for the socket class. An extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
  chardev/char-socket.c   |  4 +-
  include/io/channel.h| 83 +
  io/channel-buffer.c |  1 +
  io/channel-command.c|  1 +
  io/channel-file.c   |  1 +
  io/channel-null.c   |  1 +
  io/channel-socket.c | 16 +-
  io/channel-tls.c|  1 +
  io/channel-websock.c|  1 +
  io/channel.c| 73 +++--
  migration/channel-block.c   |  1 +
  scsi/qemu-pr-helper.c   |  2 +-
  tests/qtest/tpm-emu.c   |  2 +-
  tests/unit/test-io-channel-socket.c |  1 +
  util/vhost-user-server.c|  2 +-
  15 files changed, 179 insertions(+), 11 deletions(-)




diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
  }
  #endif /* WIN32 */
  
+qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);

+

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async



Yes, sorry, I considered only the current use case, but as this is a generic feature,
both sides should set it. Thanks, I will update it.




@@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
-

  #ifdef QEMU_MSG_ZEROCOPY
  static int qio_channel_socket_flush(QIOChannel *ioc,
  Error **errp)

Please remove this unrelated whitespace change.



@@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
  return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
  }
  
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,

+   const struct iovec *iov,
+   size_t niov,
+   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+   len = qio_channel_readv_full(ioc, iov, niov, NULL,
+NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+   if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+qio_channel_yield(ioc, G_IO_IN);
+} else {
+qio_channel_wait(ioc, G_IO_IN);
+}
+continue;
+   }
+   if (len == 0) {
+   return 0;
+   }
+   if (len < 0) {
+   return -1;
+   }
+   }

This will busy-wait, burning CPU, whenever a read returns > 0 but < total.



Daniel, I could use MSG_WAITALL too if that works, but then we would lose the
opportunity to yield. Or perhaps you have some other idea.



With regards,
Daniel



Thanks

Manish Mishra




Re: [PATCH 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-19 Thread manish.mishra



On 19/11/22 3:06 pm, manish.mishra wrote:

MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return it. This
support is currently added only for the socket class. An extra parameter
'flags' is added to the io_readv calls to pass extra read flags such as
MSG_PEEK.
---
  chardev/char-socket.c   |  4 +-
  include/io/channel.h| 83 +
  io/channel-buffer.c |  1 +
  io/channel-command.c|  1 +
  io/channel-file.c   |  1 +
  io/channel-null.c   |  1 +
  io/channel-socket.c | 16 +-
  io/channel-tls.c|  1 +
  io/channel-websock.c|  1 +
  io/channel.c| 73 +++--
  migration/channel-block.c   |  1 +
  scsi/qemu-pr-helper.c   |  2 +-
  tests/qtest/tpm-emu.c   |  2 +-
  tests/unit/test-io-channel-socket.c |  1 +
  util/vhost-user-server.c|  2 +-
  15 files changed, 179 insertions(+), 11 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, 
size_t len)
  if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
  ret = qio_channel_readv_full(s->ioc, &iov, 1,
   &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
  } else {
  ret = qio_channel_readv_full(s->ioc, &iov, 1,
   NULL, NULL,
- NULL);
+ 0, NULL);
  }
  
  if (msgfds_num) {

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..cbcde4b88f 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
  
  #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
  
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1

+
  typedef enum QIOChannelFeature QIOChannelFeature;
  
  enum QIOChannelFeature {

@@ -41,6 +43,7 @@ enum QIOChannelFeature {
  QIO_CHANNEL_FEATURE_SHUTDOWN,
  QIO_CHANNEL_FEATURE_LISTEN,
  QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
  };
  
  
@@ -114,6 +117,7 @@ struct QIOChannelClass {

  size_t niov,
  int **fds,
  size_t *nfds,
+int flags,
  Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
   * @niov: the length of the @iov array
   * @fds: pointer to an array that will received file handles
   * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
   * @errp: pointer to a NULL-initialized error object
   *
   * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
 size_t niov,
 int **fds,
 size_t *nfds,
+   int flags,
 Error **errp);
  
  
@@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,

size_t niov,
Error **errp);
  
+/**

+ * qio_channel_readv_peek_all_eof:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the peek of IO channel without
+ * actually removing it from channel buffer, storing
+ * it in the memory regions referenced by @iov. Each
+ * element in the @iov will be fully populated with
+ * data before the next one is used. The @niov
+ * parameter specifies the total number of elements
+ * in @iov.
+ *
+ * The function will wait for all requested data
+ * to be read, yielding from the current coroutine
+ * if required.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp);
+
+
  /**
   * qio_channel_readv_all:
   * @ioc: the channel object
@@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc,
Error **errp);
  
  
+/**

+ * qio_channel_readv_peek_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read dat

check magic value for deciding the mapping of channels

2022-11-19 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channel. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number; this patch uses that magic number to decide the type of channel. This
logic is applicable only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number. Also,
TLS live migrations already do a TLS handshake before creating the other
channels, so this issue is not possible with TLS; hence this logic is skipped
for TLS live migrations. This patch uses MSG_PEEK to check the magic number of
the channels so that the current data/control stream management remains
unaffected.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: manish.mishra 

v2:
  TLS does not support MSG_PEEK, so V1 was broken for TLS live
  migrations. For TLS live migration, the TLS handshake is done while
  initializing the main channel, before we can create other channels, so this
  issue is not possible for TLS live migrations. V2 adds a check to skip the
  magic-number check for TLS live migrations and falls back to the older
  method of deciding the mapping of channels on the destination side.

v3:
  1. Split the change into two patches: an io patch for the read_peek
     routines, and a migration patch for the migration-related changes.
  2. Add a flags argument to the io_readv calls to pass extra read flags
     like MSG_PEEK.
  3. Some other minor fixes.

manish.mishra (2):
  io: Add support for MSG_PEEK for socket channel
  migration: check magic value for deciding the mapping of channels

 chardev/char-socket.c   |  4 +-
 include/io/channel.h| 83 +
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 16 +-
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 73 +++--
 migration/channel-block.c   |  1 +
 migration/migration.c   | 44 ++-
 migration/multifd.c | 12 ++---
 migration/multifd.h |  2 +-
 migration/postcopy-ram.c|  5 +-
 migration/postcopy-ram.h|  2 +-
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 20 files changed, 218 insertions(+), 37 deletions(-)

-- 
2.22.3
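
The channel-type detection described in this cover letter reduces to comparing
the first four peeked bytes against the main channel's magic in wire (big-endian)
order. A standalone sketch of just that comparison follows; the 0x5145564d
("QEVM") value is assumed from QEMU's savevm code and the second value is
purely illustrative:

#include <arpa/inet.h>   /* htonl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QEMU_VM_FILE_MAGIC 0x5145564d   /* "QEVM"; value assumed, not quoted from this series */

/* Decide whether a freshly peeked 4-byte prefix marks the main channel. */
static int is_main_channel(const uint8_t *peeked)
{
    uint32_t magic;

    memcpy(&magic, peeked, sizeof(magic));      /* bytes exactly as received */
    return magic == htonl(QEMU_VM_FILE_MAGIC);  /* wire format is big-endian */
}

int main(void)
{
    uint8_t main_hdr[4]  = { 'Q', 'E', 'V', 'M' };      /* main channel prefix */
    uint8_t other_hdr[4] = { 0x11, 0x22, 0x33, 0x44 };   /* some other magic */

    printf("main channel?  %d\n", is_main_channel(main_hdr));
    printf("other channel? %d\n", is_main_channel(other_hdr));
    return 0;
}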




[PATCH v3 2/2] migration: check magic value for deciding the mapping of channels

2022-11-19 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channel. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number; this patch uses that magic number to decide the type of channel. This
logic is applicable only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number. Also,
TLS live migrations already do a TLS handshake before creating the other
channels, so this issue is not possible with TLS; hence this logic is skipped
for TLS live migrations. This patch uses a read peek to check the magic number of
the channels so that the current data/control stream management remains
unaffected.

Suggested-by: Daniel P. Berrangé 
---
 migration/migration.c| 44 +---
 migration/multifd.c  | 12 ---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 5 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..787e678d48 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
 
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migrate_postcopy_ram() &&
+qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+/*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort, postcopy
+ * preempt channel does not send any magic number so avoid it for
+ * postcopy live migration. Also tls live migration already does
+ * tls handshake while initializing main channel so with tls this
+ * issue is not possible.
+ */
+ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+sizeof(channel_magic), &local_err);
+
+if (ret != 0) {
+error_propagate(errp, local_err);
+return;
+}
+
+default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
+} else {
+default_channel = !mis->from_src_file;
+}
+
+if (default_channel) {
 f = qemu_file_new_input(ioc);
 
 if (!migration_incoming_setup(f, errp)) {
 return;
 }
-
-/*
- * Common migration only needs one channel, so we can start
- * right now.  Some features need more than one channel, we wait.
- */
-start_migration = !migration_needs_multiple_sockets();
 } else {
 /* Multiple connections */
 assert(migration_needs_multiple_sockets());
 if (migrate_use_multifd()) {
-start_migration = multifd_recv_new_channel(ioc, &local_err);
+multifd_recv_new_channel(ioc, &local_err);
 } else {
 assert(migrate_postcopy_preempt());
 f = qemu_file_new_input(ioc);
-start_migration = postcopy_preempt_new_channel(mis, f);
+postcopy_preempt_new_channel(mis, f);
 }
 if (local_err) {
 error_propagate(errp, local_err);
@@ -765,7 +785,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error 
**errp)
 }
 }
 
-if (start_migration) {
+if (migration_has_all_channels()) {
 /* If it's a recovery, we're done */
 if (postcopy_try_recover()) {
 return;
diff --git a/migration/multifd.c b/migration/multifd.c
index 586ddc9d65..be86a4d07f 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -1220,11 +1220,9 @@ bool multifd_recv_all_channels_created(void)
 
 /*
  * Try to receive all multifd channels to get ready for the migration.
- * - Return true and do not set @errp when correctly receiving all channels;
- * - Return false and do not set @errp when correctly receiving the current 
one;
- * - Return false and set @errp when failing to receive the current channel.
+ * Sets @errp when failing to receive the current channel.
  */
-bool multifd_recv_new_channel(QIOChannel

[PATCH 2/2] migration: check magic value for deciding the mapping of channels

2022-11-19 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channel. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number; this patch uses that magic number to decide the type of channel. This
logic is applicable only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number. Also,
TLS live migrations already do a TLS handshake before creating the other
channels, so this issue is not possible with TLS; hence this logic is skipped
for TLS live migrations. This patch uses a read peek to check the magic number of
the channels so that the current data/control stream management remains
unaffected.
---
 migration/migration.c| 44 +---
 migration/multifd.c  | 12 ---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 5 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..787e678d48 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,51 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
 
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migrate_postcopy_ram() &&
+qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+/*
+ * With multiple channels, it is possible that we receive channels
+ * out of order on destination side, causing incorrect mapping of
+ * source channels on destination side. Check channel MAGIC to
+ * decide type of channel. Please note this is best effort, postcopy
+ * preempt channel does not send any magic number so avoid it for
+ * postcopy live migration. Also tls live migration already does
+ * tls handshake while initializing main channel so with tls this
+ * issue is not possible.
+ */
+ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+sizeof(channel_magic), &local_err);
+
+if (ret != 0) {
+error_propagate(errp, local_err);
+return;
+}
+
+default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
+} else {
+default_channel = !mis->from_src_file;
+}
+
+if (default_channel) {
 f = qemu_file_new_input(ioc);
 
 if (!migration_incoming_setup(f, errp)) {
 return;
 }
-
-/*
- * Common migration only needs one channel, so we can start
- * right now.  Some features need more than one channel, we wait.
- */
-start_migration = !migration_needs_multiple_sockets();
 } else {
 /* Multiple connections */
 assert(migration_needs_multiple_sockets());
 if (migrate_use_multifd()) {
-start_migration = multifd_recv_new_channel(ioc, &local_err);
+multifd_recv_new_channel(ioc, &local_err);
 } else {
 assert(migrate_postcopy_preempt());
 f = qemu_file_new_input(ioc);
-start_migration = postcopy_preempt_new_channel(mis, f);
+postcopy_preempt_new_channel(mis, f);
 }
 if (local_err) {
 error_propagate(errp, local_err);
@@ -765,7 +785,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error 
**errp)
 }
 }
 
-if (start_migration) {
+if (migration_has_all_channels()) {
 /* If it's a recovery, we're done */
 if (postcopy_try_recover()) {
 return;
diff --git a/migration/multifd.c b/migration/multifd.c
index 586ddc9d65..be86a4d07f 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -1220,11 +1220,9 @@ bool multifd_recv_all_channels_created(void)
 
 /*
  * Try to receive all multifd channels to get ready for the migration.
- * - Return true and do not set @errp when correctly receiving all channels;
- * - Return false and do not set @errp when correctly receiving the current 
one;
- * - Return false and set @errp when failing to receive the current channel.
+ * Sets @errp when failing to receive the current channel.
  */
-bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
+void multifd_

[PATCH v3 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-19 Thread manish.mishra
MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return it. This
support is currently added only for the socket class. An extra parameter
'flags' is added to the io_readv calls to pass extra read flags such as
MSG_PEEK.

Suggested-by: Daniel P. Berrangé 
---
 chardev/char-socket.c   |  4 +-
 include/io/channel.h| 83 +
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 16 +-
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 73 +++--
 migration/channel-block.c   |  1 +
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 15 files changed, 179 insertions(+), 11 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, 
size_t len)
 if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
 } else {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  NULL, NULL,
- NULL);
+ 0, NULL);
 }
 
 if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..cbcde4b88f 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
 QIO_CHANNEL_FEATURE_SHUTDOWN,
 QIO_CHANNEL_FEATURE_LISTEN,
 QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@ struct QIOChannelClass {
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
+   int flags,
Error **errp);
 
 
@@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
   size_t niov,
   Error **errp);
 
+/**
+ * qio_channel_readv_peek_all_eof:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the peek of IO channel without
+ * actually removing it from channel buffer, storing
+ * it in the memory regions referenced by @iov. Each
+ * element in the @iov will be fully populated with
+ * data before the next one is used. The @niov
+ * parameter specifies the total number of elements
+ * in @iov.
+ *
+ * The function will wait for all requested data
+ * to be read, yielding from the current coroutine
+ * if required.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp);
+
+
 /**
  * qio_channel_readv_all:
  * @ioc: the channel object
@@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc,
   Error **errp);
 
 
+/**
+ * qio_channel_readv_peek_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data

[PATCH 1/2] io: Add support for MSG_PEEK for socket channel

2022-11-19 Thread manish.mishra
MSG_PEEK reads from the head of the channel; the data is treated as
unread and the next read will still return it. This
support is currently added only for the socket class. An extra parameter
'flags' is added to the io_readv calls to pass extra read flags such as
MSG_PEEK.
---
 chardev/char-socket.c   |  4 +-
 include/io/channel.h| 83 +
 io/channel-buffer.c |  1 +
 io/channel-command.c|  1 +
 io/channel-file.c   |  1 +
 io/channel-null.c   |  1 +
 io/channel-socket.c | 16 +-
 io/channel-tls.c|  1 +
 io/channel-websock.c|  1 +
 io/channel.c| 73 +++--
 migration/channel-block.c   |  1 +
 scsi/qemu-pr-helper.c   |  2 +-
 tests/qtest/tpm-emu.c   |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c|  2 +-
 15 files changed, 179 insertions(+), 11 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, 
size_t len)
 if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  &msgfds, &msgfds_num,
- NULL);
+ 0, NULL);
 } else {
 ret = qio_channel_readv_full(s->ioc, &iov, 1,
  NULL, NULL,
- NULL);
+ 0, NULL);
 }
 
 if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..cbcde4b88f 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
 QIO_CHANNEL_FEATURE_SHUTDOWN,
 QIO_CHANNEL_FEATURE_LISTEN,
 QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@ struct QIOChannelClass {
 size_t niov,
 int **fds,
 size_t *nfds,
+int flags,
 Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
size_t niov,
int **fds,
size_t *nfds,
+   int flags,
Error **errp);
 
 
@@ -300,6 +306,34 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
   size_t niov,
   Error **errp);
 
+/**
+ * qio_channel_readv_peek_all_eof:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the peek of IO channel without
+ * actually removing it from channel buffer, storing
+ * it in the memory regions referenced by @iov. Each
+ * element in the @iov will be fully populated with
+ * data before the next one is used. The @niov
+ * parameter specifies the total number of elements
+ * in @iov.
+ *
+ * The function will wait for all requested data
+ * to be read, yielding from the current coroutine
+ * if required.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   Error **errp);
+
+
 /**
  * qio_channel_readv_all:
  * @ioc: the channel object
@@ -328,6 +362,34 @@ int qio_channel_readv_all(QIOChannel *ioc,
   Error **errp);
 
 
+/**
+ * qio_channel_readv_peek_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the the peek of IO channel wi

Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-16 Thread manish.mishra



On 16/11/22 4:57 pm, Daniel P. Berrangé wrote:

On Wed, Nov 16, 2022 at 04:49:18PM +0530, manish.mishra wrote:

On 16/11/22 12:20 am, Daniel P. Berrangé wrote:

On Tue, Nov 15, 2022 at 06:11:30PM +, Daniel P. Berrangé wrote:

On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

Suggested-by: Daniel P. Berrangé
Signed-off-by: manish.mishra

v2:
TLS does not support MSG_PEEK, so V1 was broken for tls live
migrations. For tls live migration, while initializing main channel
tls handshake is done before we can create other channels, so this
issue is not possible for tls live migrations. In V2 added a check
to avoid checking magic number for tls live migration and fallback
to older method to decide mapping of channels on destination side.
---
   include/io/channel.h | 25 +++
   io/channel-socket.c  | 27 
   io/channel.c | 39 +++
   migration/migration.c| 44 +---
   migration/multifd.c  | 12 ---
   migration/multifd.h  |  2 +-
   migration/postcopy-ram.c |  5 +
   migration/postcopy-ram.h |  2 +-
   8 files changed, 130 insertions(+), 26 deletions(-)

This should be two commits, because the 'io' and 'migration'
code are two separate subsystems in QEMU.


diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
   int **fds,
   size_t *nfds,
   Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);

This API should be called "io_read_peekv" and use
"const struct iovec *iov", so that it matches the
design of 'io_readv'.

There should also be a QIOChannelFeature flag
registered to indicate whether a given channel
impl supports peeking at data.



@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
 size_t buflen,
 Error **errp);
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);

There should be qio_channel_read_peek, qio_channel_read_peekv,
qio_channel_read_peek_all and qio_channel_read_peekv_all.

Actually ignore that.  We should not add any new APIs at
all.  Instead the io_readv callback, and the qio_channel_read*all()
methods should gain an 'int flags' parameter, in the same way that
the write methods have one. Then there should be a
QIO_CHANNEL_READ_FLAG_PEEK constant defined.

Hi Daniel, as MSG_PEEK always reads from the head even if there were
previous partial reads, the current qio_channel_readv_all_eof may
not work? I can keep things up to qio_channel_readv common
for both, with a flags parameter, but have a separate
qio_channel_readv_peek_all_eof? Does something like that work?

Simplest is probably to just not add 'flags' to the 'all' variants,
just the non-'all' variants.



Sure Daniel, will do that. Thanks.


With regards,
Daniel




Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-16 Thread manish.mishra


On 16/11/22 12:20 am, Daniel P. Berrangé wrote:

On Tue, Nov 15, 2022 at 06:11:30PM +, Daniel P. Berrangé wrote:

On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

Suggested-by: Daniel P. Berrangé
Signed-off-by: manish.mishra

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.
---
  include/io/channel.h | 25 +++
  io/channel-socket.c  | 27 
  io/channel.c | 39 +++
  migration/migration.c| 44 +---
  migration/multifd.c  | 12 ---
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 130 insertions(+), 26 deletions(-)

This should be two commits, because the 'io' and 'migration'
code are two separate subsystems in QEMU.


diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);

This API should be called "io_read_peekv" and use
"const struct iovec *iov", so that it matches the
design of 'io_readv'.

There should also be a QIOChannelFeature flag
registered to indicate whether a given channel
impl supports peeking at data.



@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);

There should be qio_channel_read_peek, qio_channel_read_peekv,
qio_channel_read_peek_all and qio_channel_read_peekv_all.

Actually ignore that.  We should not add any new APIs at
all.  Instead the io_readv callback, and the qio_channel_read*all()
methods should gain an 'int flags' parameter, in the same way that
the write methods have one. Then there should be a
QIO_CHANNEL_READ_FLAG_PEEK constant defined.


Hi Daniel, as MSG_PEEK always reads from the head even if there were previous partial
reads, the current qio_channel_readv_all_eof may not work? I can keep things
up to qio_channel_readv common for both, with a flags parameter, but have a
separate qio_channel_readv_peek_all_eof? Does something like that work?

thanks

Manish Mishra



With regards,
Daniel

Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-15 Thread manish.mishra



On 15/11/22 11:06 pm, Peter Xu wrote:

Hi, Manish,

On Mon, Nov 07, 2022 at 04:51:59PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: manish.mishra 

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.
---
  include/io/channel.h | 25 +++
  io/channel-socket.c  | 27 
  io/channel.c | 39 +++
  migration/migration.c| 44 +---
  migration/multifd.c  | 12 ---
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 130 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
  GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
  /**
   * qio_channel_set_blocking:
   * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,

+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
  
  #ifdef QEMU_MSG_ZEROCOPY

  static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
  
  ioc_klass->io_writev = qio_channel_socket_writev;

  ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_r

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-14 Thread manish.mishra

Thanks Peter

On 14/11/22 10:21 pm, Peter Xu wrote:

Manish,

On Thu, Nov 03, 2022 at 11:47:51PM +0530, manish.mishra wrote:

Yes, but if we try to read early on the main channel with TLS enabled, it is an
issue. Sorry, I may not have put the above comment clearly. I will try to lay out
the scenario step by step.
1. The main channel is created and the TLS handshake is done for the main channel.
2. The destination side tries to read the magic early on the main channel in
migration_ioc_process_incoming, but it has not yet been sent by the source.
3. The source has written the magic to the main channel's file buffer, but it is not yet
flushed; it is flushed for the first time in ram_save_setup. I mean, data is sent on the
channel only when the QEMU file buffer is full or explicitly flushed.
4. The source side blocks on multifd_send_sync_main in ram_save_setup before
flushing the QEMU file. But multifd_send_sync_main is blocked on sem_sync until the
handshake is done for the multiFD channels.
5. The destination side is still waiting to read the magic on the main channel, and
unless we return from migration_ioc_process_incoming we cannot accept a new
channel, so the handshake of the multiFD channels is blocked.
6. So basically the source is blocked on the multiFD channels' handshake before sending
data on the main channel, but the destination is blocked waiting for data before it can
acknowledge the multiFD channels and do the handshake, so it creates a deadlock
situation.

Why is this issue only happening with TLS?  It sounds like it'll happen as
long as multifd is enabled.



Actually this was happening with TLS because with TLS we do a handshake, so a
connection is assumed established only after the TLS handshake, and we flush data
from the source only after all channels are established. But with normal live
migration, even if a connection is not accepted on the destination side we can
continue, as we do not do any handshake. Basically, in normal live migration a
connection is assumed established if the connect() call was successful, even if it
is not accepted/acked by the destination, so that's why this deadlock was not
happening.
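
That a TCP connect() can succeed before the peer ever calls accept(), so the
source side counts the channel as established while it is still pending on the
destination, can be verified with a small standalone test (plain sockets,
independent of QEMU):

#include <arpa/inet.h>
#include <assert.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    socklen_t alen = sizeof(addr);
    int srv = socket(AF_INET, SOCK_STREAM, 0);
    int cli = socket(AF_INET, SOCK_STREAM, 0);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = 0;                          /* let the kernel pick a port */

    assert(bind(srv, (struct sockaddr *)&addr, sizeof(addr)) == 0);
    assert(listen(srv, 8) == 0);
    assert(getsockname(srv, (struct sockaddr *)&addr, &alen) == 0);

    /* No accept() has been called yet, but connect() still completes:
     * the kernel finishes the TCP handshake and queues the connection. */
    assert(connect(cli, (struct sockaddr *)&addr, sizeof(addr)) == 0);
    printf("connect() succeeded before accept() was ever called\n");

    close(cli);
    close(srv);
    return 0;
}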



I'm also thinking about whether we should flush in qemu_savevm_state_header(), so
that at least an upgraded src QEMU will always flush the headers, if that never hurts.


yes sure Peter.



Thanks

Manish Mishra





Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-10 Thread manish.mishra



On 11/11/22 4:17 am, Peter Xu wrote:

On Thu, Nov 10, 2022 at 05:59:45PM +0530, manish.mishra wrote:

Hi Everyone, Just a gentle reminder for review. :)

Hi, Manish,

I've got a slightly busy week, sorry!  If Daniel and Juan won't have time
to look at it I'll have a closer look at it next Monday (holiday tomorrow).


Yes sure Peter, Thank you and Happy Holiday :).







Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-10 Thread manish.mishra

Hi Everyone, Just a gentle reminder for review. :)

Thanks

Manish Mishra

On 07/11/22 10:21 pm, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

Suggested-by: Daniel P. Berrangé
Signed-off-by: manish.mishra

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.
---
  include/io/channel.h | 25 +++
  io/channel-socket.c  | 27 
  io/channel.c | 39 +++
  migration/migration.c| 44 +---
  migration/multifd.c  | 12 ---
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 130 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
  GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
  /**
   * qio_channel_set_blocking:
   * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,

+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
  
  #ifdef QEMU_MSG_ZEROCOPY

  static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
  
  ioc_klass->io_writev = qio_channel_socket_writev;

  ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_r

Re: [PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-08 Thread manish.mishra



On 07/11/22 10:21 pm, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as the source and the first one will
always be the main channel followed by the multifid or post-copy
preemption channel. This may not be always true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first.
Basically causing out of order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number, this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy(multifd) live migration, as mentioned, the
post-copy preempt channel does not send any magic number. Also, tls live
migrations already does tls handshake before creating other channels, so
this issue is not possible with tls, hence this logic is avoided for tls
live migrations. This patch uses MSG_PEEK to check the magic number of
channels so that current data/control stream management remains
un-effected.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: manish.mishra 

v2:
   TLS does not support MSG_PEEK, so V1 was broken for tls live
   migrations. For tls live migration, while initializing main channel
   tls handshake is done before we can create other channels, so this
   issue is not possible for tls live migrations. In V2 added a check
   to avoid checking magic number for tls live migration and fallback
   to older method to decide mapping of channels on destination side.


Hi Daniel, this is what I concluded from the discussion on V1. If this is not what
you expected, please let me know and we can continue the discussion on the V1 thread.

Thanks

Manish Mishra



---
  include/io/channel.h | 25 +++
  io/channel-socket.c  | 27 
  io/channel.c | 39 +++
  migration/migration.c| 44 +---
  migration/multifd.c  | 12 ---
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 130 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
  GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
  /**
   * qio_channel_set_blocking:
   * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,

+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
  
  #ifdef QEMU_MSG_ZEROCOPY

  static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
  
  ioc_klass->io_writev = qio_channel_socket_write

[PATCH v2] migration: check magic value for deciding the mapping of channels

2022-11-07 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the main channel, followed by the multifd or post-copy
preemption channel. This may not always be true: even if a channel has a
connection established on the source side, it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except the post-copy preempt channel send a magic
number; this patch uses that magic number to decide the type of channel. This
logic is applicable only to precopy (multifd) live migration because, as
mentioned, the post-copy preempt channel does not send any magic number. Also,
TLS live migrations already do a TLS handshake before creating the other
channels, so this issue is not possible with TLS; hence this logic is skipped
for TLS live migrations. This patch uses MSG_PEEK to check the magic number of
the channels so that the current data/control stream management remains
unaffected.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: manish.mishra 

v2:
  TLS does not support MSG_PEEK, so V1 was broken for TLS live
  migrations. For TLS live migration, the TLS handshake is done while
  initializing the main channel, before we can create other channels, so this
  issue is not possible for TLS live migrations. V2 adds a check to skip the
  magic-number check for TLS live migrations and falls back to the older
  method of deciding the mapping of channels on the destination side.
---
 include/io/channel.h | 25 +++
 io/channel-socket.c  | 27 
 io/channel.c | 39 +++
 migration/migration.c| 44 +---
 migration/multifd.c  | 12 ---
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 8 files changed, 130 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
 int **fds,
 size_t *nfds,
 Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
 GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
   size_t buflen,
   Error **errp);
 
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
 /**
  * qio_channel_set_blocking:
  * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 }
 #endif /* WIN32 */
 
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
 
 #ifdef QEMU_MSG_ZEROCOPY
 static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
 
 ioc_klass->io_writev = qio_channel_socket_writev;
 ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
 ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
 ioc_klass->io_close = qio_channel_socket_clo

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-03 Thread manish.mishra



On 03/11/22 11:47 pm, manish.mishra wrote:


On 03/11/22 11:27 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 11:06:23PM +0530, manish.mishra wrote:

On 03/11/22 10:57 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote:

On 03/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote:

On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..f4b6f278a9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
  {
  MigrationIncomingState *mis = migration_incoming_get_current();
  Error *local_err = NULL;
-    bool start_migration;
  QEMUFile *f;
+    bool default_channel = true;
+    uint32_t channel_magic = 0;
+    int ret = 0;
-    if (!mis->from_src_file) {
-    /* The first connection (multifd may have multiple) */
+    if (migrate_use_multifd() && !migration_in_postcopy()) {
+    ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+ sizeof(channel_magic), &local_err);
+
+    if (ret != 1) {
+    error_propagate(errp, local_err);
+    return;
+    }

and thus this will fail for TLS channels AFAICT.

Yes, thanks for the quick review, Daniel. You pointed this out earlier too;
sorry I missed it. I will add another check, !migrate_use_tls(), in V2.

But we need this problem fixed with TLS too, so just excluding it
isn't right. IMHO we need to modify the migration code so we can
read the magic earlier, instead of peeking.


With regards,
Daniel

Hi Daniel, I was trying TLS migrations. What I see is that TLS session
creation does a handshake, so if we read ahead in ioc_process_incoming for
the default channel we would block, because the client sends the magic only
after the multiFD channels are set up, which also requires a TLS handshake.

By the time we get to migrate_ioc_process_incoming, the TLS handshake
has already been performed.

migration_channel_process_incoming
   -> migration_ioc_process_incoming

vs

migration_channel_process_incoming
   -> migration_tls_channel_process_incoming
   -> migration_tls_incoming_handshake
 -> migration_channel_process_incoming
 ->  migration_ioc_process_incoming


Yes, sorry, I thought we block on the source side till the handshake is done,
but that is not true. I then checked why that deadlock is happening. This is
where the deadlock is happening:

static int ram_save_setup(QEMUFile *f, void *opaque) {
+
+
      ram_control_before_iterate(f, RAM_CONTROL_SETUP);
      ram_control_after_iterate(f, RAM_CONTROL_SETUP);

      ret =  multifd_send_sync_main(f);
      if (ret < 0) {
      return ret;
      }

      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
      qemu_fflush(f);

      return 0;
}

Now we block in migration_ioc_process_incoming reading the magic value from
the channel, but that value is only sent by the source once this qemu_fflush
is done. Before this qemu_fflush we wait for multifd_send_sync_main, which
requires that the TLS handshake is done for the multiFD channels, as it
blocks on sem_sync, which is posted by multifd_send_thread, which runs only
after the handshake. But on the destination side we are blocked in
migration_ioc_process_incoming() waiting to read something from the default
channel, hence the handshake for the multiFD channels cannot happen. This
looks unresolvable to me, whatever way we try to manipulate the stream,
until we make some changes on the source side.

The TLS handshake is already complete when migration_ioc_process_incoming
is blocking on read.

Regardless of which channel we're talking about, the TLS handshake is
always performed & finished before we try to either send or recv any
data.

Yes Daniel, I agree on that; in this case the TLS handshake is done for the
default channel, so we went into migration_ioc_process_incoming for the
default channel. But if we try to read some data there, we do not get any,
because the data on the default channel has not yet been flushed by the
source. On the source side the data is flushed in the function I pointed to
above, which waits for the multiFD channels to be set up (with handshake)
before flushing the data. I mean, data is sent only when the buffer is full
or explicitly flushed; till then it just sits in the buffer. But the multiFD
handshake cannot complete until we return from migration_ioc_process_incoming
for the default channel, which waits infinitely because nothing has been sent
yet on the channel.

On the source side, if we're in ram_save_setup then the TLS
handshake is already complete for the main channel. In fact
I think the TLS handshake should act as a serialization
point that prevents the entire bug. It should be guaranteed
that the main channel is fully received b

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-03 Thread manish.mishra



On 03/11/22 11:27 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 11:06:23PM +0530, manish.mishra wrote:

On 03/11/22 10:57 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote:

On 03/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote:

On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..f4b6f278a9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
  {
  MigrationIncomingState *mis = migration_incoming_get_current();
  Error *local_err = NULL;
-bool start_migration;
  QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migration_in_postcopy()) {
+ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+sizeof(channel_magic), &local_err);
+
+if (ret != 1) {
+error_propagate(errp, local_err);
+return;
+}

and thus this will fail for TLS channels AFAICT.

Yes, thanks for the quick review, Daniel. You pointed this out earlier too;
sorry I missed it. I will add another check, !migrate_use_tls(), in V2.

But we need this problem fixed with TLS too, so just excluding it
isn't right. IMHO we need to modify the migration code so we can
read the magic earlier, instead of peeking.


With regards,
Daniel

Hi Daniel, I was trying TLS migrations. What I see is that TLS session
creation does a handshake, so if we read ahead in ioc_process_incoming for
the default channel we would block, because the client sends the magic only
after the multiFD channels are set up, which also requires a TLS handshake.

By the time we get to migrate_ioc_process_incoming, the TLS handshake
has already been performed.

migration_channel_process_incoming
   -> migration_ioc_process_incoming

vs

migration_channel_process_incoming
   -> migration_tls_channel_process_incoming
   -> migration_tls_incoming_handshake
 -> migration_channel_process_incoming
 ->  migration_ioc_process_incoming


Yes, sorry, I thought we block on the source side till the handshake is done,
but that is not true. I then checked why that deadlock is happening. This is
where the deadlock is happening:

static int ram_save_setup(QEMUFile *f, void *opaque) {
+
+
      ram_control_before_iterate(f, RAM_CONTROL_SETUP);
      ram_control_after_iterate(f, RAM_CONTROL_SETUP);

      ret =  multifd_send_sync_main(f);
      if (ret < 0) {
      return ret;
      }

      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
      qemu_fflush(f);

      return 0;
}

Now we block in migration_ioc_process_incoming reading the magic value from
the channel, but that value is only sent by the source once this qemu_fflush
is done. Before this qemu_fflush we wait for multifd_send_sync_main, which
requires that the TLS handshake is done for the multiFD channels, as it
blocks on sem_sync, which is posted by multifd_send_thread, which runs only
after the handshake. But on the destination side we are blocked in
migration_ioc_process_incoming() waiting to read something from the default
channel, hence the handshake for the multiFD channels cannot happen. This
looks unresolvable to me, whatever way we try to manipulate the stream,
until we make some changes on the source side.

The TLS handshake is already complete when migration_ioc_process_incoming
is blocking on read.

Regardless of which channel we're talking about, the TLS handshake is
always performed & finished before we try to either send or recv any
data.

Yes Daniel, I agree on that; in this case the TLS handshake is done for the
default channel, so we went into migration_ioc_process_incoming for the
default channel. But if we try to read some data there, we do not get any,
because the data on the default channel has not yet been flushed by the
source. On the source side the data is flushed in the function I pointed to
above, which waits for the multiFD channels to be set up (with handshake)
before flushing the data. I mean, data is sent only when the buffer is full
or explicitly flushed; till then it just sits in the buffer. But the multiFD
handshake cannot complete until we return from migration_ioc_process_incoming
for the default channel, which waits infinitely because nothing has been sent
yet on the channel.

On the source side, if we're in ram_save_setup then the TLS
handshake is already complete for the main channel. In fact
I think the TLS handshake should act as a serialization
point that prevents the entire bug. It should be guaranteed
that the main channel is fully received b

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-03 Thread manish.mishra



On 03/11/22 10:57 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 10:04:54PM +0530, manish.mishra wrote:

On 03/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote:

On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..f4b6f278a9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 Error *local_err = NULL;
-bool start_migration;
 QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migration_in_postcopy()) {
+ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+sizeof(channel_magic), &local_err);
+
+if (ret != 1) {
+error_propagate(errp, local_err);
+return;
+}

and thus this will fail for TLS channels AFAICT.

Yes, thanks for the quick review, Daniel. You pointed this out earlier too;
sorry I missed it. I will add another check, !migrate_use_tls(), in V2.

But we need this problem fixed with TLS too, so just excluding it
isn't right. IMHO we need to modify the migration code so we can
read the magic earlier, instead of peeking.


With regards,
Daniel

Hi Daniel, I was trying TLS migrations. What I see is that TLS session
creation does a handshake, so if we read ahead in ioc_process_incoming for
the default channel we would block, because the client sends the magic only
after the multiFD channels are set up, which also requires a TLS handshake.

By the time we get to migrate_ioc_process_incoming, the TLS handshake
has already been performed.

migration_channel_process_incoming
  -> migration_ioc_process_incoming

vs

migration_channel_process_incoming
  -> migration_tls_channel_process_incoming
  -> migration_tls_incoming_handshake
 -> migration_channel_process_incoming
 ->  migration_ioc_process_incoming


Yes, sorry, I thought we block on the source side till the handshake is done,
but that is not true. I then checked why that deadlock is happening. This is
where the deadlock is happening:

static int ram_save_setup(QEMUFile *f, void *opaque) {
+
+
     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
     ram_control_after_iterate(f, RAM_CONTROL_SETUP);

     ret =  multifd_send_sync_main(f);
     if (ret < 0) {
     return ret;
     }

     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
     qemu_fflush(f);

     return 0;
}

Now we block in migration_ioc_process_incoming reading the magic value from
the channel, but that value is only sent by the source once this qemu_fflush
is done. Before this qemu_fflush we wait for multifd_send_sync_main, which
requires that the TLS handshake is done for the multiFD channels, as it
blocks on sem_sync, which is posted by multifd_send_thread, which runs only
after the handshake. But on the destination side we are blocked in
migration_ioc_process_incoming() waiting to read something from the default
channel, hence the handshake for the multiFD channels cannot happen. This
looks unresolvable to me, whatever way we try to manipulate the stream,
until we make some changes on the source side.

The TLS handshake is already complete when migration_ioc_process_incoming
is blocking on read.

Regardless of which channel we're talking about, the TLS handshake is
always performed & finished before we try to either send or recv any
data.

With regards,
Daniel


Yes Daniel, I agree on that; in this case the TLS handshake is done for the
default channel, so we went into migration_ioc_process_incoming for the
default channel. But if we try to read some data there, we do not get any,
because the data on the default channel has not yet been flushed by the
source. On the source side the data is flushed in the function I pointed to
above, which waits for the multiFD channels to be set up (with handshake)
before flushing the data. I mean, data is sent only when the buffer is full
or explicitly flushed; till then it just sits in the buffer. But the multiFD
handshake cannot complete until we return from migration_ioc_process_incoming
for the default channel, which waits infinitely because nothing has been sent
yet on the channel.

Thanks

Manish Mishra




Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-03 Thread manish.mishra


On 03/11/22 2:59 pm, Daniel P. Berrangé wrote:

On Thu, Nov 03, 2022 at 02:50:25PM +0530, manish.mishra wrote:

On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..f4b6f278a9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -733,31 +733,40 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
-bool start_migration;
QEMUFile *f;
+bool default_channel = true;
+uint32_t channel_magic = 0;
+int ret = 0;
-if (!mis->from_src_file) {
-/* The first connection (multifd may have multiple) */
+if (migrate_use_multifd() && !migration_in_postcopy()) {
+ret = qio_channel_read_peek_all(ioc, (void *)&channel_magic,
+sizeof(channel_magic), &local_err);
+
+if (ret != 1) {
+error_propagate(errp, local_err);
+return;
+}

and thus this will fail for TLS channels AFAICT.

Yes, thanks for the quick review, Daniel. You pointed this out earlier too;
sorry I missed it. I will add another check, !migrate_use_tls(), in V2.

But we need this problem fixed with TLS too, so just excluding it
isn't right. IMHO we need to modify the migration code so we can
read the magic earlier, instead of peeking.


With regards,
Daniel

Hi Daniel, I was trying TLS migrations. What I see is that TLS session
creation does a handshake, so if we read ahead in ioc_process_incoming for
the default channel we would block, because the client sends the magic only
after the multiFD channels are set up, which also requires a TLS handshake.

By the time we get to migrate_ioc_process_incoming, the TLS handshake
has already been performed.

migration_channel_process_incoming
 -> migration_ioc_process_incoming

vs

migration_channel_process_incoming
 -> migration_tls_channel_process_incoming
 -> migration_tls_incoming_handshake
 -> migration_channel_process_incoming
 ->  migration_ioc_process_incoming



Yes, sorry, I thought we block on the source side till the handshake is done,
but that is not true. I then checked why that deadlock is happening. This is
where the deadlock is happening:

static int ram_save_setup(QEMUFile *f, void *opaque) {
+
+
    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    ret =  multifd_send_sync_main(f);
    if (ret < 0) {
    return ret;
    }

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

Now we block in migration_ioc_process_incoming reading the magic value from
the channel, but that value is only sent by the source once this qemu_fflush
is done. Before this qemu_fflush we wait for multifd_send_sync_main, which
requires that the TLS handshake is done for the multiFD channels, as it
blocks on sem_sync, which is posted by multifd_send_thread, which runs only
after the handshake. But on the destination side we are blocked in
migration_ioc_process_incoming() waiting to read something from the default
channel, hence the handshake for the multiFD channels cannot happen. This
looks unresolvable to me, whatever way we try to manipulate the stream,
until we make some changes on the source side.


With regards,
Daniel

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-03 Thread manish.mishra



On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the default channel, followed by the multifd or post-copy
preemption channels. This may not always be true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number; this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy (multifd) live migration since, as mentioned, the
post-copy preempt channel does not send any magic number. Also, this patch
uses MSG_PEEK to check the magic number of channels so that the current
data/control stream management remains unaffected.

Signed-off-by: manish.mishra 
---
   include/io/channel.h | 25 +
   io/channel-socket.c  | 27 +++
   io/channel.c | 39 +++
   migration/migration.c| 33 +
   migration/multifd.c  | 12 
   migration/multifd.h  |  2 +-
   migration/postcopy-ram.c |  5 +
   migration/postcopy-ram.h |  2 +-
   8 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
   int **fds,
   size_t *nfds,
   Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
   int (*io_close)(QIOChannel *ioc,
   Error **errp);
   GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
 size_t buflen,
 Error **errp);
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
   /**
* qio_channel_set_blocking:
* @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
   }
   #endif /* WIN32 */
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
   #ifdef QEMU_MSG_ZEROCOPY
   static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
   ioc_klass->io_writev = qio_channel_socket_writev;
   ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
   ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
   ioc_klass->io_close = qio_channel_socket_close;
   ioc_klass->io_shutdown = qio_channel_socket_shutdown;
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..a2d9b96f3f 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc,
   return qio_channel_writev_all(ioc, &iov, 1, errp);
   }
+int qio_channel_read_peek_all(QIOChannel *ioc,
+ 

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-01 Thread manish.mishra



On 01/11/22 9:15 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 09:10:14PM +0530, manish.mishra wrote:

On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the default channel, followed by the multifd or post-copy
preemption channels. This may not always be true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number; this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy (multifd) live migration since, as mentioned, the
post-copy preempt channel does not send any magic number. Also, this patch
uses MSG_PEEK to check the magic number of channels so that the current
data/control stream management remains unaffected.

Signed-off-by: manish.mishra 
---
   include/io/channel.h | 25 +
   io/channel-socket.c  | 27 +++
   io/channel.c | 39 +++
   migration/migration.c| 33 +
   migration/multifd.c  | 12 
   migration/multifd.h  |  2 +-
   migration/postcopy-ram.c |  5 +
   migration/postcopy-ram.h |  2 +-
   8 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
   int **fds,
   size_t *nfds,
   Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
   int (*io_close)(QIOChannel *ioc,
   Error **errp);
   GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
 size_t buflen,
 Error **errp);
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
   /**
* qio_channel_set_blocking:
* @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
   }
   #endif /* WIN32 */
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
   #ifdef QEMU_MSG_ZEROCOPY
   static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
   ioc_klass->io_writev = qio_channel_socket_writev;
   ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
   ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
   ioc_klass->io_close = qio_channel_socket_close;
   ioc_klass->io_shutdown = qio_channel_socket_shutdown;
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..a2d9b96f3f 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc,
   return qio_channel_writev_all(ioc, &iov, 1, errp);
   }
+int qio_channel_read_peek_all(QIOChannel *ioc,
+ 

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-01 Thread manish.mishra



On 01/11/22 8:21 pm, Daniel P. Berrangé wrote:

On Tue, Nov 01, 2022 at 02:30:29PM +, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the default channel, followed by the multifd or post-copy
preemption channels. This may not always be true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number; this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy (multifd) live migration since, as mentioned, the
post-copy preempt channel does not send any magic number. Also, this patch
uses MSG_PEEK to check the magic number of channels so that the current
data/control stream management remains unaffected.

Signed-off-by: manish.mishra 
---
  include/io/channel.h | 25 +
  io/channel-socket.c  | 27 +++
  io/channel.c | 39 +++
  migration/migration.c| 33 +
  migration/multifd.c  | 12 
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
  GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
  /**
   * qio_channel_set_blocking:
   * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,

+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
  
  #ifdef QEMU_MSG_ZEROCOPY

  static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
  
  ioc_klass->io_writev = qio_channel_socket_writev;

  ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
  ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
  ioc_klass->io_close = qio_channel_socket_close;
  ioc_klass->io_shutdown = qio_channel_socket_shutdown;
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..a2d9b96f3f 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc,
  return qio_channel_writev_all(ioc, &iov, 1, errp);
  }
  
+int qio_channel_read_peek_all(QIOChannel *ioc,

+  void* buf,
+  size_t nbytes,
+  Error

Re: [PATCH] migration: check magic value for deciding the mapping of channels

2022-11-01 Thread manish.mishra

Sorry for the late patch on this. I mentioned I would send it last week
itself, but later realised it was festival week in India, so it was mostly
holidays.

Thanks

Manish Mishra

On 01/11/22 8:00 pm, manish.mishra wrote:

Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the default channel, followed by the multifd or post-copy
preemption channels. This may not always be true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number; this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy (multifd) live migration since, as mentioned, the
post-copy preempt channel does not send any magic number. Also, this patch
uses MSG_PEEK to check the magic number of channels so that the current
data/control stream management remains unaffected.

Signed-off-by: manish.mishra
---
  include/io/channel.h | 25 +
  io/channel-socket.c  | 27 +++
  io/channel.c | 39 +++
  migration/migration.c| 33 +
  migration/multifd.c  | 12 
  migration/multifd.h  |  2 +-
  migration/postcopy-ram.c |  5 +
  migration/postcopy-ram.h |  2 +-
  8 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
  int **fds,
  size_t *nfds,
  Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
  int (*io_close)(QIOChannel *ioc,
  Error **errp);
  GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
size_t buflen,
Error **errp);
  
+/**

+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
  /**
   * qio_channel_set_blocking:
   * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  }
  #endif /* WIN32 */
  
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,

+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
  
  #ifdef QEMU_MSG_ZEROCOPY

  static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
  
  ioc_klass->io_writev = qio_channel_socket_writev;

  ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
  ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
  ioc_klass->io_close = qio_channel_socket_close;
  ioc_klass->io_shutdown = qio_channel_socket_shutdown;
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..a2d9b96f3f 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc,
  return qio_channel_writev_all(ioc, &iov, 1, errp);
  }
  
+int qio_channel_read_peek_all(QIOChannel *ioc,

+ 

[PATCH] migration: check magic value for deciding the mapping of channels

2022-11-01 Thread manish.mishra
Current logic assumes that channel connections on the destination side are
always established in the same order as on the source, and that the first one
will always be the default channel, followed by the multifd or post-copy
preemption channels. This may not always be true, as even if a channel has a
connection established on the source side it can be in the pending state on
the destination side and a newer connection can be established first,
basically causing an out-of-order mapping of channels on the destination side.
Currently, all channels except post-copy preempt send a magic number; this
patch uses that magic number to decide the type of channel. This logic is
applicable only for precopy (multifd) live migration since, as mentioned, the
post-copy preempt channel does not send any magic number. Also, this patch
uses MSG_PEEK to check the magic number of channels so that the current
data/control stream management remains unaffected.

Signed-off-by: manish.mishra 
---
 include/io/channel.h | 25 +
 io/channel-socket.c  | 27 +++
 io/channel.c | 39 +++
 migration/migration.c| 33 +
 migration/multifd.c  | 12 
 migration/multifd.h  |  2 +-
 migration/postcopy-ram.c |  5 +
 migration/postcopy-ram.h |  2 +-
 8 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..74177aeeea 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -115,6 +115,10 @@ struct QIOChannelClass {
 int **fds,
 size_t *nfds,
 Error **errp);
+ssize_t (*io_read_peek)(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp);
 int (*io_close)(QIOChannel *ioc,
 Error **errp);
 GSource * (*io_create_watch)(QIOChannel *ioc,
@@ -475,6 +479,27 @@ int qio_channel_write_all(QIOChannel *ioc,
   size_t buflen,
   Error **errp);
 
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read in data
+ * @nbytes: the number of bytes to read
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read given @nbytes data from peek of channel into
+ * memory region @buf.
+ *
+ * The function will be blocked until read size is
+ * equal to requested size.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *  occurs without data, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp);
+
 /**
  * qio_channel_set_blocking:
  * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..b99f5dfda6 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -705,6 +705,32 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 }
 #endif /* WIN32 */
 
+static ssize_t qio_channel_socket_read_peek(QIOChannel *ioc,
+void *buf,
+size_t nbytes,
+Error **errp)
+{
+QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+ssize_t bytes = 0;
+
+retry:
+bytes = recv(sioc->fd, buf, nbytes, MSG_PEEK);
+
+if (bytes < 0) {
+if (errno == EINTR) {
+goto retry;
+}
+if (errno == EAGAIN) {
+return QIO_CHANNEL_ERR_BLOCK;
+}
+
+error_setg_errno(errp, errno,
+ "Unable to read from peek of socket");
+return -1;
+}
+
+return bytes;
+}
 
 #ifdef QEMU_MSG_ZEROCOPY
 static int qio_channel_socket_flush(QIOChannel *ioc,
@@ -902,6 +928,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
 
 ioc_klass->io_writev = qio_channel_socket_writev;
 ioc_klass->io_readv = qio_channel_socket_readv;
+ioc_klass->io_read_peek = qio_channel_socket_read_peek;
 ioc_klass->io_set_blocking = qio_channel_socket_set_blocking;
 ioc_klass->io_close = qio_channel_socket_close;
 ioc_klass->io_shutdown = qio_channel_socket_shutdown;
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..a2d9b96f3f 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -346,6 +346,45 @@ int qio_channel_write_all(QIOChannel *ioc,
 return qio_channel_writev_all(ioc, &iov, 1, errp);
 }
 
+int qio_channel_read_peek_all(QIOChannel *ioc,
+  void* buf,
+  size_t nbytes,
+  Error **errp)
+{
+   QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+   ssize_t bytes = 0;
+
+   if (!klass->io_read_peek) {
+   error_setg(

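As a rough sketch (an assumption based on the io_read_peek callback and the
return values documented in the header above, not the code from the posted
patch), a generic io/channel.c helper along these lines could sit behind that
declaration:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "io/channel.h"

/* Sketch only: repeatedly peek via the class callback until the requested
 * number of bytes is available, waiting whenever the channel would block. */
static int qio_channel_read_peek_all_sketch(QIOChannel *ioc,
                                            void *buf,
                                            size_t nbytes,
                                            Error **errp)
{
    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
    ssize_t bytes;

    if (!klass->io_read_peek) {
        error_setg(errp, "Channel does not support peeking at data");
        return -1;
    }

    for (;;) {
        bytes = klass->io_read_peek(ioc, buf, nbytes, errp);
        if (bytes == QIO_CHANNEL_ERR_BLOCK) {
            qio_channel_wait(ioc, G_IO_IN);   /* non-blocking channel: wait */
            continue;
        }
        if (bytes < 0) {
            return -1;                        /* errp set by the callback */
        }
        if (bytes == 0) {
            return 0;                         /* EOF before any data */
        }
        if ((size_t)bytes >= nbytes) {
            return 1;                         /* full request is peekable */
        }
        qio_channel_wait(ioc, G_IO_IN);       /* partial data: wait for more */
    }
}
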
Re: MultiFD and default channel out of order mapping on receive side.

2022-10-21 Thread manish.mishra



On 21/10/22 3:37 am, Daniel P. Berrangé wrote:

On Thu, Oct 20, 2022 at 12:32:06PM -0400, Peter Xu wrote:

On Thu, Oct 20, 2022 at 08:14:19PM +0530, manish.mishra wrote:

I had one concern: during recovery we do not send any magic. As of now we
   do not support multifd with postcopy, so it should be fine; we can do
   explicit checking for the non-recovery case. But I remember from some
   discussion that in future there may be support for multiFD with postcopy,
   or multiple postcopy preempt channels too, and then a proper handshake
   will be required? So at some point we want to take that path? For now I
   agree approach 1 will be good; as suggested by Daniel it can be
   backported easily to older QEMUs too.

Yes for the long run I think we should provide a generic solution for all
the channels to be established for migration purpose.

Not to mention that as I replied previously to my original email, the trick
won't easily work with dest QEMU where we need further change to allow qemu
to accept new channels during loading of the VM.

Considering the complexity that it'll take just to resolve the preempt
channel ordering, I think maybe it's cleaner we just look for the long term
goal.

I think we should just ignore the preempt channel. Let's just do the
easy bit and fix the main vs multifd channel mixup, as that's the one
that is definitely actively hitting people today. We can solve that as
a quick win in a way that is easy to backport to existing releases of
QEMU for those affected.

Yes, that works for now, Daniel. I can send a patch as per your earlier
suggestions on this early next week, if that is fine?

Separately from that, let's define a clean slate migration protocol to
solve many of our historic problems and mistakes that can't be dealt
with through retrofitting, not limited to just this ordering mistake.

We had a significant discussion about it at the start of the year
in this thread, which I think we should take forward and write into
a formal protocol spec.

   
https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03655.html

This was a good read for me. :)


With regards,
Daniel


Thanks

Manish Mishra




Re: MultiFD and default channel out of order mapping on receive side.

2022-10-20 Thread manish.mishra


On 19/10/22 2:30 am, Peter Xu wrote:

On Tue, Oct 18, 2022 at 10:51:12AM -0400, Peter Xu wrote:

On Tue, Oct 18, 2022 at 09:18:28AM +0100, Daniel P. Berrangé wrote:

On Mon, Oct 17, 2022 at 05:15:35PM -0400, Peter Xu wrote:

On Mon, Oct 17, 2022 at 12:38:30PM +0100, Daniel P. Berrangé wrote:

On Mon, Oct 17, 2022 at 01:06:00PM +0530, manish.mishra wrote:

Hi Daniel,

I was thinking of some solutions for this, so I wanted to discuss them before
going ahead. Also added Juan and Peter to the loop.

1. Earlier I was thinking: on the destination side, as of now, for the
default and multi-FD channels the first data to be sent is MAGIC_NUMBER and
VERSION, so maybe we can decide the mapping based on that. But that does not
work for the newly added post-copy preempt channel, as it does not send any
MAGIC number. Also, even for multiFD, just the MAGIC number does not tell
which multifd channel number it is, even though as per my thinking that does
not matter. So the MAGIC number should be good for identifying the default
vs multiFD channel?

Yep, you don't need to know more than the MAGIC value.

In migration_io_process_incoming, we need to use MSG_PEEK to look at
the first 4 bytes pending on the wire. If those bytes are 'QEVM' that's
the primary channel, if those bytes are big endian 0x11223344, that's
a multifd channel.  Using MSG_PEEK avoids the need to modify the later
code that actually reads this data.

The challenge is how long to wait with the MSG_PEEK. If we do it
in a blocking mode, it's fine for the main channel and multifd, but
IIUC for the post-copy pre-empt channel we'd be waiting for
something that will never arrive.

Having suggested MSG_PEEK though, this may well not work if the
channel has TLS present. In fact it almost definitely won't work.

To cope with TLS migration_io_process_incoming would need to
actually read the data off the wire, and later methods be
taught to skip reading the magic.

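A minimal sketch of such a MSG_PEEK based check (standalone POSIX C for
illustration; the constant and function names are assumptions, not the
identifiers used in QEMU):

#include <errno.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#define MAIN_STREAM_MAGIC_SKETCH  0x5145564d  /* ASCII "QEVM": main channel */
#define MULTIFD_MAGIC_SKETCH      0x11223344  /* multifd channel */

enum ChannelKind { CHANNEL_MAIN, CHANNEL_MULTIFD, CHANNEL_UNKNOWN };

/* Peek at the first 4 bytes without consuming them, so the existing
 * readers still see the magic when they do the real read. */
static enum ChannelKind peek_channel_kind(int fd)
{
    uint32_t magic;
    ssize_t n;

    do {
        n = recv(fd, &magic, sizeof(magic), MSG_PEEK);
    } while (n < 0 && errno == EINTR);

    if (n != (ssize_t)sizeof(magic)) {
        return CHANNEL_UNKNOWN;               /* error, EOF or short peek */
    }

    switch (ntohl(magic)) {                   /* both magics are big endian */
    case MAIN_STREAM_MAGIC_SKETCH:
        return CHANNEL_MAIN;
    case MULTIFD_MAGIC_SKETCH:
        return CHANNEL_MULTIFD;
    default:
        return CHANNEL_UNKNOWN;
    }
}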

2. For post-copy preempt, maybe we can initiate this channel only after we
have received a request from the remote, e.g. a remote page fault. This to
me looks safest considering the post-copy recovery case too. I cannot think
of any dependency on the post-copy preempt channel which requires it to be
initialised very early. Maybe Peter can confirm this.

I guess that could work

Currently all preempt code still assumes that when postcopy is activated it's
in preempt mode.  IIUC such a change will bring an extra phase of postcopy
with no-preempt before preempt is enabled.  We may need to teach qemu to
understand that if it's needed.

Meanwhile the initial page requests will not be able to benefit from the
new preempt channel too.


3. Another thing we can do is to have a 2-way handshake on every channel
creation with some additional metadata. This to me looks like the cleanest
and most durable approach. I understand that can break migration to/from old
qemu, but then that can come as a migration capability?

The benefit of (1) is that the fix can be deployed for all existing
QEMU releases by backporting it.  (3) will meanwhile need mgmt app
updates to make it work, which is much more work to deploy.

We really should have had a more formal handshake, and I've described
ways to achieve this in the past, but it is quite a lot of work.

I don't know whether (1) is a valid option if there are use cases that it
cannot cover (on either tls or preempt).  The handshake is definitely the
clean approach.

What's the outcome of such wrongly ordered connections?  Will migration
fail immediately and safely?

For multifd, I think it should fail immediately after the connection
established.

For preempt, I'd also expect the same thing because the only wrong order to
happen right now is having the preempt channel to be the migration channel,
then it should also fail immediately on the first qemu_get_byte().

Hopefully that's still not too bad - I mean, if we can fail constantly and
safely (never fail during postcopy), we can always retry and as long as
connections created successfully we can start the migration safely.  But
please correct me if it's not the case.

It should typically fail as the magic bytes are different, which will not
pass validation. The exception being the postcopy pre-empt  channel which
may well cause migration to stall as nothing will be sent initially by
the src.

Hmm right..

Actually if preempt channel is special we can fix it alone.  As both of you
discussed, we can postpone the preempt channel setup, maybe not as late as
when we receive the 1st page request, but:

   (1) For newly established migration, we can postpone preempt channel
   setup (postcopy_preempt_setup, resume=false) to the entrance of
   postcopy_start().

   (2) For a postcopy recovery process, we can postpone preempt channel
   setup (postcopy_preempt_setup, resume=true) to postcopy_do_resume(),
   maybe between qemu_savevm_state_resume_prepare() and the final
   handshake of postcopy_resume_handshake().

Yes Peter

Re: MultiFD and default channel out of order mapping on receive side.

2022-10-17 Thread manish.mishra

Hi Daniel,

I was thinking of some solutions for this, so I wanted to discuss them before
going ahead. Also added Juan and Peter to the loop.

1. Earlier I was thinking: on the destination side, as of now, for the
default and multi-FD channels the first data to be sent is MAGIC_NUMBER and
VERSION, so maybe we can decide the mapping based on that. But that does not
work for the newly added post-copy preempt channel, as it does not send any
MAGIC number. Also, even for multiFD, just the MAGIC number does not tell
which multifd channel number it is, even though as per my thinking that does
not matter. So the MAGIC number should be good for identifying the default
vs multiFD channel?
2. For post-copy preempt, maybe we can initiate this channel only after we
have received a request from the remote, e.g. a remote page fault. This to
me looks safest considering the post-copy recovery case too. I cannot think
of any dependency on the post-copy preempt channel which requires it to be
initialised very early. Maybe Peter can confirm this.
3. Another thing we can do is to have a 2-way handshake on every channel
creation with some additional metadata. This to me looks like the cleanest
and most durable approach. I understand that can break migration to/from old
qemu, but then that can come as a migration capability? (A rough sketch of
such a per-channel header is below.)

Please let me know if any of these works or if you have some other suggestions?
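
For option 3, a per-channel handshake header could hypothetically look like
the struct below (purely illustrative; field names, sizes and values are
assumptions, not part of any posted patch):

#include <stdint.h>

typedef struct {
    uint32_t magic;         /* common magic shared by all migration channels */
    uint32_t version;       /* protocol version, for old<->new compatibility */
    uint8_t  channel_type;  /* e.g. 0 = main, 1 = multifd, 2 = postcopy preempt */
    uint8_t  channel_id;    /* e.g. multifd channel index */
    uint16_t reserved;      /* padding / future use */
} MigChannelHelloSketch;    /* sent big endian and acknowledged by the peer */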

Thanks

Manish Mishra


On 13/10/22 1:45 pm, Daniel P. Berrangé wrote:

On Thu, Oct 13, 2022 at 01:23:40AM +0530, manish.mishra wrote:

Hi Everyone,
Hope everyone is doing great. I have seen some live migration issues with 
qemu-4.2 when using multiFD. The signature of the issue is something like this.
2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd 
channel 0: multifd: received packet magic 5145564d expected 11223344

Basically a default live migration channel packet is received on a multiFD
channel. I see an older patch explaining a potential reason for this behavior.
https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg05920.html

[PATCH 3/3] migration/multifd: fix potential wrong acception order of IO.

But I see this patch was not merged. Looking at the qemu master code, I
could not find any other patch which handles this issue either. So as per my
understanding this is still a potential issue even in qemu master. I mainly
wanted to check why this patch was dropped?

See my replies in that message - it broke compatibility of data on
the wire, meaning old QEMU can't talk to new QEMU and vice-versa.

We need a fix for this issue, but it needs to take into account
wire compatibility.

With regards,
Daniel

Re: MultiFD and default channel out of order mapping on receive side.

2022-10-13 Thread manish.mishra



On 13/10/22 1:45 pm, Daniel P. Berrangé wrote:

On Thu, Oct 13, 2022 at 01:23:40AM +0530, manish.mishra wrote:

Hi Everyone,
Hope everyone is doing great. I have seen some live migration issues with 
qemu-4.2 when using multiFD. The signature of the issue is something like this.
2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd 
channel 0: multifd: received packet magic 5145564d expected 11223344

Basically a default live migration channel packet is received on a multiFD
channel. I see an older patch explaining a potential reason for this behavior.
https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg05920.html

[PATCH 3/3] migration/multifd: fix potential wrong acception order of IO.

But I see this patch was not merged. Looking at the qemu master code, I
could not find any other patch which handles this issue either. So as per my
understanding this is still a potential issue even in qemu master. I mainly
wanted to check why this patch was dropped?

See my replies in that message - it broke compatibility of data on
the wire, meaning old QEMU can't talk to new QEMU and vice-versa.

We need a fix for this issue, but it needs to take into account
wire compatibility.

With regards,
Daniel


OK, got it, thank you so much Daniel. In that case I will try to create a
patch considering backward compatibility and send it for review. Mainly I
wanted to understand if it is handled somehow differently in upstream master,
but manually looking at the code it did not look like that, so I just wanted
to confirm.

Thanks

Manish Mishra





MultiFD and default channel out of order mapping on receive side.

2022-10-12 Thread manish.mishra

Hi Everyone,
Hope everyone is doing great. I have seen some live migration issues with 
qemu-4.2 when using multiFD. The signature of the issue is something like this.
2022-10-01T09:57:53.972864Z qemu-kvm: failed to receive packet via multifd 
channel 0: multifd: received packet magic 5145564d expected 11223344
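
For reference, 0x5145564d in that message is simply the ASCII string "QEVM"
read as a big-endian 32-bit value, i.e. the magic of the main migration
stream, while 0x11223344 is the value a multifd channel expects first.
Illustrative constant names for the two values (not the exact QEMU
identifiers) would be:

#define MAIN_STREAM_MAGIC_SKETCH     0x5145564dU   /* 'Q' 'E' 'V' 'M' */
#define MULTIFD_PACKET_MAGIC_SKETCH  0x11223344U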

Basically a default live migration channel packet is received on a multiFD
channel. I see an older patch explaining a potential reason for this behavior.
https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg05920.html
> [PATCH 3/3] migration/multifd: fix potential wrong acception order of IO.

But I see this patch was not merged. Looking at the qemu master code, I could
not find any other patch which handles this issue either. So as per my
understanding this is still a potential issue even in qemu master. I mainly
wanted to check why this patch was dropped? Sorry if I misunderstood
something. It would be great if someone could provide some pointers on this.

Thanks
Manish Mishra




Re: [PATCH 0/4] Multiple interface support on top of Multi-FD

2022-06-21 Thread manish.mishra



On 16/06/22 9:20 pm, Dr. David Alan Gilbert wrote:

* Daniel P. Berrangé (berra...@redhat.com) wrote:

On Wed, Jun 15, 2022 at 05:43:28PM +0100, Daniel P. Berrangé wrote:

On Fri, Jun 10, 2022 at 05:58:31PM +0530, manish.mishra wrote:

On 09/06/22 9:17 pm, Daniel P. Berrangé wrote:

On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote:

As of now, the multi-FD feature supports connection over the default network
only. This Patchset series is a Qemu side implementation of providing multiple
interfaces support for multi-FD. This enables us to fully utilize dedicated or
multiple NICs in case bonding of NICs is not possible.


Introduction
-
Multi-FD Qemu implementation currently supports connection only on the default
network. This forbids us from advantages like:
- Separating VM live migration traffic from the default network.

Hi Daniel,

I totally understand your concern around this approach increasing complexity
inside qemu, when similar things can be done with NIC teaming. But we thought
this approach provides much more flexibility to the user in a few cases like:

1. We checked our customer data; almost all of the hosts had multiple NICs,
   but LACP support in their setups was very rare. So for those cases this
   approach can help in utilising multiple NICs, as teaming is not possible
   there.

AFAIK, LACP is not required in order to do link aggregation with Linux.
Traditional Linux bonding has no special NIC hardware or switch requirements,
so LACP is merely a "nice to have" in order to simplify some aspects.

IOW, migration with traffic spread across multiple NICs is already
possible AFAICT.

I can understand that some people may not have actually configured
bonding on their hosts, but it is not unreasonable to request that
they do so, if they want to take advantage of aggregated bandwidth.

It has the further benefit that it will be fault tolerant. With
this proposal if any single NIC has a problem, the whole migration
will get stuck. With kernel level bonding, if any single NIC has
a problem, it'll get offlined by the kernel and migration will
continue to work across remaining active NICs.


2. We have seen requests recently to separate out traffic of storage, VM
   network, and migration over different vswitches which can be backed by 1 or
   more NICs, as this gives better predictability and assurance. So a host with
   multiple ips/vswitches can be a very common environment. In this kind of
   environment this approach gives per-VM or per-migration flexibility: for a
   critical VM we can still use bandwidth from all available
   vswitches/interfaces, but for a normal VM we can keep live migration only on
   dedicated NICs without changing the complete host network topology.

   At the end we want it to be something like this [, , ] to provide
   bandwidth_control per interface.

Again, it is already possible to separate migration traffic from storage
traffic, from other network traffic. The target IP given will influence
which NIC is used based on routing table and I know this is already
done widely with OpenStack deployments.

Actually I should clarify this is only practical if the two NICs are
using different IP subnets, otherwise routing rules are not viable.
So needing to set source IP would be needed to select between a pair
of NICs on the same IP subnet.
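As a concrete illustration of the source-address point above, here is a minimal
bind-before-connect sketch using plain POSIX sockets rather than QEMU's
QIOChannel API; the helper name and addresses are placeholders only.

#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Placeholder helper: connect to dst_ip:dst_port while forcing the source
 * address to src_ip.  Binding before connect() is what lets a specific NIC
 * be chosen when both NICs sit on the same IP subnet and routing alone
 * cannot distinguish them.
 */
static int connect_from(const char *src_ip, const char *dst_ip, int dst_port)
{
    struct sockaddr_in src = { .sin_family = AF_INET, .sin_port = 0 };
    struct sockaddr_in dst = { .sin_family = AF_INET,
                               .sin_port = htons(dst_port) };
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0 ||
        inet_pton(AF_INET, src_ip, &src.sin_addr) != 1 ||
        inet_pton(AF_INET, dst_ip, &dst.sin_addr) != 1 ||
        bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0 ||
        connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
        if (fd >= 0) {
            close(fd);
        }
        return -1;
    }
    return fd;
}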

Yeh so I think that's one reason that the idea in this series is OK
(together with the idea for the NUMA stuff) and I suspect there are
other cases as well.

Dave


Yes, David, multiFD per NUMA node seems an interesting idea. I was just curious
how much throughput difference we can experience per multiFD channel with a
local vs remote NIC?

thanks

Manish Mishra


Previous usage I've seen has always setup fully distinct IP subnets
for generic vs storage vs migration network traffic.

With regards,
Daniel
--
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|

Re: [PATCH 4/4] Adding support for multi-FD connections dynamically

2022-06-21 Thread manish.mishra



On 17/06/22 12:17 am, Dr. David Alan Gilbert wrote:

* Het Gala (het.g...@nutanix.com) wrote:

i) Dynamically decide appropriate source and destination ip pairs for the
corresponding multi-FD channel to be connected.

ii) Removed the support for setting the number of multi-fd channels from qmp
 commands. As now all multiFD parameters will be passed via qmp: migrate
 command or incoming flag itself.

We can't do that, because it's part of the API already; what you'll need
to do is check that the number of entries in your list corresponds to
the value set there and error if it's different.

Dave


Thanks for the review, David. Yes, we will make sure in V2 that nothing
existing breaks.

- Manish Mishra


Suggested-by: Manish Mishra 
Signed-off-by: Het Gala 
---
  migration/migration.c | 15 ---
  migration/migration.h |  1 -
  migration/multifd.c   | 42 +-
  migration/socket.c| 42 +-
  migration/socket.h|  4 +++-
  monitor/hmp-cmds.c|  4 
  qapi/migration.json   |  6 --
  7 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 9b0ad732e7..57dd4494b4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1585,9 +1585,6 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
  if (params->has_block_incremental) {
  dest->block_incremental = params->block_incremental;
  }
-if (params->has_multifd_channels) {
-dest->multifd_channels = params->multifd_channels;
-}
  if (params->has_multifd_compression) {
  dest->multifd_compression = params->multifd_compression;
  }
@@ -1702,9 +1699,6 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
  if (params->has_block_incremental) {
  s->parameters.block_incremental = params->block_incremental;
  }
-if (params->has_multifd_channels) {
-s->parameters.multifd_channels = params->multifd_channels;
-}
  if (params->has_multifd_compression) {
  s->parameters.multifd_compression = params->multifd_compression;
  }
@@ -2686,15 +2680,6 @@ bool migrate_pause_before_switchover(void)
  MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
  }
  
-int migrate_multifd_channels(void)

-{
-MigrationState *s;
-
-s = migrate_get_current();
-
-return s->parameters.multifd_channels;
-}
-
  MultiFDCompression migrate_multifd_compression(void)
  {
  MigrationState *s;
diff --git a/migration/migration.h b/migration/migration.h
index fa8717ec9e..9464de8ef7 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -372,7 +372,6 @@ bool migrate_validate_uuid(void);
  bool migrate_auto_converge(void);
  bool migrate_use_multifd(void);
  bool migrate_pause_before_switchover(void);
-int migrate_multifd_channels(void);
  MultiFDCompression migrate_multifd_compression(void);
  int migrate_multifd_zlib_level(void);
  int migrate_multifd_zstd_level(void);
diff --git a/migration/multifd.c b/migration/multifd.c
index 9282ab6aa4..ce017436fb 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -225,7 +225,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error 
**errp)
  return -1;
  }
  
-if (msg.id > migrate_multifd_channels()) {

+if (msg.id > total_multifd_channels()) {
  error_setg(errp, "multifd: received channel version %u "
 "expected %u", msg.version, MULTIFD_VERSION);
  return -1;
@@ -410,8 +410,8 @@ static int multifd_send_pages(QEMUFile *f)
   * using more channels, so ensure it doesn't overflow if the
   * limit is lower now.
   */
-next_channel %= migrate_multifd_channels();
-for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
+next_channel %= total_multifd_channels();
+for (i = next_channel;; i = (i + 1) % total_multifd_channels()) {
  p = &multifd_send_state->params[i];
  
  qemu_mutex_lock(&p->mutex);

@@ -422,7 +422,7 @@ static int multifd_send_pages(QEMUFile *f)
  }
  if (!p->pending_job) {
  p->pending_job++;
-next_channel = (i + 1) % migrate_multifd_channels();
+next_channel = (i + 1) % total_multifd_channels();
  break;
  }
  qemu_mutex_unlock(&p->mutex);
@@ -500,7 +500,7 @@ static void multifd_send_terminate_threads(Error *err)
  return;
  }
  
-for (i = 0; i < migrate_multifd_channels(); i++) {

+for (i = 0; i < total_multifd_channels(); i++) {
  MultiFDSendParams *p = &multifd_send_state->params[i];
  
  qemu_mutex_lock(&p->mutex);

@@ -521,14 +521,14 @@ void multifd_save_cleanup(void)
  return;
  }
  multifd_send_terminate_threads(NULL);
-for (i = 0; i < migrate_multifd_channels(); i++) {
+for (i = 0; i < total_multifd_channels(); i++) {
  MultiFDSendPar

Re: [PATCH 3/4] Establishing connection between any non-default source and destination pair

2022-06-21 Thread manish.mishra

Hi Daniel, David,

Thank you so much for the review on these patches. I am posting this message on
behalf of Het. We wanted to get early feedback, so sorry if the code was not in
the best shape. Het is currently on an internship break so does not have access
to his nutanix mail; he will join in the first week of July and will definitely
post v2 on this by the 2nd week of July.

thanks

Manish Mishra

On 16/06/22 11:09 pm, Daniel P. Berrangé wrote:

On Thu, Jun 09, 2022 at 07:33:04AM +, Het Gala wrote:

i) Binding of the socket to source ip address and port on the non-default
interface has been implemented for multi-FD connection, which was not
necessary earlier because the binding was on the default interface itself.

ii) Created an end to end connection between all multi-FD source and
 destination pairs.

Suggested-by: Manish Mishra
Signed-off-by: Het Gala
---
  chardev/char-socket.c   |  4 +-
  include/io/channel-socket.h | 26 ++-
  include/qemu/sockets.h  |  6 ++-
  io/channel-socket.c | 50 ++--
  migration/socket.c  | 15 +++---
  nbd/client-connection.c |  2 +-
  qemu-nbd.c  |  4 +-
  scsi/pr-manager-helper.c|  1 +
  tests/unit/test-char.c  |  8 ++--
  tests/unit/test-io-channel-socket.c |  4 +-
  tests/unit/test-util-sockets.c  | 16 +++
  ui/input-barrier.c  |  2 +-
  ui/vnc.c|  3 +-
  util/qemu-sockets.c | 71 -
  14 files changed, 135 insertions(+), 77 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index dc4e218eeb..f3725238c5 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -932,7 +932,7 @@ static int tcp_chr_connect_client_sync(Chardev *chr, Error 
**errp)
  QIOChannelSocket *sioc = qio_channel_socket_new();
  tcp_chr_change_state(s, TCP_CHARDEV_STATE_CONNECTING);
  tcp_chr_set_client_ioc_name(chr, sioc);
-if (qio_channel_socket_connect_sync(sioc, s->addr, errp) < 0) {
+if (qio_channel_socket_connect_sync(sioc, s->addr, NULL, errp) < 0) {
  tcp_chr_change_state(s, TCP_CHARDEV_STATE_DISCONNECTED);
  object_unref(OBJECT(sioc));
  return -1;
@@ -1120,7 +1120,7 @@ static void tcp_chr_connect_client_task(QIOTask *task,
  SocketAddress *addr = opaque;
  Error *err = NULL;
  
-qio_channel_socket_connect_sync(ioc, addr, &err);

+qio_channel_socket_connect_sync(ioc, addr, NULL, &err);
  
  qio_task_set_error(task, err);

  }
diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
index 513c428fe4..59d5b1b349 100644
--- a/include/io/channel-socket.h
+++ b/include/io/channel-socket.h
@@ -83,41 +83,45 @@ qio_channel_socket_new_fd(int fd,
  /**
   * qio_channel_socket_connect_sync:
   * @ioc: the socket channel object
- * @addr: the address to connect to
+ * @dst_addr: the destination address to connect to
+ * @src_addr: the source address to be connected
   * @errp: pointer to a NULL-initialized error object
   *
- * Attempt to connect to the address @addr. This method
- * will run in the foreground so the caller will not regain
- * execution control until the connection is established or
+ * Attempt to connect to the address @dst_addr with @src_addr.
+ * This method will run in the foreground so the caller will not
+ * regain execution control until the connection is established or
   * an error occurs.
   */
  int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
-SocketAddress *addr,
+SocketAddress *dst_addr,
+SocketAddress *src_addr,
  Error **errp);
  
  /**

   * qio_channel_socket_connect_async:
   * @ioc: the socket channel object
- * @addr: the address to connect to
+ * @dst_addr: the destination address to connect to
   * @callback: the function to invoke on completion
   * @opaque: user data to pass to @callback
   * @destroy: the function to free @opaque
   * @context: the context to run the async task. If %NULL, the default
   *   context will be used.
+ * @src_addr: the source address to be connected
   *
- * Attempt to connect to the address @addr. This method
- * will run in the background so the caller will regain
+ * Attempt to connect to the address @dst_addr with the @src_addr.
+ * This method will run in the background so the caller will regain
   * execution control immediately. The function @callback
- * will be invoked on completion or failure. The @addr
+ * will be invoked on completion or failure. The @dst_addr
   * parameter will be copied, so may be freed as soon
   * as this function returns without waiting for completion.
   */
  void qio_channel_socket_connect_async(QIOChannelSocket *ioc,
-  SocketAddress *addr,
+

Re: [PATCH 0/4] Multiple interface support on top of Multi-FD

2022-06-16 Thread manish.mishra



On 16/06/22 1:46 pm, Daniel P. Berrangé wrote:

On Wed, Jun 15, 2022 at 08:14:26PM +0100, Dr. David Alan Gilbert wrote:

* Daniel P. Berrangé (berra...@redhat.com) wrote:

On Fri, Jun 10, 2022 at 05:58:31PM +0530, manish.mishra wrote:

On 09/06/22 9:17 pm, Daniel P. Berrangé wrote:

On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote:

As of now, the multi-FD feature supports connection over the default network
only. This Patchset series is a Qemu side implementation of providing multiple
interfaces support for multi-FD. This enables us to fully utilize dedicated or
multiple NICs in case bonding of NICs is not possible.


Introduction
-
Multi-FD Qemu implementation currently supports connection only on the default
network. This forbids us from advantages like:
- Separating VM live migration traffic from the default network.

Hi Daniel,

I totally understand your concern around this approach increasing complexity
inside qemu, when similar things can be done with NIC teaming. But we thought
this approach provides much more flexibility to the user in a few cases like:

1. We checked our customer data; almost all of the hosts had multiple NICs,
   but LACP support in their setups was very rare. So for those cases this
   approach can help in utilising multiple NICs, as teaming is not possible
   there.

AFAIK,  LACP is not required in order to do link aggregation with Linux.
Traditional Linux bonding has no special NIC hardware or switch requirements,
so LACP is merely a "nice to have" in order to simplify some aspects.

IOW, migration with traffic spread across multiple NICs is already
possible AFAICT.

Are we sure that works with multifd?  I've seen a lot of bonding NIC
setups which spread based on a hash of source/destination IP and port
numbers; given that we use the same dest port and IP at the moment what
happens in reality?  That hashing can be quite delicate for high
bandwidth single streams.

The simplest Linux bonding mode does per-packet round-robin across
NICs, so traffic from the collection of multifd connections should
fill up all the NICs in the bond. There are of course other modes
which may be sub-optimal for the reasons you describe. Which mode
to pick depends on the type of service traffic patterns you're
aiming to balance.


My understanding of networking is not good enough, so apologies in advance if
something does not make sense. As per my understanding it is easy to do load
balancing on the sender side because we have full control over where to send a
packet, but it is complicated on the receive side if we do not have LACP-like
support. I see there are some teaming techniques which do load balancing of
incoming traffic by possibly sending different slaves' mac addresses on arp
requests, but that does not work for our use case and may require a complicated
setup for proper usage. Our use case can be something like this: e.g. both
source and destination have two NICs of 10Gbps each and we want to get a
throughput of 20Gbps for live migration.

thanks

Manish Mishra




Multi-interface with Multi-FD
-
Multiple-interface support over basic multi-FD has been implemented in the
patches. Advantages of this implementation are:
- Able to separate live migration traffic from default network interface by
creating multiFD channels on ip addresses of multiple non-default 
interfaces.
- Can optimize the number of multi-FD channels on a particular interface
depending upon the network bandwidth limit on a particular interface.

Manually assigning individual channels to different NICs is a pretty
inefficient way of optimizing traffic. It feels like you could easily get
into a situation where one NIC ends up idle while the other is busy,
especially if the traffic patterns are different. For example with
post-copy there's an extra channel for OOB async page requests, and
it's far from clear that manually picking NICs per channel upfront is
going to work for that. The kernel can continually dynamically balance
load on the fly and so do much better than any static mapping QEMU
tries to apply, especially if there are multiple distinct QEMUs
competing for bandwidth.


Yes, Daniel, the current solution is only for pre-copy. Postcopy multiFD is
not yet supported, but in future we can extend it for postcopy channels too.

I had been thinking about explicit selection of network device for NUMA
use though; ideally I'd like to be able to associate a set of multifd
threads to each NUMA node, and then associate a NIC with that set of
threads; so that the migration happens down the NIC that's on the node
the RAM is on.  On a really good day you'd have one NIC per top level
NUMA node.

Now that's an interesting idea, and not one that can be dealt with
by bonding, since the network layer won't be aware of the NUMA
affinity constraints.


With regards,
Daniel
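A rough sketch of how that NUMA idea could look, assuming libnuma and a
user-supplied mapping from NUMA node to a node-local NIC address; this is
purely illustrative and not an existing QEMU feature.

#include <pthread.h>
#include <stddef.h>
#include <numa.h>               /* libnuma, link with -lnuma */

/* Illustrative per-thread argument: which node this sender serves and the
 * source address of the NIC physically attached to that node. */
struct numa_sender_arg {
    int node;
    const char *local_nic_ip;   /* e.g. "192.168.1.10", supplied by the user */
};

static void *numa_multifd_sender(void *opaque)
{
    struct numa_sender_arg *arg = opaque;

    /* Keep this thread (and the buffers it touches) on the node whose RAM
     * it is responsible for sending. */
    numa_run_on_node(arg->node);

    /*
     * Reuse something like the connect_from() helper sketched earlier so
     * the data leaves through the NIC local to this node, then send only
     * pages that live on arg->node over that socket.
     */
    return NULL;
}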




Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

2022-06-13 Thread manish.mishra



On 13/06/22 8:03 pm, Peter Xu wrote:

On Mon, Jun 13, 2022 at 03:28:34PM +0530, manish.mishra wrote:

On 26/05/22 8:21 am, Jason Wang wrote:

On Wed, May 25, 2022 at 11:56 PM Peter Xu  wrote:

On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:

2. Also this algorithm only control or limits dirty rate by guest
writes. There can be some memory dirtying done by virtio based devices
which is accounted only at qemu level so may not be accounted through
dirty rings so do we have plan for that in future? Those are not issue
for auto-converge as it slows full VM but dirty rate limit only slows
guest writes.


  From the migration point of view, time spent on migrating memory is far
greater than migrating devices emulated by qemu. I think we can do that when
migrating device costs the same magnitude time as migrating memory.

As to auto-converge, it throttle vcpu by kicking it and force it to sleep
periodically. The two seems has no much difference from the perspective of
internal method but the auto-converge is kind of "offensive" when doing
restraint. I'll read the auto-converge implementation code and figure out
the problem you point out.

This seems to be not virtio-specific, but can be applied to any device DMA
writting to guest mem (if not including vfio).  But indeed virtio can be
normally faster.

I'm also curious how fast a device DMA could dirty memories.  This could be
a question to answer to all vcpu-based throttling approaches (including the
quota based approach that was proposed on KVM list).  Maybe for kernel
virtio drivers we can have some easier estimation?

As you said below, it really depends on the speed of the backend.


   My guess is it'll be
much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
could use a large chunk of guest mem.

Probably, for vhost-user backend, it could be ~20Mpps or even higher.

Sorry for the late response on this. We did an experiment with IO on a
virtio-scsi based disk.

Thanks for trying this and sharing it out.


We could see a dirty rate of ~500MBps on my system and most of that was not
tracked as kvm_dirty_log. Also for reference I am attaching the test we used to
avoid tracking in KVM (as attached file).

The number looks sane as it seems to be the sequential bandwidth for a
disk, though I'm not 100% sure it'll work as expected since you mmap()ed
the region with private pages rather than shared, so after you did I'm
wondering whether below will happen (also based on the fact that you mapped
twice the size of guest mem as you mentioned in the comment):

   (1) Swap out will start to trigger after you read a lot of data into the
   mem already, then old-read pages will be swapped out to disk (and
   hopefully the swap device does not reside on the same virtio-scsi
   disk or it'll be even more complicated scenario of mixture IOs..),
   meanwhile when you finish reading a round and start to read from
   offset 0 swap-in will start to happen too.  Swapping can slow down
   things already, and I'm wondering whether the 500MB/s was really
   caused by the swapout rather than backend disk reads.  More below.

   (2) Another attribute of private pages AFAICT is after you read it once
   it does not need to be read again from the virtio-scsi disks.  In
   other words, I'm thinking whether starting from the 2nd iteration
   your program won't trigger any DMA at all but purely torturing the
   swap device.

Maybe changing MAP_PRIVATE to MAP_SHARED can emulate better on what we want
to measure, but I'm also not 100% sure on whether it could be accurate..

Thanks,


Thanks Peter. Yes, agree MAP_SHARED should be used here, sorry I missed that 😁.

Yes, my purpose of taking a file size larger than RAM_SIZE was to cause
frequent page cache flushes and re-population of page-cache pages, not to
trigger swaps. I checked that on my VM I had swapping disabled; maybe
MAP_PRIVATE did not make a difference because it was read-only.

I tested again with MAP_SHARED and it comes to around ~500MBps.

Thanks

Manish Mishra


Thanks


[copy Jason too]

--
Peter Xu


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <time.h>

#define PAGE_SIZE 4096
#define GB (1024 * 1024 * 1024)

int main()
{
 char *buff;
 size_t size;
 struct stat stat;
 // Take a file of size at least double the RAM size to
 // achieve the max dirty rate possible.
 const char * file_name = "file_10_gb";
 int fd;
 size_t i = 0, count = 0;
 struct timespec ts1, ts0;
 double time_diff;

 fd = open(file_name, O_RDONLY);
 if (fd == -1) {
perror("Error opening file");
exit(1);
 }

 fstat (fd, &stat);
 size = stat.st_size;
 printf("File size %ld\n", (long)size);

 buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
 if (buff == MAP_FAILED) {
perror("Mmap Error");

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

2022-06-13 Thread manish.mishra


On 26/05/22 8:21 am, Jason Wang wrote:

On Wed, May 25, 2022 at 11:56 PM Peter Xu  wrote:

On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:

2. Also this algorithm only control or limits dirty rate by guest
writes. There can be some memory dirtying done by virtio based devices
which is accounted only at qemu level so may not be accounted through
dirty rings so do we have plan for that in future? Those are not issue
for auto-converge as it slows full VM but dirty rate limit only slows
guest writes.


 From the migration point of view, time spent on migrating memory is far
greater than migrating devices emulated by qemu. I think we can do that when
migrating device costs the same magnitude time as migrating memory.

As to auto-converge, it throttle vcpu by kicking it and force it to sleep
periodically. The two seems has no much difference from the perspective of
internal method but the auto-converge is kind of "offensive" when doing
restraint. I'll read the auto-converge implementation code and figure out
the problem you point out.

This seems to be not virtio-specific, but can be applied to any device DMA
writting to guest mem (if not including vfio).  But indeed virtio can be
normally faster.

I'm also curious how fast a device DMA could dirty memories.  This could be
a question to answer to all vcpu-based throttling approaches (including the
quota based approach that was proposed on KVM list).  Maybe for kernel
virtio drivers we can have some easier estimation?

As you said below, it really depends on the speed of the backend.


  My guess is it'll be
much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
could use a large chunk of guest mem.

Probably, for vhost-user backend, it could be ~20Mpps or even higher.


Sorry for the late response on this. We did an experiment with IO on a
virtio-scsi based disk.

We could see a dirty rate of ~500MBps on my system and most of that was not
tracked as kvm_dirty_log. Also for reference I am attaching the test we used to
avoid tracking in KVM (as attached file).



Thanks


[copy Jason too]

--
Peter Xu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <time.h>

#define PAGE_SIZE 4096
#define GB (1024 * 1024 * 1024)

int main()
{
char *buff;
size_t size;
struct stat stat;
// Take a file of size at least double the RAM size to
// achieve the max dirty rate possible.
const char * file_name = "file_10_gb";
int fd;
size_t i = 0, count = 0;
struct timespec ts1, ts0;
double time_diff;

fd = open(file_name, O_RDONLY);
if (fd == -1) {
   perror("Error opening file");
   exit(1);
}

fstat (fd, &stat);
size = stat.st_size;
printf("File size %ld\n", (long)size);

buff = (char *)mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (buff == MAP_FAILED) {
   perror("Mmap Error");
   exit(1);
}

(void)clock_gettime(CLOCK_MONOTONIC, &ts0);

while(1) {
   char c;

   i = (i + PAGE_SIZE) % size;
   c = buff[i];
   count++;
   // Check the rate after every 10K pages.
   if (count % 10000 == 0) {
  (void)clock_gettime(CLOCK_MONOTONIC, &ts1);
  time_diff = ((double)ts1.tv_sec + ts1.tv_nsec * 1.0e-9) -
  ((double)ts0.tv_sec + ts0.tv_nsec * 1.0e-9);
  printf("Expected Dirty rate %f\n",
 (10000.0 * PAGE_SIZE) / GB / time_diff);
  ts0 = ts1;
   }
}

close(fd);
return 0;
}


Re: [PATCH 0/4] Multiple interface support on top of Multi-FD

2022-06-10 Thread manish.mishra



On 09/06/22 9:17 pm, Daniel P. Berrangé wrote:

On Thu, Jun 09, 2022 at 07:33:01AM +, Het Gala wrote:

As of now, the multi-FD feature supports connection over the default network
only. This Patchset series is a Qemu side implementation of providing multiple
interfaces support for multi-FD. This enables us to fully utilize dedicated or
multiple NICs in case bonding of NICs is not possible.


Introduction
-
Multi-FD Qemu implementation currently supports connection only on the default
network. This forbids us from advantages like:
- Separating VM live migration traffic from the default network.


Hi Daniel,

I totally understand your concern around this approach increasing complexity
inside qemu, when similar things can be done with NIC teaming. But we thought
this approach provides much more flexibility to the user in a few cases like:

1. We checked our customer data; almost all of the hosts had multiple NICs,
   but LACP support in their setups was very rare. So for those cases this
   approach can help in utilising multiple NICs, as teaming is not possible
   there.

2. We have seen requests recently to separate out traffic of storage, VM
   network, and migration over different vswitches which can be backed by 1 or
   more NICs, as this gives better predictability and assurance. So a host with
   multiple ips/vswitches can be a very common environment. In this kind of
   environment this approach gives per-VM or per-migration flexibility: for a
   critical VM we can still use bandwidth from all available
   vswitches/interfaces, but for a normal VM we can keep live migration only on
   dedicated NICs without changing the complete host network topology.

   At the end we want it to be something like this [, , ] to provide
   bandwidth_control per interface.

3. Dedicated NIC we mentioned as a use case; agree with you it can be done
   without this approach too.


Perhaps I'm mis-understanding your intent here, but AFAIK it
has been possible to separate VM migration traffic from general
host network traffic essentially forever.

If you have two NICs with IP addresses on different subnets,
then the kernel will pick which NIC to use automatically
based on the IP address of the target matching the kernel
routing table entries.

Management apps have long used this ability in order to
control which NIC migration traffic flows over.


- Fully utilize all NICs’ capacity in cases where creating a LACP bond (Link
   Aggregation Control Protocol) is not supported.

Can you elaborate on scenarios in which it is impossible to use LACP
bonding at the kernel level ?

Yes, as mentioned above LACP support was rare in customer setups.

Multi-interface with Multi-FD
-
Multiple-interface support over basic multi-FD has been implemented in the
patches. Advantages of this implementation are:
- Able to separate live migration traffic from default network interface by
   creating multiFD channels on ip addresses of multiple non-default interfaces.
- Can optimize the number of multi-FD channels on a particular interface
   depending upon the network bandwidth limit on a particular interface.

Manually assigning individual channels to different NICs is a pretty
inefficient way of optimizing traffic. It feels like you could easily get
into a situation where one NIC ends up idle while the other is busy,
especially if the traffic patterns are different. For example with
post-copy there's an extra channel for OOB async page requests, and
it's far from clear that manually picking NICs per channel upfront is
going to work for that. The kernel can continually dynamically balance
load on the fly and so do much better than any static mapping QEMU
tries to apply, especially if there are multiple distinct QEMUs
competing for bandwidth.


Yes, Daniel, the current solution is only for pre-copy. Postcopy multiFD is
not yet supported, but in future we can extend it for postcopy channels too.


Implementation
--

Earlier the 'migrate' qmp command:
{ "execute": "migrate", "arguments": { "uri": "tcp:0:4446" } }

Modified qmp command:
{ "execute": "migrate",
  "arguments": { "uri": "tcp:0:4446", "multi-fd-uri-list": [ {
  "source-uri": "tcp::6900", "destination-uri": "tcp:0:4480",
  "multifd-channels": 4}, { "source-uri": "tcp:10.0.0.0: ",
  "destination-uri": "tcp:11.0.0.0:7789",
  "multifd-channels": 5} ] } }
--

Earlier the 'migrate-incoming' qmp command:
{ "execute": "migrate-incoming", "arguments": { "uri": "tcp::4446" } }

Modified 'migrate-incoming' qmp command:
{ "execute": "migrate-incoming",
 "arguments": {"uri": "tcp::6789",
 "multi-fd-uri-list" : [ {"destination-uri" : "tcp::6900",
 "multifd-channels": 4}, {"destination-uri" : "tcp:11.0.0.0:7789",
 "multifd-channels": 5} ] } }

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

2022-06-01 Thread manish.mishra



On 26/05/22 8:21 am, Jason Wang wrote:

On Wed, May 25, 2022 at 11:56 PM Peter Xu  wrote:

On Wed, May 25, 2022 at 11:38:26PM +0800, Hyman Huang wrote:

2. Also this algorithm only control or limits dirty rate by guest
writes. There can be some memory dirtying done by virtio based devices
which is accounted only at qemu level so may not be accounted through
dirty rings so do we have plan for that in future? Those are not issue
for auto-converge as it slows full VM but dirty rate limit only slows
guest writes.


 From the migration point of view, time spent on migrating memory is far
greater than migrating devices emulated by qemu. I think we can do that when
migrating device costs the same magnitude time as migrating memory.

As to auto-converge, it throttle vcpu by kicking it and force it to sleep
periodically. The two seems has no much difference from the perspective of
internal method but the auto-converge is kind of "offensive" when doing
restraint. I'll read the auto-converge implementation code and figure out
the problem you point out.

This seems to be not virtio-specific, but can be applied to any device DMA
writting to guest mem (if not including vfio).  But indeed virtio can be
normally faster.

I'm also curious how fast a device DMA could dirty memories.  This could be
a question to answer to all vcpu-based throttling approaches (including the
quota based approach that was proposed on KVM list).  Maybe for kernel
virtio drivers we can have some easier estimation?

As you said below, it really depends on the speed of the backend.


  My guess is it'll be
much harder for DPDK-in-guest (aka userspace drivers) because IIUC that
could use a large chunk of guest mem.

Probably, for vhost-user backend, it could be ~20Mpps or even higher.

Thanks


[copy Jason too]


I will try to get some numbers on this by next week. Jason, I just wanted to
get more context on why it should be ~20Mpps; can it be as much as the
throughput limit of the storage/network in the worst case?

Also we were internally discussing keeping this kind of throttling not as an
alternative to auto-converge but somehow running orthogonal to auto-converge
with some modifications. In cases where most of the dirtying is by guest
writes, auto-converge anyway will not be active, as it decides to throttle
based on the ratio of dirty to 2*transferred, which, if forced correctly by
e.g. the dirty rate limit, will be ~1 (a simplified sketch of that trigger
condition follows below). This is the easiest approach we could think of for a
start but it can definitely be improved in future. Maybe something similar can
be done for this dirty limit approach too?

Surely not for this patch series but in future.
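For reference, a simplified sketch of the auto-converge trigger condition
referred to above; the real logic in migration/ram.c has more terms (throttle
percentages, sync counts), so treat this only as an illustration of the
dirty-versus-transferred ratio.

#include <stdint.h>
#include <stdbool.h>

/*
 * Simplified: auto-converge starts (or increases) CPU throttling only when
 * the bytes dirtied in the last period exceed half of the bytes transferred
 * in that period.  If a dirty-rate limit already keeps dirtying at or below
 * that level, this condition rarely fires.
 */
static bool autoconverge_should_throttle(uint64_t bytes_dirty_period,
                                         uint64_t bytes_xfer_period)
{
    return bytes_dirty_period > bytes_xfer_period / 2;
}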

thanks

Manish Mishra



--
Peter Xu





Re: [PATCH v6 10/13] migration: Respect postcopy request order in preemption mode

2022-05-23 Thread manish.mishra



On 23/05/22 4:26 pm, Dr. David Alan Gilbert wrote:

* Peter Xu (pet...@redhat.com) wrote:

With preemption mode on, when we see a postcopy request that was requesting
for exactly the page that we have preempted before (so we've partially sent
the page already via PRECOPY channel and it got preempted by another
postcopy request), currently we drop the request so that after all the
other postcopy requests are serviced then we'll go back to precopy stream
and start to handle that.

We dropped the request because we can't send it via postcopy channel since
the precopy channel already contains partial of the data, and we can only
send a huge page via one channel as a whole.  We can't split a huge page
into two channels.

That's a very corner case and that works, but there's a change on the order
of postcopy requests that we handle since we're postponing this (unlucky)
postcopy request to be later than the other queued postcopy requests.  The
problem is there's a possibility that when the guest was very busy, the
postcopy queue can be always non-empty, it means this dropped request will
never be handled until the end of postcopy migration. So, there's a chance
that there's one dest QEMU vcpu thread waiting for a page fault for an
extremely long time just because it's unluckily accessing the specific page
that was preempted before.

The worst case time it needs can be as long as the whole postcopy migration
procedure.  It's extremely unlikely to happen, but when it happens it's not
good.

The root cause of this problem is because we treat pss->postcopy_requested
variable as with two meanings bound together, as the variable shows:

   1. Whether this page request is urgent, and,
   2. Which channel we should use for this page request.

With the old code, when we set postcopy_requested it means either both (1)
and (2) are true, or both (1) and (2) are false.  We can never have (1)
and (2) to have different values.

However it doesn't necessarily need to be like that.  It's very legal that
there's one request that has (1) very high urgency, but (2) we'd like to
use the precopy channel.  Just like the corner case we were discussing
above.

To differentiate the two meanings better, introduce a new field called
postcopy_target_channel, showing which channel we should use for this page
request, so as to cover the old meaning (2) only.  Then we leave the
postcopy_requested variable to stand only for meaning (1), which is the
urgency of this page request.

With this change, we can easily boost priority of a preempted precopy page
as long as we know that page is also requested as a postcopy page.  So with
the new approach in get_queued_page() instead of dropping that request, we
send it right away with the precopy channel so we get back the ordering of
the page faults just like how they're requested on dest.

Alongside, I touched up find_dirty_block() to only set the postcopy fields
in the pss section if we're going through a postcopy migration.  That's a
very light optimization and shouldn't affect much.

Reported-by: manish.mis...@nutanix.com
Signed-off-by: Peter Xu 

So I think this is OK; getting a bit complicated!

Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: Manish Mishra 


---
  migration/ram.c | 69 +++--
  1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 9d76db8491..fdcd9984fa 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -441,8 +441,28 @@ struct PageSearchStatus {
  unsigned long page;
  /* Set once we wrap around */
  bool complete_round;
-/* Whether current page is explicitly requested by postcopy */
+/*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy.  When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
  bool postcopy_requested;
+/*
+ * [POSTCOPY-ONLY] The target channel to use to send current page.
+ *
+ * Note: This may _not_ match with the value in postcopy_requested
+ * above. Let's imagine the case where the postcopy request is exactly
+ * the page that we're sending in progress during precopy. In this case
+ * we'll have postcopy_requested set to true but the target channel
+ * will be the precopy channel (so that we don't split brain on that
+ * specific page since the precopy channel already contains partial of
+ * that page data).
+ *
+ * Besides that specific use case, postcopy_target_channel should
+ * always be equal to postcopy_requested, because by default we send
+ * postcopy pages via postcopy preempt channel.
+ */
+bool postcopy_target_channel;
  };
  typedef struct PageSearchStatus PageSearchStatus;
  
@@ -496,6 +516,9 @@ static QemuCond decomp_done_cond;

  static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock 
*block,
   ram_add

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

2022-05-23 Thread manish.mishra



On 17/05/22 1:49 pm, Hyman Huang wrote:

Thanks Manish for the comment, I'll give my explanation and any supplements are
welcome.

Really sorry for such a late reply Hyman, this slipped my mind.


在 2022/5/17 1:13, manish.mishra 写道:

Hi Hyman Huang,

I had few doubts regarding this patch series.

For the first point, I'm guessing that you want to figure out how we should set
the vcpu dirty limit correctly during live migration to make it convergent.

This can be achieved by setting a single dirtylimit value on all vcpus; the
value need not be precisely equivalent to half of the available bandwidth,
since the dirtylimit is a sufficient condition for migration success, but not a
necessary condition.

We can set the dirtylimit to the minimum of what the user can tolerate; in most
cases, migration can achieve convergence in advance and do the switchover with
the real dirty rate greater than the dirtylimit. This can be implemented
because QEMU will check the criteria every iteration; once it meets the
condition, QEMU will do the switchover no matter what the convergence algorithm
is.



Yes, got it Hyman. My question was in the direction that if we control the
dirty rate per vcpu, the total dirty rate of the VM becomes very unpredictable.
For example if we have set a dirty rate limit of 50MBps per vcpu for a 10-vcpu
VM, then the total dirty rate of the VM can be anywhere from 0-500MBps based on
how many vcpus are active and how much they write. So if we had dirty rate
limit control per VM it would have been much more predictable for the user. I
mean we can keep account of the total dirty rate of the VM and the individual
dirty rates and then assign throttle_sleep according to their weights to keep
the total dirty rate within the per-VM dirty rate limit (a rough sketch of this
weighting follows below). But definitely it can be targeted in future and
should not be a blocker for now.
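A rough sketch of that weighting idea, purely illustrative (none of these names
exist in QEMU): split a per-VM dirty-rate budget across vcpus in proportion to
each vcpu's observed dirty rate, so idle vcpus do not consume quota.

#include <stdint.h>

/*
 * Hypothetical: given a per-VM dirty-rate budget and the dirty rate recently
 * observed on each vcpu, compute a per-vcpu limit proportional to usage.
 */
static void distribute_vm_dirty_budget(uint64_t vm_limit_mbps,
                                       const uint64_t *observed_mbps,
                                       uint64_t *per_vcpu_limit_mbps,
                                       int nr_vcpus)
{
    uint64_t total = 0;

    for (int i = 0; i < nr_vcpus; i++) {
        total += observed_mbps[i];
    }
    for (int i = 0; i < nr_vcpus; i++) {
        per_vcpu_limit_mbps[i] = total ?
            vm_limit_mbps * observed_mbps[i] / total :
            vm_limit_mbps / nr_vcpus;   /* nothing dirtying yet: split evenly */
    }
}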



1. Why did we choose a dirty rate limit per vcpu? I mean it becomes very hard
   for the user to decide a per-vcpu dirty rate limit. For e.g. we have a 1Gbps
   network and a 10 vcpu VM. Now if someone wants to keep the criteria for
   convergence as "total dirty rate of the VM should be less than half of the
   available bandwidth", then to ensure convergence the user has to give a
   per-vcpu dirty rate limit of 1Gbps / 2 / 10 = 50Mbps. But assume then that
   the VM has only 1 thread which is actively dirtying memory; in that case so
   much of the available quota will be wasted.

This is a good and frequent question about dirtylimit. As mentioned above,
throttling occurs only when the dirty ring is full and the vcpu exits to user
space. A vcpu is set up with a dirtylimit during live migration, but if it does
not dirty memory, it may never get throttled. The dirtylimit only throttles
those vcpus which dirty memory with a dirty rate greater than the dirtylimit.

   So would it not be better to use dirty rate limit control per VM instead of
   per vcpu?

2. Also here we are adaptively trying to adjust the sleep time based on the
   currently observed dirty rate and the dirty rate limit. Can it be more
   forceful? Assume we have a dirty rate limit of 10 pages per sec and
   auto-converge / dirty rate limit was triggered at time 0. Now at any point
   of time, say at time 10 sec, if the number of pages dirtied is more than 100
   pages, we sleep for an interpolated amount of time. Basically at every dirty
   ring exit we can check whether the current number of pages dirtied is more
   than what should be allowed by this time?

Yes, indeed, but as mentioned above, if the dirty ring exits, it gives QEMU a
hint that the vcpu is dirtying memory, so we should check it.

I posted the series for the dirtylimit capability as an RFC; maybe it can help
me explain the usage of the vcpu dirty limit. It can be found here:
https://lore.kernel.org/qemu-devel/cover.1652762652.git.huangy81@chinatelecom.cn/
Thanks,
Yong

Thanks, I read this.

Also I had a few additional things in mind.

1. I see there is no limit on cpu->throttle_us_per_full. I see the line below,
but ring_full_time_us can be a very high value, so in some rare cases
cpu->throttle_us_per_full can become very high. I know a few database
applications which cannot tolerate a continuous sleep of more than 2 secs. I
agree the user should not configure a very low dirty rate limit to avoid such a
situation, but then the user may not have enough idea of the algorithm, so
better we keep our own internal limits?

cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
                                ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);

2. Also this algorithm only controls or limits the dirty rate from guest
writes. There can be some memory dirtying done by virtio based devices which is
accounted only at the qemu level, so it may not be accounted through dirty
rings; do we have a plan for that in future? Those are not an issue for
auto-converge as it slows the full VM, but the dirty rate limit only slows
guest writes.

Re: [PATCH v17 6/8] softmmu/dirtylimit: Implement virtual CPU throttle

2022-05-16 Thread manish.mishra

Hi Hyman Huang,

I had a few doubts regarding this patch series.

1. Why did we choose a dirty rate limit per vcpu? I mean it becomes very hard
   for the user to decide a per-vcpu dirty rate limit. For e.g. we have a 1Gbps
   network and a 10 vcpu VM. Now if someone wants to keep the criteria for
   convergence as "total dirty rate of the VM should be less than half of the
   available bandwidth", then to ensure convergence the user has to give a
   per-vcpu dirty rate limit of 1Gbps / 2 / 10 = 50Mbps. But assume then that
   the VM has only 1 thread which is actively dirtying memory; in that case so
   much of the available quota will be wasted. So would it not be better to use
   dirty rate limit control per VM instead of per vcpu?

2. Also here we are adaptively trying to adjust the sleep time based on the
   currently observed dirty rate and the dirty rate limit. Can it be more
   forceful? Assume we have a dirty rate limit of 10 pages per sec and
   auto-converge / dirty rate limit was triggered at time 0. Now at any point
   of time, say at time 10 sec, if the number of pages dirtied is more than 100
   pages, we sleep for an interpolated amount of time. Basically at every dirty
   ring exit we can check whether the current number of pages dirtied is more
   than what should be allowed by this time? (A rough sketch of this idea is
   below.)
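A rough sketch of that cumulative-quota idea, with made-up names (this is not
part of the dirtylimit series): on every dirty-ring exit, compare the pages
dirtied since throttling started against the budget allowed by the elapsed
time, and sleep off any excess.

#include <stdint.h>
#include <time.h>
#include <unistd.h>

struct dirty_quota {
    uint64_t limit_pages_per_sec;   /* configured dirty-rate limit        */
    uint64_t pages_dirtied;         /* pages dirtied since t_start_ns     */
    uint64_t t_start_ns;            /* when throttling was triggered      */
};

static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Hypothetical hook called on every dirty-ring exit. */
static void dirty_quota_throttle(struct dirty_quota *q, uint64_t new_pages)
{
    q->pages_dirtied += new_pages;

    uint64_t elapsed_ns = now_ns() - q->t_start_ns;
    uint64_t allowed = q->limit_pages_per_sec * elapsed_ns / 1000000000ull;

    if (q->pages_dirtied > allowed) {
        /* Sleep for the interpolated time the excess pages should have
         * taken at the configured rate. */
        uint64_t excess = q->pages_dirtied - allowed;
        usleep(excess * 1000000ull / q->limit_pages_per_sec);
    }
}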

thanks

Manish Mishra

On 02/03/22 11:25 pm, huang...@chinatelecom.cn wrote:

From: Hyman Huang(黄勇)

Setup a negative feedback system when vCPU thread
handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
throttle_us_per_full field in struct CPUState. Sleep
throttle_us_per_full microseconds to throttle vCPU
if dirtylimit is in service.

Signed-off-by: Hyman Huang(黄勇)
Reviewed-by: Peter Xu
---
  accel/kvm/kvm-all.c |  19 ++-
  include/hw/core/cpu.h   |   6 +
  include/sysemu/dirtylimit.h |  15 +++
  softmmu/dirtylimit.c| 291 
  softmmu/trace-events|   7 ++
  5 files changed, 337 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 8821d80..98e43e6 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
  #include "qemu/guest-random.h"
  #include "sysemu/hw_accel.h"
  #include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
  
  #include "hw/boards.h"
  
@@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)

  cpu->kvm_state = s;
  cpu->vcpu_dirty = true;
  cpu->dirty_pages = 0;
+cpu->throttle_us_per_full = 0;
  
  mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);

  if (mmap_size < 0) {
@@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
   */
  sleep(1);
  
+/* keep sleeping so that dirtylimit not be interfered by reaper */

+if (dirtylimit_in_service()) {
+continue;
+}
+
  trace_kvm_dirty_ring_reaper("wakeup");
  r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
  
@@ -2964,8 +2971,18 @@ int kvm_cpu_exec(CPUState *cpu)

   */
  trace_kvm_dirty_ring_full(cpu->cpu_index);
  qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state, NULL);
+/* We throttle vCPU by making it sleep once it exit from kernel
+ * due to dirty ring full. In the dirtylimit scenario, reaping
+ * all vCPUs after a single vCPU dirty ring get full result in
+ * the miss of sleep, so just reap the ring-fulled vCPU.
+ */
+if (dirtylimit_in_service()) {
+kvm_dirty_ring_reap(kvm_state, cpu);
+} else {
+kvm_dirty_ring_reap(kvm_state, NULL);
+}
  qemu_mutex_unlock_iothread();
+dirtylimit_vcpu_execute(cpu);
  ret = 0;
  break;
  case KVM_EXIT_SYSTEM_EVENT:
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 76ab3b8..dbeb31a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@ struct CPUState {
   */
  bool throttle_thread_scheduled;
  
+/*

+ * Sleep throttle_us_per_full microseconds once dirty ring is full
+ * if dirty page rate limit is enabled.
+ */
+int64_t throttle_us_per_full;
+
  bool ignore_memory_transaction_failures;
  
  /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */

diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f0..8d2c1f3 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
  void vcpu_dirty_rate_stat_stop(void);
  void vcpu_dirty_rate_stat_initialize(void);
  void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_process(void);
+void dirtyl

Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled

2022-05-16 Thread manish.mishra


On 16/05/22 8:21 pm, manish.mishra wrote:



On 16/05/22 7:41 pm, Peter Xu wrote:

Hi, Manish,

On Mon, May 16, 2022 at 07:01:35PM +0530, manish.mishra wrote:

On 26/04/22 5:08 am, Peter Xu wrote:
LGTM.
Peter, I wanted to give a review-tag for this and the earlier patch too. I am
new to the qemu review process so I am not sure how to give a review-tag; I did
not find any reference on google either. So please let me know how to do it.

It's here:

https://git.qemu.org/?p=qemu.git;a=blob;f=docs/devel/submitting-a-patch.rst;hb=HEAD#l492


Since afaict QEMU is mostly following what Linux does, you can also
reference to this one with more context:

https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html#using-reported-by-tested-by-reviewed-by-suggested-by-and-fixes


But since you're still having question regarding this patch, no rush on
providing your R-bs; let's finish the discussion first.

[...]


+static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
+{
+trace_postcopy_pause_fast_load();
+qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);

I may have misunderstood the synchronisation here, but with a very very rare
chance, as both threads are working independently, is it possible that
qemu_sem_post on postcopy_pause_sem_fast_load is called by the main thread even
before we go to qemu_sem_wait in the next line, causing some kind of deadlock?
That should not be possible, as I understand it requires manually calling
qmp_migration_recover, so the chances are almost impossible. Just wanted to
confirm it.

Sorry I don't quite get the question, could you elaborate?  E.g., when the
described deadlock happened, what is both main thread and preempt load
thread doing?  What are they waiting at?

Note: we have already released mutex before waiting on sem.


What I meant here is that the deadlock could be due to the reason that we wait
forever on qemu_sem_wait(&mis->postcopy_pause_sem_fast_load), because the main
thread already called post on postcopy_pause_sem_fast_load after recovery even
before we moved to qemu_sem_wait(&mis->postcopy_pause_sem_fast_load) in the
next line. Basically, if we miss a post on postcopy_pause_sem_fast_load.

This is a nearly impossible case because it requires the full recovery path to
be completed before this thread executes just the next line. Also, as recovery
needs to be called manually, please ignore this.

Basically I wanted to check if we should use something like

int pthread_cond_wait(pthread_cond_t *restrict cond,
                      pthread_mutex_t *restrict mutex);

so that there is no race between releasing the mutex and calling wait.

Really sorry, please ignore this: it is sem_post and sem_wait, so that race is
not possible.
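For completeness, the reason the retraction holds: a semaphore keeps a count,
so a post that lands before the wait is not lost, whereas a bare
condition-variable signal would be; with a condition variable you need a
predicate checked under the mutex, which is exactly what pthread_cond_wait's
mutex argument is for. A minimal sketch of the two patterns:

#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>

/* Semaphore: even if the "recovered" post fires before the waiter reaches
 * sem_wait(), the count is 1 and sem_wait() returns immediately. */
sem_t pause_sem;                       /* sem_init(&pause_sem, 0, 0) */

void waiter_sem(void)  { sem_wait(&pause_sem); }
void poster_sem(void)  { sem_post(&pause_sem); }

/* Condition variable: a signal sent before the wait is lost, so a shared
 * predicate checked under the mutex is required to avoid the race. */
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
bool recovered;

void waiter_cond(void)
{
    pthread_mutex_lock(&lock);
    while (!recovered) {               /* predicate protects against the race */
        pthread_cond_wait(&cond, &lock);
    }
    pthread_mutex_unlock(&lock);
}

void poster_cond(void)
{
    pthread_mutex_lock(&lock);
    recovered = true;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
}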



+qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);

Just wanted to confirm why postcopy_pause_incoming is not called here
itself.

postcopy_pause_incoming() is only used in the main ram load thread, while
this function (postcopy_pause_ram_fast_load) is only called by the preempt
load thread.


Ok got it, thanks Peter. I meant whether we should close both channels as soon
as we realise there is some failure, instead of the main thread waiting for the
error event and then closing and pausing postcopy. But agree the current
approach is good.


Is it based on the assumption that if there is an error in any of the channels
it will eventually be paused on the source side, closing both channels,
resulting in postcopy_pause_incoming being called from the main thread on the
destination?

Yes.


Usually it should be good to call it as early as possible. Is it left to the
main thread's default path so that we do not have any synchronisation overhead?

What's the sync overhead you mentioned? What we want to do here is simply
to put all the dest QEMU migration threads into a halted state rather than
quitting, so that they can be continued when necessary.


Also Peter, I was trying to understand the postcopy recovery model, so is the
use case of qmp_migrate_pause just for debugging purposes?

Yes.  It's also a way to cleanly stop using the network (comparing to force
unplug the nic ports?) for whatever reason with a shutdown() syscall upon
the socket.  I just don't know whether there's any real use case of that in
reality.

Thanks,


Re: [PATCH v5 00/21] migration: Postcopy Preemption

2022-05-16 Thread manish.mishra



On 26/04/22 5:08 am, Peter Xu wrote:

This is v5 of postcopy preempt series.  It can also be found here:

   https://github.com/xzpeter/qemu/tree/postcopy-preempt

RFC: https://lore.kernel.org/qemu-devel/20220119080929.39485-1-pet...@redhat.com
V1:  https://lore.kernel.org/qemu-devel/20220216062809.57179-1-pet...@redhat.com
V2:  https://lore.kernel.org/qemu-devel/20220301083925.33483-1-pet...@redhat.com
V3:  https://lore.kernel.org/qemu-devel/20220330213908.26608-1-pet...@redhat.com
V4:  https://lore.kernel.org/qemu-devel/20220331150857.74406-1-pet...@redhat.com

v4->v5 changelog:
- Fixed all checkpatch.pl warnings
- Picked up leftover patches from Dan's tls test case series:
   
https://lore.kernel.org/qemu-devel/20220310171821.3724080-1-berra...@redhat.com/
- Rebased to v7.0.0 tag, collected more R-bs from Dave/Dan
- In migrate_fd_cleanup(), use g_clear_pointer() for s->hostname [Dan]
- Mark postcopy-preempt capability for 7.1 not 7.0 [Dan]
- Moved migrate_channel_requires_tls() into tls.[ch] [Dan]
- Mention the bug-fixing side effect of patch "migration: Export
   tls-[creds|hostname|authz] params to cmdline too" on tls_authz [Dan]
- Use g_autoptr where proper [Dan]
- Drop a few (probably over-cautious) asserts on local_err being set [Dan]
- Quite a few renamings in the qtest in the last few test patches [Dan]

Abstract


This series contains two parts now:

   (1) Leftover patches from Dan's tls unit tests v2, which is first half
   (2) Leftover patches from my postcopy preempt v4, which is second half

This series added a new migration capability called "postcopy-preempt".  It can
be enabled when postcopy is enabled, and it'll simply (but greatly) speed up
postcopy page requests handling process.

Below are some initial postcopy page request latency measurements after the
new series applied.

For each page size, I measured page request latency for three cases:

   (a) Vanilla:the old postcopy
   (b) Preempt no-break-huge:  preempt enabled, 
x-postcopy-preempt-break-huge=off
   (c) Preempt full:   preempt enabled, x-postcopy-preempt-break-huge=on
   (this is the default option when preempt enabled)

Here x-postcopy-preempt-break-huge parameter is just added in v2 so as to
conditionally disable the behavior to break sending a precopy huge page for
debugging purpose.  So when it's off, postcopy will not preempt precopy
sending a huge page, but still postcopy will use its own channel.

I tested it separately to give a rough idea on which part of the change
helped how much of it.  The overall benefit should be the comparison
between case (a) and (c).

   |-----------+---------+-----------------------+--------------|
   | Page size | Vanilla | Preempt no-break-huge | Preempt full |
   |-----------+---------+-----------------------+--------------|
   | 4K        |   10.68 | N/A [*]               |         0.57 |
   | 2M        |   10.58 | 5.49                  |         5.02 |
   | 1G        | 2046.65 | 933.185               |      649.445 |
   |-----------+---------+-----------------------+--------------|
   [*]: This case is N/A because 4K page does not contain huge page at all

[1] 
https://github.com/xzpeter/small-stuffs/blob/master/tools/huge_vm/uffd-latency.bpf


Hi Peter, I just wanted to understand what setup was used for these
experiments, like number of vcpus, workload, and network bandwidth, so that I
can make sense of these numbers. Also I could not understand the reason for so
much difference between Preempt full and Preempt no-break-huge, especially for
the 1G case, so it would help if you could share a little more detail on this.



TODO List
=

Avoid precopy write() blocks postcopy
-

I didn't prove this, but I always think the write() syscalls being blocked
for precopy pages can affect postcopy services.  If we can solve this
problem then my wild guess is we can further reduce the average page
latency.

At least two solutions come to mind: (1) make the write side of the
migration channel non-blocking too, or (2) use multiple threads on the send
side, just like multifd, but with a lock to protect which page to send
(the core idea is that we should _never_ rely on anything happening on the
main thread; multifd has that dependency of queuing pages only on the main
thread).
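As a rough sketch of option (1): the snippet below uses plain POSIX sockets
rather than the QIOChannel code migration actually uses, and only illustrates
the idea of bailing out of a full socket instead of blocking on it:

    #include <errno.h>
    #include <fcntl.h>
    #include <sys/socket.h>

    /* Switch the precopy socket to non-blocking mode. */
    static int set_nonblocking(int fd)
    {
        int flags = fcntl(fd, F_GETFL, 0);
        if (flags < 0) {
            return -errno;
        }
        return fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0 ? -errno : 0;
    }

    /* Push precopy bytes; return 0 instead of blocking when the socket is
     * full, so the caller can go service urgent postcopy requests first. */
    static ssize_t send_precopy_chunk(int fd, const void *buf, size_t len)
    {
        ssize_t ret = send(fd, buf, len, 0);
        if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
            return 0;
        }
        return ret;
    }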

That can definitely be done and thought about later.

Multi-channel for preemption threads
------------------------------------

Currently the postcopy preempt feature uses only one extra channel and one
extra thread on the destination (no new thread on the src QEMU).  That should
be mostly good enough for the major use cases, but when the postcopy queue is
long enough (e.g. hundreds of vCPUs faulted on different pages) logically we
could still observe longer delays on average.  Whether growing the number of
threads/channels can solve it is debatable, but it sounds worth a try.  That
is yet another thing we can think about after this patchset lands.

Logically the design provides space for that - 

Re: [PATCH v5 15/21] migration: Parameter x-postcopy-preempt-break-huge

2022-05-16 Thread manish.mishra


On 26/04/22 5:08 am, Peter Xu wrote:

Add a parameter that can conditionally disable the "break sending huge
page" behavior in postcopy preemption.  By default it's enabled.

It should only be used for debugging purposes, and we should never remove
the "x-" prefix.

Signed-off-by: Peter Xu

Reviewed-by: Manish Mishra 

---
  migration/migration.c | 2 ++
  migration/migration.h | 7 +++
  migration/ram.c   | 7 +++
  3 files changed, 16 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index cce741e20e..cd9650f2e2 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -4329,6 +4329,8 @@ static Property migration_properties[] = {
  DEFINE_PROP_SIZE("announce-step", MigrationState,
parameters.announce_step,
DEFAULT_MIGRATE_ANNOUNCE_STEP),
+DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
+  postcopy_preempt_break_huge, true),
  
  /* Migration capabilities */

  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
diff --git a/migration/migration.h b/migration/migration.h
index f898b8547a..6ee520642f 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -340,6 +340,13 @@ struct MigrationState {
  bool send_configuration;
  /* Whether we send section footer during migration */
  bool send_section_footer;
+/*
+ * Whether we allow break sending huge pages when postcopy preempt is
+ * enabled.  When disabled, we won't interrupt precopy within sending a
+ * host huge page, which is the old behavior of vanilla postcopy.
+ * NOTE: this parameter is ignored if postcopy preempt is not enabled.
+ */
+bool postcopy_preempt_break_huge;
  
  /* Needed by postcopy-pause state */

  QemuSemaphore postcopy_pause_sem;
diff --git a/migration/ram.c b/migration/ram.c
index a4b39e3675..f3a79c8556 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2266,11 +2266,18 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
  
  static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)

  {
+MigrationState *ms = migrate_get_current();
+
  /* Not enabled eager preempt?  Then never do that. */
  if (!migrate_postcopy_preempt()) {
  return false;
  }
  
+/* If the user explicitly disabled breaking of huge page, skip */

+if (!ms->postcopy_preempt_break_huge) {
+return false;
+}
+
  /* If the ramblock we're sending is a small page?  Never bother. */
  if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
  return false;

Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled

2022-05-16 Thread manish.mishra


On 16/05/22 7:41 pm, Peter Xu wrote:

Hi, Manish,

On Mon, May 16, 2022 at 07:01:35PM +0530, manish.mishra wrote:

On 26/04/22 5:08 am, Peter Xu wrote:
LGTM.
Peter, I wanted to give a review tag for this and the earlier patch too. I am
new to the qemu review process, so I am not sure how to give a review tag and
did not find any reference on Google either. Please let me know how to do it.

It's here:

https://git.qemu.org/?p=qemu.git;a=blob;f=docs/devel/submitting-a-patch.rst;hb=HEAD#l492


Since afaict QEMU is mostly following what Linux does, you can also
reference to this one with more context:

https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html#using-reported-by-tested-by-reviewed-by-suggested-by-and-fixes


But since you're still having questions regarding this patch, no rush on
providing your R-bs; let's finish the discussion first.

[...]


+static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
+{
+trace_postcopy_pause_fast_load();
+qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);

I may have misunderstood the synchronisation here, and this would be a very
rare case, but since both threads work independently, is it possible that
qemu_sem_post() on postcopy_pause_sem_fast_load is called by the main thread
even before we reach the qemu_sem_wait() on the next line, causing some kind
of deadlock?  That should not be possible as I understand it, since it
requires manually calling qmp_migration_recover, so the chances are almost
nil.  Just wanted to confirm it.

Sorry I don't quite get the question, could you elaborate?  E.g., when the
described deadlock happened, what is both main thread and preempt load
thread doing?  What are they waiting at?

Note: we have already released mutex before waiting on sem.


What I meant here is that the deadlock could happen because we wait forever
in qemu_sem_wait(&mis->postcopy_pause_sem_fast_load), if the main thread has
already posted postcopy_pause_sem_fast_load after recovery, before we even
reach the qemu_sem_wait(&mis->postcopy_pause_sem_fast_load) on the next line;
basically, if we miss a post on postcopy_pause_sem_fast_load.  This is a
nearly impossible case because it requires the full recovery path to complete
before this thread executes just the next line, and recovery needs to be
triggered manually, so please ignore this.

Basically I wanted to check whether we should use something like

int pthread_cond_wait(pthread_cond_t *restrict cond,
                      pthread_mutex_t *restrict mutex);

so that there is no race between releasing the mutex and calling wait.
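For illustration of why the existing pattern is safe: a QemuSemaphore, like a
POSIX semaphore, counts its posts, so a post that happens before the waiter
reaches the wait is not lost and the later wait simply returns immediately.
A bare pthread_cond_signal(), by contrast, is lost if nobody is waiting yet,
which is why condition variables need the mutex-protected predicate.  A
minimal standalone sketch with POSIX primitives (not the actual QEMU
helpers):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t pause_sem;
    static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;

    static void *recover_thread(void *arg)
    {
        (void)arg;
        sem_post(&pause_sem);       /* post happens before anyone waits */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        sem_init(&pause_sem, 0, 0);
        pthread_create(&t, NULL, recover_thread, NULL);
        pthread_join(t, NULL);      /* guarantee the early post happened */

        pthread_mutex_lock(&mu);
        pthread_mutex_unlock(&mu);  /* release, as the fast-load thread does */
        sem_wait(&pause_sem);       /* returns at once: the post was counted */
        printf("no deadlock: the early post was not lost\n");
        return 0;
    }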




+qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);

Just wanted to confirm why postcopy_pause_incoming is not called here
itself.

postcopy_pause_incoming() is only used in the main ram load thread, while
this function (postcopy_pause_ram_fast_load) is only called by the preempt
load thread.


Ok got it, thanks Peter. I meant whether we should close both the channels as
soon as we realise there is some failure, instead of the main thread waiting
for the error event and then closing and pausing postcopy. But I agree the
current approach is good.


Is it based on the assumption that if there is an error on any of the
channels, migration will eventually be paused on the source side, closing
both channels, so that postcopy_pause_incoming will be called from the main
thread on the destination?

Yes.


Usually it should be good to call it as early as possible. Is it left to the
main thread's default path so that we do not have any synchronisation
overhead?

What's the sync overhead you mentioned? What we want to do here is simply
to put all the dest QEMU migration threads into a halted state rather than
quitting, so that they can be continued when necessary.


Also Peter, I was trying to understand the postcopy recovery model, so is the
use case of qmp_migrate_pause just for debugging purposes?

Yes.  It's also a way to cleanly stop using the network (compared to
force-unplugging the NIC ports?) for whatever reason, with a shutdown()
syscall upon the socket.  I just don't know whether there's any real use case
for that in reality.

Thanks,


Re: [PATCH v5 14/21] migration: Create the postcopy preempt channel asynchronously

2022-05-16 Thread manish.mishra


On 26/04/22 5:08 am, Peter Xu wrote:

This patch allows the postcopy preempt channel to be created
asynchronously.  The benefit is that when the connection is slow, we won't
take the BQL (and potentially block all things like QMP) for a long time
without releasing.

A function postcopy_preempt_wait_channel() is introduced, allowing the
migration thread to be able to wait on the channel creation.  The channel
is always created by the main thread, in which we'll kick a new semaphore
to tell the migration thread that the channel has created.

We'll need to wait for the new channel in two places: (1) when there's a
new postcopy migration that is starting, or (2) when there's a postcopy
migration to resume.

For the start of migration, we don't need to wait for this channel until
when we want to start postcopy, aka, postcopy_start().  We'll fail the
migration if we found that the channel creation failed (which should
probably not happen at all in 99% of the cases, because the main channel is
using the same network topology).

For a postcopy recovery, we'll need to wait in postcopy_pause().  In that
case if the channel creation failed, we can't fail the migration or we'll
crash the VM, instead we keep in PAUSED state, waiting for yet another
recovery.

Signed-off-by: Peter Xu


Reviewed-by: Manish Mishra 

This also looks good to me. Sorry if I misunderstood and this is not the
correct way to add a review tag.


---
  migration/migration.c| 16 
  migration/migration.h|  7 +
  migration/postcopy-ram.c | 56 +++-
  migration/postcopy-ram.h |  1 +
  4 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index a0db5de685..cce741e20e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3020,6 +3020,12 @@ static int postcopy_start(MigrationState *ms)
  int64_t bandwidth = migrate_max_postcopy_bandwidth();
  bool restart_block = false;
  int cur_state = MIGRATION_STATUS_ACTIVE;
+
+if (postcopy_preempt_wait_channel(ms)) {
+migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
+return -1;
+}
+
  if (!migrate_pause_before_switchover()) {
  migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_POSTCOPY_ACTIVE);
@@ -3501,6 +3507,14 @@ static MigThrError postcopy_pause(MigrationState *s)
  if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
  /* Woken up by a recover procedure. Give it a shot */
  
+if (postcopy_preempt_wait_channel(s)) {

+/*
+ * Preempt enabled, and new channel create failed; loop
+ * back to wait for another recovery.
+ */
+continue;
+}
+
  /*
   * Firstly, let's wake up the return path now, with a new
   * return path channel.
@@ -4360,6 +4374,7 @@ static void migration_instance_finalize(Object *obj)
  qemu_sem_destroy(&ms->postcopy_pause_sem);
  qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
  qemu_sem_destroy(&ms->rp_state.rp_sem);
+qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
  error_free(ms->error);
  }
  
@@ -4406,6 +4421,7 @@ static void migration_instance_init(Object *obj)

  qemu_sem_init(&ms->rp_state.rp_sem, 0);
  qemu_sem_init(&ms->rate_limit_sem, 0);
  qemu_sem_init(&ms->wait_unplug_sem, 0);
+qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
  qemu_mutex_init(&ms->qemu_file_lock);
  }
  
diff --git a/migration/migration.h b/migration/migration.h

index 91f845e9e4..f898b8547a 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -219,6 +219,13 @@ struct MigrationState {
  QEMUFile *to_dst_file;
  /* Postcopy specific transfer channel */
  QEMUFile *postcopy_qemufile_src;
+/*
+ * It is posted when the preempt channel is established.  Note: this is
+ * used for both the start or recover of a postcopy migration.  We'll
+ * post to this sem every time a new preempt channel is created in the
+ * main thread, and we keep post() and wait() in pair.
+ */
+QemuSemaphore postcopy_qemufile_src_sem;
  QIOChannelBuffer *bioc;
  /*
   * Protects to_dst_file/from_dst_file pointers.  We need to make sure we
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index b3c81b46f6..1bb603051a 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -1552,10 +1552,50 @@ bool 
postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
  return true;
  }
  
-int postcopy_preempt_setup(MigrationState *s, Error **errp)

+static void
+postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
  {
-QIOChannel *ioc;
+MigrationState *s = opaque;
+QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task));
+Error *local_err = NULL;
+
+if (qio_task_propagate_err
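The quoted hunk breaks off above.  For illustration, the waiting helper the
commit message describes could look roughly like the sketch below
(reconstructed from the description; details may differ from the actual
patch):

    /* Migration thread: wait until the main thread has finished its attempt
     * to create the preempt channel, then check whether it succeeded. */
    int postcopy_preempt_wait_channel(MigrationState *s)
    {
        /* If postcopy preempt is not in use, there is nothing to wait for. */
        if (!migrate_postcopy_preempt()) {
            return 0;
        }

        /*
         * The main thread posts postcopy_qemufile_src_sem once per channel
         * creation attempt, whether it succeeded or failed.
         */
        qemu_sem_wait(&s->postcopy_qemufile_src_sem);

        /* A NULL file here means the connection attempt failed. */
        return s->postcopy_qemufile_src ? 0 : -1;
    }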

Re: [PATCH v5 13/21] migration: Postcopy recover with preempt enabled

2022-05-16 Thread manish.mishra



On 26/04/22 5:08 am, Peter Xu wrote:
LGTM.
Peter, I wanted to give a review tag for this and the earlier patch too. I am
new to the qemu review process, so I am not sure how to give a review tag and
did not find any reference on Google either. Please let me know how to do it.

To allow postcopy recovery, the ram fast load (preempt-only) dest QEMU thread
needs similar handling on fault tolerance.  When ram_load_postcopy() fails,
instead of stopping the thread it halts with a semaphore, preparing to be
kicked again when recovery is detected.

A mutex is introduced to make sure there's no concurrent operation upon the
socket.  To make it simple, the fast ram load thread will take the mutex during
its whole procedure, and only release it if it's paused.  The fast-path socket
will be properly released by the main loading thread safely when there's
network failures during postcopy with that mutex held.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
---
  migration/migration.c| 27 +++
  migration/migration.h| 19 +++
  migration/postcopy-ram.c | 25 +++--
  migration/qemu-file.c| 27 +++
  migration/qemu-file.h|  1 +
  migration/savevm.c   | 26 --
  migration/trace-events   |  2 ++
  7 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 8264b03d4d..a0db5de685 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -215,9 +215,11 @@ void migration_object_init(void)
  current_incoming->postcopy_remote_fds =
  g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
qemu_mutex_init(&current_incoming->rp_mutex);
+qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
qemu_event_init(&current_incoming->main_thread_load_event, false);
qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
+qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
qemu_mutex_init(&current_incoming->page_request_mutex);
  current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
  
@@ -697,9 +699,9 @@ static bool postcopy_try_recover(void)
  
  /*

   * Here, we only wake up the main loading thread (while the
- * fault thread will still be waiting), so that we can receive
+ * rest threads will still be waiting), so that we can receive
   * commands from source now, and answer it if needed. The
- * fault thread will be woken up afterwards until we are sure
+ * rest threads will be woken up afterwards until we are sure
   * that source is ready to reply to page requests.
   */
  qemu_sem_post(&mis->postcopy_pause_sem_dst);
@@ -3470,6 +3472,18 @@ static MigThrError postcopy_pause(MigrationState *s)
  qemu_file_shutdown(file);
  qemu_fclose(file);
  
+/*

+ * Do the same to postcopy fast path socket too if there is.  No
+ * locking needed because no racer as long as we do this before setting
+ * status to paused.
+ */
+if (s->postcopy_qemufile_src) {
+migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
+qemu_file_shutdown(s->postcopy_qemufile_src);
+qemu_fclose(s->postcopy_qemufile_src);
+s->postcopy_qemufile_src = NULL;
+}
+
  migrate_set_state(&s->state, s->state,
MIGRATION_STATUS_POSTCOPY_PAUSED);
  
@@ -3525,8 +3539,13 @@ static MigThrError migration_detect_error(MigrationState *s)

  return MIG_THR_ERR_FATAL;
  }
  
-/* Try to detect any file errors */

-ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
+/*
+ * Try to detect any file errors.  Note that postcopy_qemufile_src will
+ * be NULL when postcopy preempt is not enabled.
+ */
+ret = qemu_file_get_error_obj_any(s->to_dst_file,
+  s->postcopy_qemufile_src,
+  &local_error);
  if (!ret) {
  /* Everything is fine */
  assert(!local_error);
diff --git a/migration/migration.h b/migration/migration.h
index b8aacfe3af..91f845e9e4 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -118,6 +118,18 @@ struct MigrationIncomingState {
  /* Postcopy priority thread is used to receive postcopy requested pages */
  QemuThread postcopy_prio_thread;
  bool postcopy_prio_thread_created;
+/*
+ * Used to sync between the ram load main thread and the fast ram load
+ * thread.  It protects postcopy_qemufile_dst, which is the postcopy
+ * fast channel.
+ *
+ * The ram fast load thread will take it mostly for the whole lifecycle
+ * because it needs to continuously read data from the channel, and
+ * it'll only release this mutex if postcopy is interrupted, 
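The quoted diff breaks off above.  Pieced together from the hunks quoted
earlier in this thread, the pause path of the fast load thread is roughly the
following (trace point names and exact ordering may differ slightly from the
applied patch):

    static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
    {
        trace_postcopy_pause_fast_load();
        /* Drop the mutex so the main thread can safely shut down or replace
         * the preempt socket while we are paused. */
        qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
        /* Halt here until a recovery kicks the semaphore. */
        qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
        /* Re-take the mutex before resuming reads on the new channel. */
        qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
        trace_postcopy_pause_fast_load_continued();
    }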

Re: [PATCH v4 10/19] migration: Postcopy preemption enablement

2022-05-13 Thread manish.mishra



On 12/05/22 9:52 pm, Peter Xu wrote:

Hi, Manish,

On Wed, May 11, 2022 at 09:24:28PM +0530, manish.mishra wrote:

@@ -1962,9 +2038,17 @@ static bool get_queued_page(RAMState *rs, 
PageSearchStatus *pss)
   RAMBlock  *block;
   ram_addr_t offset;
+again:
   block = unqueue_page(rs, &offset);
-if (!block) {
+if (block) {
+/* See comment above postcopy_preempted_contains() */
+if (postcopy_preempted_contains(rs, block, offset)) {
+trace_postcopy_preempt_hit(block->idstr, offset);
+/* This request is dropped */
+goto again;
+}

If we continuously keep getting new postcopy requests, is it possible that
this case can starve a postcopy request which is in precopy preemption?

I didn't fully get your thoughts, could you elaborate?

Here we're checking against the case where the postcopy requested page is
exactly the one that we have preempted in previous precopy sessions.  If
true, we drop this postcopy page and continue with the rest.

When there'll be no postcopy requests pending then we'll continue with the
precopy page, which is exactly the request we've dropped.

Why we did this is actually in comment above postcopy_preempted_contains(),
and quotting from there:

/*
  * This should really happen very rarely, because it means when we were sending
  * during background migration for postcopy we're sending exactly the page that
  * some vcpu got faulted on on dest node.  When it happens, we probably don't
  * need to do much but drop the request, because we know right after we restore
  * the precopy stream it'll be serviced.  It'll slightly affect the order of
  * postcopy requests to be serviced (e.g. it'll be the same as we move current
  * request to the end of the queue) but it shouldn't be a big deal.  The most
  * imporant thing is we can _never_ try to send a partial-sent huge page on the
  * POSTCOPY channel again, otherwise that huge page will got "split brain" on
  * two channels (PRECOPY, POSTCOPY).
  */

[...]


Hi Peter, what I meant here is that since we go back to precopy sending only
when there is no postcopy request left, if there is some workload which
continuously generates new postcopy fault requests, it may take very long
before we resume on the precopy channel.

So basically the precopy channel may have a postcopy request pending for a
very long time in this case?  Earlier, as it was FCFS, there was a guarantee
that a postcopy request would be served within a bounded amount of time.


@@ -2211,7 +2406,16 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
   return 0;
   }
+if (migrate_postcopy_preempt() && migration_in_postcopy()) {

I see why there is only one extra channel: multiFD is not supported for
postcopy.  Peter, any particular reason for that?

We used one channel not because multifd is not enabled - if you read into
the series the channels are separately managed because they're servicing
different goals.  It's because I don't really know whether multiple
channels would be necessary, because postcopy requests should not be the
major channel that pages will be sent, kind of a fast-path.

One of the major goal of this series is to avoid interruptions made to
postcopy urgent pages due to sending of precopy pages.  One extra channel
already serviced it well, so I just stopped there as the initial version.
I actually raised that question myself too in the cover letter in the todo
section, I think we can always evaluate the possibility of that in the
future without major reworks (but we may need another parameter to specify
the num of threads just like multifd).


> because postcopy requests should not be the major channel that pages will
> be sent, kind of a fast-path.

Yes, agreed Peter, but in the worst-case scenario is it possible that we may
have to transfer the full memory of the VM via postcopy requests?  In that
case we may require a higher number of threads.  But I agree there cannot be
a binding to the number of multiFD channels, as multiFD uses a 256KB buffer
size whereas here we may have to send 4K in the small-page case, so there can
be a big difference in throughput limits.  Also, the smaller the buffer size,
the higher the CPU usage, so it needs to be decided carefully.

As it must be very bad without multiFD (we have seen we cannot utilise a NIC
beyond 10 Gbps without multiFD), if this is something on the TODO list, can
we help with that?

Yes, that should be on Juan's todo list (in the cc list as well), and
AFAICT he'll be happy if anyone would like to take items out of the list.
We can further discuss it somewhere.

One thing to mention is that I suspect the thread models will still need to
be separate even if multifd joins the equation.  I mean, IMHO multifd
threads take chunks of pages and send things in bulk, while if you read
into this series postcopy preempt threads send page one by one and asap.
T

Re: [PATCH v5 11/21] migration: Postcopy preemption preparation on channel creation

2022-05-11 Thread manish.mishra

LGTM

On 26/04/22 5:08 am, Peter Xu wrote:

Create a new socket for postcopy to be prepared to send postcopy requested
pages via this specific channel, so as to not get blocked by precopy pages.

A new thread is also created on dest qemu to receive data from this new channel
based on the ram_load_postcopy() routine.

The ram_load_postcopy(POSTCOPY) branch and the thread has not started to
function, and that'll be done in follow up patches.

Cleanup the new sockets on both src/dst QEMUs, meanwhile look after the new
thread too to make sure it'll be recycled properly.

Signed-off-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
---
  migration/migration.c| 62 +++
  migration/migration.h|  8 
  migration/postcopy-ram.c | 92 ++--
  migration/postcopy-ram.h | 10 +
  migration/ram.c  | 25 ---
  migration/ram.h  |  4 +-
  migration/savevm.c   | 20 -
  migration/socket.c   | 22 +-
  migration/socket.h   |  1 +
  migration/trace-events   |  5 ++-
  10 files changed, 218 insertions(+), 31 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 75d9185c3a..e27aa306bc 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -321,6 +321,12 @@ void migration_incoming_state_destroy(void)
  mis->page_requested = NULL;
  }
  
+if (mis->postcopy_qemufile_dst) {

+migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
+qemu_fclose(mis->postcopy_qemufile_dst);
+mis->postcopy_qemufile_dst = NULL;
+}
+
  yank_unregister_instance(MIGRATION_YANK_INSTANCE);
  }
  
@@ -714,15 +720,21 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp)

  migration_incoming_process();
  }
  
+static bool migration_needs_multiple_sockets(void)

+{
+return migrate_use_multifd() || migrate_postcopy_preempt();
+}
+
  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
  {
  MigrationIncomingState *mis = migration_incoming_get_current();
  Error *local_err = NULL;
  bool start_migration;
+QEMUFile *f;
  
  if (!mis->from_src_file) {

  /* The first connection (multifd may have multiple) */
-QEMUFile *f = qemu_fopen_channel_input(ioc);
+f = qemu_fopen_channel_input(ioc);
  
  if (!migration_incoming_setup(f, errp)) {

  return;
@@ -730,13 +742,18 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
  
  /*

   * Common migration only needs one channel, so we can start
- * right now.  Multifd needs more than one channel, we wait.
+ * right now.  Some features need more than one channel, we wait.
   */
-start_migration = !migrate_use_multifd();
+start_migration = !migration_needs_multiple_sockets();
  } else {
  /* Multiple connections */
-assert(migrate_use_multifd());
-start_migration = multifd_recv_new_channel(ioc, &local_err);
+assert(migration_needs_multiple_sockets());
+if (migrate_use_multifd()) {
+start_migration = multifd_recv_new_channel(ioc, &local_err);
+} else if (migrate_postcopy_preempt()) {
+f = qemu_fopen_channel_input(ioc);
+start_migration = postcopy_preempt_new_channel(mis, f);
+}
  if (local_err) {
  error_propagate(errp, local_err);
  return;
@@ -761,11 +778,20 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
Error **errp)
  bool migration_has_all_channels(void)
  {
  MigrationIncomingState *mis = migration_incoming_get_current();
-bool all_channels;
  
-all_channels = multifd_recv_all_channels_created();

+if (!mis->from_src_file) {
+return false;
+}
+
+if (migrate_use_multifd()) {
+return multifd_recv_all_channels_created();
+}
+
+if (migrate_postcopy_preempt()) {
+return mis->postcopy_qemufile_dst != NULL;
+}
  
-return all_channels && mis->from_src_file != NULL;

+return true;
  }
  
  /*

@@ -1862,6 +1888,12 @@ static void migrate_fd_cleanup(MigrationState *s)
  qemu_fclose(tmp);
  }
  
+if (s->postcopy_qemufile_src) {

+migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
+qemu_fclose(s->postcopy_qemufile_src);
+s->postcopy_qemufile_src = NULL;
+}
+
  assert(!migration_is_active(s));
  
  if (s->state == MIGRATION_STATUS_CANCELLING) {

@@ -3237,6 +3269,11 @@ static void migration_completion(MigrationState *s)
  qemu_savevm_state_complete_postcopy(s->to_dst_file);
  qemu_mutex_unlock_iothread();
  
+/* Shutdown the postcopy fast path thread */

+if (migrate_postcopy_preempt()) {
+postcopy_preempt_shutdown_file(s);
+}
+
  trace_migration_completion_postcopy_end_after_complete();
  } else {
  goto fail;
@@ -41

Re: [PATCH v4 10/19] migration: Postcopy preemption enablement

2022-05-11 Thread manish.mishra

On 31/03/22 8:38 pm, Peter Xu wrote:

LGTM


This patch enables postcopy-preempt feature.

It contains two major changes to the migration logic:

(1) Postcopy requests are now sent via a different socket from precopy
 background migration stream, so as to be isolated from very high page
 request delays.

(2) For huge page enabled hosts: when there's postcopy requests, they can now
 intercept a partial sending of huge host pages on src QEMU.

After this patch, we'll live migrate a VM with two channels for postcopy: (1)
PRECOPY channel, which is the default channel that transfers background pages;
and (2) POSTCOPY channel, which only transfers requested pages.

There's no strict rule of which channel to use, e.g., if a requested page is
already being transferred on precopy channel, then we will keep using the same
precopy channel to transfer the page even if it's explicitly requested.  In 99%
of the cases we'll prioritize the channels so we send requested page via the
postcopy channel as long as possible.

On the source QEMU, when we found a postcopy request, we'll interrupt the
PRECOPY channel sending process and quickly switch to the POSTCOPY channel.
After we serviced all the high priority postcopy pages, we'll switch back to
PRECOPY channel so that we'll continue to send the interrupted huge page again.
There's no new thread introduced on src QEMU.

On the destination QEMU, one new thread is introduced to receive page data from
the postcopy specific socket (done in the preparation patch).

This patch has a side effect: after sending postcopy pages, previously we'll
assume the guest will access follow up pages so we'll keep sending from there.
Now it's changed.  Instead of going on with a postcopy requested page, we'll go
back and continue sending the precopy huge page (which can be intercepted by a
postcopy request so the huge page can be sent partially before).

Whether that's a problem is debatable, because "assuming the guest will
continue to access the next page" may not really suite when huge pages are
used, especially if the huge page is large (e.g. 1GB pages).  So that locality
hint is much meaningless if huge pages are used.

Reviewed-by: Dr. David Alan Gilbert
Signed-off-by: Peter Xu
---
  migration/migration.c  |   2 +
  migration/migration.h  |   2 +-
  migration/ram.c| 250 +++--
  migration/trace-events |   7 ++
  4 files changed, 252 insertions(+), 9 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 01b882494d..56d54c186b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3158,6 +3158,8 @@ static int postcopy_start(MigrationState *ms)
MIGRATION_STATUS_FAILED);
  }
  
+trace_postcopy_preempt_enabled(migrate_postcopy_preempt());

+
  return ret;
  
  fail_closefb:

diff --git a/migration/migration.h b/migration/migration.h
index caa910d956..b8aacfe3af 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -68,7 +68,7 @@ typedef struct {
  struct MigrationIncomingState {
  QEMUFile *from_src_file;
  /* Previously received RAM's RAMBlock pointer */
-RAMBlock *last_recv_block;
+RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
  /* A hook to allow cleanup at the end of incoming migration */
  void *transport_data;
  void (*transport_cleanup)(void *data);
diff --git a/migration/ram.c b/migration/ram.c
index c7ea1d9215..518d511874 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -295,6 +295,20 @@ struct RAMSrcPageRequest {
  QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
  };
  
+typedef struct {

+/*
+ * Cached ramblock/offset values if preempted.  They're only meaningful if
+ * preempted==true below.
+ */
+RAMBlock *ram_block;
+unsigned long ram_page;
+/*
+ * Whether a postcopy preemption just happened.  Will be reset after
+ * precopy recovered to background migration.
+ */
+bool preempted;
+} PostcopyPreemptState;
+
  /* State of RAM for migration */
  struct RAMState {
  /* QEMUFile used for this migration */
@@ -349,6 +363,14 @@ struct RAMState {
  /* Queue of outstanding page requests from the destination */
  QemuMutex src_page_req_mutex;
  QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
+
+/* Postcopy preemption informations */
+PostcopyPreemptState postcopy_preempt_state;
+/*
+ * Current channel we're using on src VM.  Only valid if postcopy-preempt
+ * is enabled.
+ */
+unsigned int postcopy_channel;
  };
  typedef struct RAMState RAMState;
  
@@ -356,6 +378,11 @@ static RAMState *ram_state;
  
  static NotifierWithReturnList precopy_notifier_list;
  
+static void postcopy_preempt_reset(RAMState *rs)

+{
+memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
+}
+
  /* Whether postcopy has queued requests? */
  static bool postcopy_has_request(RAMState *rs)
  {
@@ -1947,6 +1974,55 @@ void ram_wr
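The quoted diff breaks off above.  As a purely illustrative sketch of the
channel choice the commit message describes (hypothetical helper and
approximate field names, not the code used by the series): urgent,
postcopy-requested pages go out on the dedicated preempt channel while
background precopy pages stay on the original stream.

    static QEMUFile *pick_send_channel(RAMState *rs, bool urgent_request)
    {
        /* Requested pages take the fast path when preempt is active... */
        if (migrate_postcopy_preempt() && migration_in_postcopy() &&
            urgent_request) {
            return migrate_get_current()->postcopy_qemufile_src;
        }
        /* ...everything else keeps using the precopy stream. */
        return rs->f;
    }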

[PATCH] migration/ram: Avoid throttling VM in the first iteration

2021-09-28 Thread manish.mishra
In some cases, for a large VM, memory_global_dirty_log_sync() and
ramblock_sync_dirty_bitmap() combined can take more than 1 sec.  As a
result the diff of end_time and rs->time_last_bitmap_sync can be more
than 1 sec even when migration_bitmap_sync is called the first time,
setting up the throttle in the first iteration itself.  When
migration_bitmap_sync is called the first time, num_dirty_pages_period
is equal to the VM's total pages and ram_counters.transferred is zero,
which forces the VM to be throttled to 99% from the start of migration.
So we should always check that dirty_sync_count is greater than 1 before
trying to throttle.

Signed-off-by: manish.mishra 
---
 migration/ram.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7a43bfd7af..9ba1c8b235 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1006,8 +1006,12 @@ static void migration_bitmap_sync(RAMState *rs)
 
 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
-/* more than 1 second = 1000 millisecons */
-if (end_time > rs->time_last_bitmap_sync + 1000) {
+/*
+ * more than 1 second = 1000 millisecons
+ * Avoid throttling VM in the first iteration of live migration.
+ */
+if (end_time > rs->time_last_bitmap_sync + 1000 &&
+ram_counters.dirty_sync_count > 1) {
 migration_trigger_throttle(rs);
 
 migration_update_rates(rs, end_time);
-- 
2.22.3
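For context on why the first sync used to trip the throttle: the trigger
compares the bytes dirtied during the period against a percentage of the
bytes transferred in the same period, and on the very first call the
transferred counter is still zero, so any dirty page exceeds the threshold.
A rough sketch of that check (field and parameter names approximate, not the
exact upstream code):

    static void migration_trigger_throttle(RAMState *rs)
    {
        MigrationState *s = migrate_get_current();
        uint64_t threshold = s->parameters.throttle_trigger_threshold;
        uint64_t bytes_xfer_period =
            ram_counters.transferred - rs->bytes_xfer_prev;
        uint64_t bytes_dirty_period =
            rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
        uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

        /* On the first sync, bytes_xfer_period == 0, so the threshold is 0
         * and the guest would be throttled immediately unless the caller
         * checks dirty_sync_count > 1 first, as this patch does. */
        if (migrate_auto_converge() &&
            bytes_dirty_period > bytes_dirty_threshold) {
            mig_throttle_guest_down(bytes_dirty_period, bytes_dirty_threshold);
        }
    }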