On Thu, Aug 7, 2025 at 5:36 PM Lukas Straub <lukasstra...@web.de> wrote:
> On Thu, 7 Aug 2025 10:41:17 +0800
> yong.hu...@smartx.com wrote:
>
> > From: Hyman Huang <yong.hu...@smartx.com>
> >
> > When there are network issues such as missing TCP ACKs on the send
> > side during a multifd live migration, the source QEMU throws a
> > "Connection timed out" error and stops sending data, while on the
> > receive side the IO channels may block in recvmsg(); the main loop
> > then gets stuck and consequently fails to respond to QMP commands.
> > ...
>
> Hi Hyman Huang,
>
> Have you tried the 'yank' command to shut down the sockets? It is
> exactly meant to recover from hangs and should solve your issue.
>
> https://www.qemu.org/docs/master/interop/qemu-qmp-ref.html#yank-feature

Thanks for the comment and advice. Let me give more details about the
migration state when the issue happens.

On the source side, libvirt has already aborted the migration job:

$ virsh domjobinfo fdecd242-f278-4308-8c3b-46e144e55f63
Job type:         Failed
Operation:        Outgoing migration

QMP query-yank shows that there is no migration yank instance:

$ virsh qemu-monitor-command fdecd242-f278-4308-8c3b-46e144e55f63 '{"execute":"query-yank"}' --pretty
{
  "return": [
    {
      "type": "chardev",
      "id": "charmonitor"
    },
    {
      "type": "chardev",
      "id": "charchannel0"
    },
    {
      "type": "chardev",
      "id": "libvirt-2-virtio-format"
    }
  ],
  "id": "libvirt-5217"
}
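So even if we wanted to yank the migration connection, there is nothing left to yank. For reference, if a migration yank instance were still registered, recovery would be a single QMP command along these lines (a sketch based on the documented yank interface; against the instance list above it would fail with DeviceNotFound, since only chardev instances exist):

{ "execute": "yank",
  "arguments": { "instances": [ { "type": "migration" } ] } }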
The libvirt migration job is stuck, as the following backtrace shows: migration is waiting for the "Finish" RPC on the destination side to return.

#0  0x00007f4c93d086c9 in __GI___poll (fds=0x7f4c50000d20, nfds=2, timeout=-1) at ../sysdeps/unix/sysv/linux/poll.c:29
#1  0x00007f4c93e99379 in ?? () from /lib64/libglib-2.0.so.0
#2  0x00007f4c93e996c2 in g_main_loop_run () from /lib64/libglib-2.0.so.0
#3  0x00007f4c94aac92a in virNetClientIOEventLoop (client=client@entry=0x7f4c501a3ef0, thiscall=thiscall@entry=0x7f4c50052a90) at ../../src/rpc/virnetclient.c:1684
#4  0x00007f4c94aacf59 in virNetClientIO (thiscall=0x7f4c50052a90, client=0x7f4c501a3ef0) at ../../src/rpc/virnetclient.c:1952
#5  virNetClientSendInternal (client=client@entry=0x7f4c501a3ef0, msg=msg@entry=0x7f4c501a2150, expectReply=expectReply@entry=true, nonBlock=nonBlock@entry=false) at ../../src/rpc/virnetclient.c:2123
#6  0x00007f4c94aae793 in virNetClientSendWithReply (client=client@entry=0x7f4c501a3ef0, msg=msg@entry=0x7f4c501a2150) at ../../src/rpc/virnetclient.c:2151
#7  0x00007f4c94aa9460 in virNetClientProgramCall (prog=prog@entry=0x7f4c50066870, client=client@entry=0x7f4c501a3ef0, serial=serial@entry=10, proc=proc@entry=306, noutfds=noutfds@entry=0, outfds=outfds@entry=0x0, ninfds=0x0, infds=0x0, args_filter=0x7f4c94af1290 <xdr_remote_domain_migrate_finish3_params_args>, args=0x7f4c8487e300, ret_filter=0x7f4c94af1310 <xdr_remote_domain_migrate_finish3_params_ret>, ret=0x7f4c8487e350) at ../../src/rpc/virnetclientprogram.c:324
#8  0x00007f4c94acb2e4 in callFull (priv=priv@entry=0x7f4c5004c800, flags=flags@entry=0, fdin=fdin@entry=0x0, fdinlen=fdinlen@entry=0, fdout=fdout@entry=0x0, fdoutlen=fdoutlen@entry=0x0, proc_nr=306, args_filter=0x7f4c94af1290 <xdr_remote_domain_migrate_finish3_params_args>, args=0x7f4c8487e300 "\004", ret_filter=0x7f4c94af1310 <xdr_remote_domain_migrate_finish3_params_ret>, ret=0x7f4c8487e350 "", conn=0x7f4c5007c900) at ../../src/remote/remote_driver.c:6754
#9  0x00007f4c94ae20f8 in call (conn=0x7f4c5007c900, ret=0x7f4c8487e350 "", ret_filter=<optimized out>, args=0x7f4c8487e300 "\004", args_filter=<optimized out>, proc_nr=306, flags=0, priv=<optimized out>) at ../../src/remote/remote_driver.c:6776
#10 remoteDomainMigrateFinish3Params (dconn=0x7f4c5007c900, params=<optimized out>, nparams=4, cookiein=0x0, cookieinlen=0, cookieout=0x7f4c8487e4e0, cookieoutlen=0x7f4c8487e4b4, flags=131611, cancelled=1) at ../../src/remote/remote_driver.c:7362  // the RPC that invokes the destination's Finish API, blocking until it returns
#11 0x00007f4c74d44600 in qemuMigrationSrcPerformPeer2Peer3 (flags=<optimized out>, useParams=<optimized out>, bandwidth=0, migParams=0x7f4c5002b540, nbdPort=0, migrate_disks=<optimized out>, nmigrate_disks=0, listenAddress=<optimized out>, graphicsuri=0x0, uri=<optimized out>, dname=0x0, persist_xml=0x7f4c5006f720 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., xmlin=<optimized out>, vm=0x7f4c38257de0, dconnuri=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", dconn=0x7f4c5007c900, sconn=0x7f4c0000fb70, driver=0x7f4c3814e4b0) at ../../src/qemu/qemu_migration.c:4512
#12 qemuMigrationSrcPerformPeer2Peer (v3proto=<synthetic pointer>, resource=0, dname=0x0, flags=<optimized out>, migParams=0x7f4c5002b540, nbdPort=0, migrate_disks=<optimized out>, nmigrate_disks=0, listenAddress=<optimized out>, graphicsuri=0x0, uri=<optimized out>, dconnuri=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", persist_xml=0x7f4c5006f720 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., xmlin=<optimized out>, vm=0x7f4c38257de0, sconn=0x7f4c0000fb70, driver=0x7f4c3814e4b0) at ../../src/qemu/qemu_migration.c:4767
#13 qemuMigrationSrcPerformJob (driver=driver@entry=0x7f4c3814e4b0, conn=conn@entry=0x7f4c0000fb70, vm=vm@entry=0x7f4c38257de0, xmlin=xmlin@entry=0x7f4c50026f80 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., persist_xml=persist_xml@entry=0x7f4c5006f720 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., dconnuri=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", uri=0x7f4c501a1430 "tcp://172.16.170.52", graphicsuri=0x0, listenAddress=0x0, nmigrate_disks=0, migrate_disks=0x0, nbdPort=0, migParams=0x7f4c5002b540, cookiein=0x0, cookieinlen=0, cookieout=0x7f4c8487e8c8, cookieoutlen=0x7f4c8487e8bc, flags=1073885723, dname=0x0, resource=0, v3proto=<optimized out>) at ../../src/qemu/qemu_migration.c:4842
#14 0x00007f4c74d44c6c in qemuMigrationSrcPerform (driver=driver@entry=0x7f4c3814e4b0, conn=0x7f4c0000fb70, vm=0x7f4c38257de0, xmlin=0x7f4c50026f80 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., persist_xml=0x7f4c5006f720 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<domain type=\"kvm\" xmlns:qemu=\"http://libvirt.org/schemas/domain/qemu/1.0\"><name>5cdd670f-ac55-4820-a66e-b6e3985e1520</name><uuid>dd053ff0-5e12-44f5-9b97-1826715"..., dconnuri=dconnuri@entry=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", uri=0x7f4c501a1430 "tcp://172.16.170.52", graphicsuri=0x0, listenAddress=0x0, nmigrate_disks=0, migrate_disks=0x0, nbdPort=0, migParams=0x7f4c5002b540, cookiein=0x0, cookieinlen=0, cookieout=0x7f4c8487e8c8, cookieoutlen=0x7f4c8487e8bc, flags=1073885723, dname=0x0, resource=0, v3proto=true) at ../../src/qemu/qemu_migration.c:5030
#15 0x00007f4c74d769e0 in qemuDomainMigratePerform3Params (dom=0x7f4c5019bfe0, dconnuri=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", params=<optimized out>, nparams=<optimized out>, cookiein=0x0, cookieinlen=0, cookieout=0x7f4c8487e8c8, cookieoutlen=0x7f4c8487e8bc, flags=1073885723) at ../../src/qemu/qemu_driver.c:12730
#16 0x00007f4c94b072e8 in virDomainMigratePerform3Params (domain=domain@entry=0x7f4c5019bfe0, dconnuri=0x7f4c5000f840 "qemu+tls://172.16.170.52/system?no_verify=1", params=0x7f4c500926d0, nparams=4, cookiein=0x0, cookieinlen=0, cookieout=0x7f4c8487e8c8, cookieoutlen=0x7f4c8487e8bc, flags=1073885723) at ../../src/libvirt-domain.c:4989
#17 0x000055b881c3fb1e in remoteDispatchDomainMigratePerform3Params (server=0x55b881dbba70, msg=0x55b881df23d0, ret=0x7f4c50054210, args=0x7f4c5019ff10, rerr=0x7f4c8487e9c0, client=<optimized out>) at ../../src/remote/remote_daemon_dispatch.c:5736
#18 remoteDispatchDomainMigratePerform3ParamsHelper (server=0x55b881dbba70, client=<optimized out>, msg=0x55b881df23d0, rerr=0x7f4c8487e9c0, args=0x7f4c5019ff10, ret=0x7f4c50054210) at ./remote/remote_daemon_dispatch_stubs.h:8805
#19 0x00007f4c94aa242d in virNetServerProgramDispatchCall (msg=0x55b881df23d0, client=0x55b881e0f740, server=0x55b881dbba70, prog=0x55b881dc8750) at ../../src/rpc/virnetserverprogram.c:430
#20 virNetServerProgramDispatch (prog=0x55b881dc8750, server=server@entry=0x55b881dbba70, client=0x55b881e0f740, msg=0x55b881df23d0) at ../../src/rpc/virnetserverprogram.c:302
#21 0x00007f4c94aa73c2 in virNetServerProcessMsg (msg=<optimized out>, prog=<optimized out>, client=<optimized out>, srv=0x55b881dbba70) at ../../src/rpc/virnetserver.c:137
#22 virNetServerHandleJob (jobOpaque=0x55b881de8140, opaque=0x55b881dbba70) at ../../src/rpc/virnetserver.c:154
#23 0x00007f4c949bbf80 in virThreadPoolWorker (opaque=<optimized out>) at ../../src/util/virthreadpool.c:163
#24 0x00007f4c949bb5b7 in virThreadHelper (data=<optimized out>) at ../../src/util/virthread.c:233
#25 0x00007f4c93dfbf1b in start_thread (arg=0x7f4c8487f700) at pthread_create.c:486
#26 0x00007f4c93d131a0 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:98

Meanwhile, on the destination side, libvirt shows a "paused" VM:

$ virsh list
 Id   Name                                   State
-----------------------------------------------------
 31   fdecd242-f278-4308-8c3b-46e144e55f63   paused

There, libvirt is stuck issuing the QMP command "query-status", which it uses to check the monitor before killing the VM; the backtrace follows below.
The piece of code is:

qemuMigrationDstFinish:

    if (retcode != 0) {
        /* Check for a possible error on the monitor in case Finish was called
         * earlier than monitor EOF handler got a chance to process the error */
        qemuDomainCheckMonitor(driver, vm, QEMU_ASYNC_JOB_MIGRATION_IN);
        goto endjob;
    }

Thread 2 (Thread 0x7f1161c6c700 (LWP 3244)):
#0  0x00007f116f9eba0c in futex_wait_cancelable (private=<optimized out>, expected=0, futex_word=0x7f1138068550) at ../sysdeps/unix/sysv/linux/futex-internal.h:88
#1  __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f1138068500, cond=0x7f1138068528) at pthread_cond_wait.c:508
#2  __pthread_cond_wait (cond=cond@entry=0x7f1138068528, mutex=mutex@entry=0x7f1138068500) at pthread_cond_wait.c:638
#3  0x00007f11705a5476 in virCondWait (c=c@entry=0x7f1138068528, m=m@entry=0x7f1138068500) at ../../src/util/virthread.c:148
#4  0x00007f116013fbfc in qemuMonitorSend (mon=mon@entry=0x7f11380684f0, msg=msg@entry=0x7f1161c6b600) at ../../src/qemu/qemu_monitor.c:953
#5  0x00007f116014fde5 in qemuMonitorJSONCommandWithFd (mon=mon@entry=0x7f11380684f0, cmd=cmd@entry=0x7f115c0512e0, scm_fd=scm_fd@entry=-1, reply=reply@entry=0x7f1161c6b680) at ../../src/qemu/qemu_monitor_json.c:358
#6  0x00007f1160152025 in qemuMonitorJSONCommand (reply=0x7f1161c6b680, cmd=0x7f115c0512e0, mon=0x7f11380684f0) at ../../src/qemu/qemu_monitor_json.c:383
#7  qemuMonitorJSONGetStatus (mon=0x7f11380684f0, running=0x7f1161c6b6c7, reason=0x0) at ../../src/qemu/qemu_monitor_json.c:1740
#8  0x00007f1160141a80 in qemuMonitorCheck (mon=<optimized out>) at ../../src/qemu/qemu_monitor.c:1633
#9  0x00007f11600f0d87 in qemuDomainCheckMonitor (driver=driver@entry=0x7f11141273b0, vm=vm@entry=0x7f1138135920, asyncJob=asyncJob@entry=QEMU_ASYNC_JOB_MIGRATION_IN) at ../../src/qemu/qemu_domain.c:14393
#10 0x00007f1160133d18 in qemuMigrationDstFinish (driver=driver@entry=0x7f11141273b0, dconn=dconn@entry=0x7f1134012000, vm=<optimized out>, cookiein=cookiein@entry=0x0, cookieinlen=cookieinlen@entry=0, cookieout=cookieout@entry=0x7f1161c6b8f0, cookieoutlen=0x7f1161c6b8e4, flags=131611, retcode=1, v3proto=true) at ../../src/qemu/qemu_migration.c:5211
#11 0x00007f116016a436 in qemuDomainMigrateFinish3Params (dconn=0x7f1134012000, params=0x7f115c05e9e0, nparams=4, cookiein=0x0, cookieinlen=0, cookieout=0x7f1161c6b8f0, cookieoutlen=0x7f1161c6b8e4, flags=131611, cancelled=1) at ../../src/qemu/qemu_driver.c:12827
#12 0x00007f11706f15bb in virDomainMigrateFinish3Params (dconn=<optimized out>, params=0x7f115c05e9e0, nparams=4, cookiein=0x0, cookieinlen=0, cookieout=0x7f1161c6b8f0, cookieoutlen=0x7f1161c6b8e4, flags=131611, cancelled=1) at ../../src/libvirt-domain.c:5033
#13 0x000055ffa89cf8c0 in ?? ()
#14 0x00007f117068c42d in virNetServerProgramDispatchCall (msg=0x55ffaa6a3ec0, client=0x55ffaa6b4840, server=0x55ffaa682030, prog=0x55ffaa68f760) at ../../src/rpc/virnetserverprogram.c:430
#15 virNetServerProgramDispatch (prog=0x55ffaa68f760, server=server@entry=0x55ffaa682030, client=0x55ffaa6b4840, msg=0x55ffaa6a3ec0) at ../../src/rpc/virnetserverprogram.c:302
#16 0x00007f11706913c2 in virNetServerProcessMsg (msg=<optimized out>, prog=<optimized out>, client=<optimized out>, srv=0x55ffaa682030) at ../../src/rpc/virnetserver.c:137
#17 virNetServerHandleJob (jobOpaque=0x55ffaa669af0, opaque=0x55ffaa682030) at ../../src/rpc/virnetserver.c:154
#18 0x00007f11705a5f80 in virThreadPoolWorker (opaque=<optimized out>) at ../../src/util/virthreadpool.c:163
#19 0x00007f11705a55b7 in virThreadHelper (data=<optimized out>) at ../../src/util/virthread.c:233
#20 0x00007f116f9e5f1b in start_thread (arg=0x7f1161c6c700) at pthread_create.c:486
#21 0x00007f116f8fd1a0 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:98

Thread 1 (Thread 0x7f116e9a1580 (LWP 2925)):
#0  0x00007f116f8f26c9 in __GI___poll (fds=0x55ffaa65f130, nfds=14, timeout=4982) at ../sysdeps/unix/sysv/linux/poll.c:29
#1  0x00007f116fa83379 in ?? () from /lib64/libglib-2.0.so.0
#2  0x00007f116fa8348c in g_main_context_iteration () from /lib64/libglib-2.0.so.0
#3  0x00007f117054cdc0 in virEventGLibRunOnce () at ../../src/util/vireventglib.c:533
#4  0x00007f117054c085 in virEventRunDefaultImpl () at ../../src/util/virevent.c:344
#5  0x00007f1170690bcd in virNetDaemonRun (dmn=0x55ffaa680d60) at ../../src/rpc/virnetdaemon.c:852
#6  0x000055ffa89c03bc in ?? ()
#7  0x00007f116f82ab27 in __libc_start_main (main=0x55ffa89be930, argc=2, argv=0x7ffe19beea78, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffe19beea68) at ../csu/libc-start.c:308
#8  0x000055ffa89c06ba in ?? ()

IMHO, the root cause of the issue is that QEMU fails to run its main loop and therefore cannot respond to QMP, which is not what we normally expect. Giving libvirt a window of time to issue a QMP command and kill the VM is the ideal solution, since it provides an automatic recovery path. I have not dug into the yank feature; perhaps it is helpful, but only as a manual intervention? After all, the two options are not mutually exclusive, I think.
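To make the blocking-read problem concrete, here is a minimal sketch using plain POSIX sockets (illustrative only, not QEMU's QIOChannel code; the helper name recv_with_timeout and the 30-second value are my own placeholders, not taken from the patch) showing how an unbounded receive blocks forever when the peer stops ACKing, and how a receive timeout bounds the wait:

/* Bound a blocking socket read so a dead peer cannot wedge the
 * caller forever. Plain POSIX sockets for illustration; QEMU's
 * migration channels use QIOChannel and the real fix may differ. */
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>

static ssize_t recv_with_timeout(int fd, void *buf, size_t len)
{
    /* 30 seconds is an arbitrary placeholder value */
    struct timeval tv = { .tv_sec = 30, .tv_usec = 0 };

    if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
        return -1;

    ssize_t n = recv(fd, buf, len, 0);
    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /* Without the timeout, this recv() would block indefinitely
         * when the peer stops ACKing; with it, control returns so the
         * caller can fail the channel and the main loop can service
         * QMP again. */
        fprintf(stderr, "receive timed out, failing the channel\n");
    }
    return n;
}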
>
> Best regards,
> Lukas Straub
>

Thanks,
Yong

--
Best regards