On Mon, Jan 17, 2022 at 5:05 AM Kevin Wolf <kw...@redhat.com> wrote: > > Am 10.01.2022 um 16:55 hat Peter Maydell geschrieben: > > Just saw this failure of iotests in a netbsd VM (the in-tree > > tests/vm stuff). Pretty sure it's an intermittent as the > > pulreq being tested has nothing io or block related. > > > > > > TEST iotest-qcow2: 036 > > TEST iotest-qcow2: 037 > > TEST iotest-qcow2: 038 [not run] > > TEST iotest-qcow2: 039 [not run] > > TEST iotest-qcow2: 040 [fail] > > QEMU -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-system-aarch64" > > -nodefaults -display none -accel qtest -machine > > virt > > QEMU_IMG -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-img" > > QEMU_IO -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-io" > > --cache writeback --aio threads -f qcow2 > > QEMU_NBD -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-nbd" > > IMGFMT -- qcow2 > > IMGPROTO -- file > > PLATFORM -- NetBSD/amd64 localhost 9.2 > > TEST_DIR -- > > /home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/scratch > > SOCK_DIR -- /tmp/tmpuniuicbi > > GDB_OPTIONS -- > > VALGRIND_QEMU -- > > PRINT_QEMU_OUTPUT -- > > > > --- /home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/040.out > > fcntl(): Invalid argument > > +++ 040.out.bad > > @@ -1,5 +1,30 @@ > > -................................................................. > > +............................................................ERROR:qemu.aqmp.qmp_client.qemu-7648:Failed > > to establish connection: concurrent.futures._base.CancelledError > > +E.... > > +====================================================================== > > +ERROR: test_top_is_default_active (__main__.TestSingleDrive) > > +---------------------------------------------------------------------- > > +Traceback (most recent call last): > > + File "/home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/040", line > > 94, in setUp > > + self.vm.launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 399, in launch > > + self._launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 434, in _launch > > + self._post_launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/qtest.py", > > line 147, in _post_launch > > + super()._post_launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 340, in _post_launch > > + self._qmp.accept(self._qmp_timer) > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/aqmp/legacy.py", > > line 69, in accept > > + timeout > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/aqmp/legacy.py", > > line 42, in _sync > > + asyncio.wait_for(future, timeout=timeout) > > + File "/usr/pkg/lib/python3.7/asyncio/base_events.py", line 587, in > > run_until_complete > > + return future.result() > > + File "/usr/pkg/lib/python3.7/asyncio/tasks.py", line 449, in wait_for > > + raise futures.TimeoutError() > > +concurrent.futures._base.TimeoutError > > + > > ---------------------------------------------------------------------- > > Ran 65 tests > >
This trace says that we timed out while awaiting a connection from QEMU during the VM launch phase. i.e. python/qemu/qmp/machine.py line 340: def _post_launch(self) -> None: if self._qmp_connection: self._qmp.accept(self._qmp_timer) <-- we timed out here. (Note to self: make this traceback look more obvious as to what was canceled and why. I think I can improve readability here a bit ...) Sky's the limit on why QEMU never connected to the socket, but: > > --- /home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/040.out > > fcntl(): Invalid argument That looks fairly suspicious, and I don't know which process was responsible for printing it (or when, relative to the other outputs). I assume you don't see any such output like this on a good run. > > -OK > > +FAILED (errors=1) > > TEST iotest-qcow2: 041 [fail] > > QEMU -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-system-aarch64" > > -nodefaults -display none -accel qtest -machine virt > > QEMU_IMG -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-img" > > QEMU_IO -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-io" > > --cache writeback --aio threads -f qcow2 > > QEMU_NBD -- > > "/home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/../../qemu-nbd" > > IMGFMT -- qcow2 > > IMGPROTO -- file > > PLATFORM -- NetBSD/amd64 localhost 9.2 > > TEST_DIR -- > > /home/qemu/qemu-test.MPWquy/build/tests/qemu-iotests/scratch > > SOCK_DIR -- /tmp/tmpuniuicbi > > GDB_OPTIONS -- > > VALGRIND_QEMU -- > > PRINT_QEMU_OUTPUT -- > > > > --- /home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/041.out > > +++ 041.out.bad > > @@ -1,5 +1,32 @@ > > -........................................................................................................... > > +..........................................ERROR:qemu.aqmp.qmp_client.qemu-15252:Failed > > to establish connection: concurrent.futures._base.CancelledError > > +E................................................................ > > +====================================================================== > > +ERROR: test_small_buffer (__main__.TestSingleBlockdev) > > +---------------------------------------------------------------------- > > +Traceback (most recent call last): > > + File "/home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/041", line > > 233, in setUp > > + TestSingleDrive.setUp(self) > > + File "/home/qemu/qemu-test.MPWquy/src/tests/qemu-iotests/041", line > > 54, in setUp > > + self.vm.launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 399, in launch > > + self._launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 434, in _launch > > + self._post_launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/qtest.py", > > line 147, in _post_launch > > + super()._post_launch() > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/machine/machine.py", > > line 340, in _post_launch > > + self._qmp.accept(self._qmp_timer) > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/aqmp/legacy.py", > > line 69, in accept > > + timeout > > + File "/home/qemu/qemu-test.MPWquy/src/python/qemu/aqmp/legacy.py", > > line 42, in _sync > > + asyncio.wait_for(future, timeout=timeout) > > + File "/usr/pkg/lib/python3.7/asyncio/base_events.py", line 587, in > > run_until_complete > > + return future.result() > > + File "/usr/pkg/lib/python3.7/asyncio/tasks.py", line 449, in wait_for > > + raise futures.TimeoutError() > > +concurrent.futures._base.TimeoutError Same problem here, except I don't see any output from QEMU to blame. As far as the Python code knows, it just never got a connection on the socket, so it timed out and died. I do expect this to print more information on failure than it currently is, though (bug somewhere in machine.py, I think). Can you please try applying this temporary patch and running `./check -qcow2 040 041` until you see a breakage and show me the output from that? diff --git a/python/qemu/machine/machine.py b/python/qemu/machine/machine.py index 67ab06ca2b..ca49e6fcd2 100644 --- a/python/qemu/machine/machine.py +++ b/python/qemu/machine/machine.py @@ -403,16 +403,19 @@ def launch(self) -> None: # Assume the VM didn't launch or is exiting. # If we don't wait for the process, exitcode() may still be # 'None' by the time control is ceded back to the caller. + LOG.error('Error launching VM') if self._launched: + LOG.error('Process was forked, waiting on it') self.wait() else: + LOG.error('Process was not forked, just cleaning up') self._post_shutdown() - LOG.debug('Error launching VM') if self._qemu_full_args: - LOG.debug('Command: %r', ' '.join(self._qemu_full_args)) + LOG.error('Command: %r', ' '.join(self._qemu_full_args)) if self._iolog: - LOG.debug('Output: %r', self._iolog) + LOG.error('Output: %r', self._iolog) + LOG.error('exitcode: %s', str(self.exitcode())) raise def _launch(self) -> None: > > + > > ---------------------------------------------------------------------- > > Ran 107 tests > > > > -OK > > +FAILED (errors=1) > > TEST iotest-qcow2: 042 > > TEST iotest-qcow2: 043 > > [...] > > Not run: 005 013 018 019 024 034 038 039 048 060 061 074 079 080 086 > > 097 099 108 114 137 138 140 141 150 154 161 172 176 179 181 184 186 > > 192 203 220 226 229 244 249 251 252 265 267 271 287 290 292 313 > > nbd-qemu-allocation > > Failures: 040 041 > > Failed 2 of 74 iotests > > > > > > Does it look familiar ? > > Not to me, but since there is a lot of AQMP in the trace, which is > relatively new, and 040 and 041 haven't changed in quite a while, let me > CC John to have a look. > > Previously, with the old synchronous QMP library, I think timeout while > connecting often meant that there was an error in the command line, so > QEMU failed to start up and only printed an error message. But that > doesn't make sense for an intermittent failure, and even less for a test > that hasn't changed. > > Kevin >