Bernhard Voelker <[email protected]> writes:
> How to reproduce:
> $ tar xf $ATTACHMENT
> $ scriptreplay -d 6 typescript.timing typescript | timeout 25 tee .test
>
> At the end of the 'screenreplay' recording, tee(1) goes into a 100% busy loop
> until it gets killed by timeout(1).
>
> read(0, "bernhard@node52:~/osc/t/openSUSE"..., 8192) = 51
> write(3, "bernhard@node52:~/osc/t/openSUSE"..., 51) = 51
> read(0, "exit\r\n\n", 8192) = 7
> write(3, "exit\r\n\n", 7) = 7
> read(0, "", 8192) = 0
> poll([{fd=-1}, {fd=3, events=POLLOUT}], 2, -1) = 1 ([{fd=3,
> revents=POLLOUT}])
> poll([{fd=-1}, {fd=3, events=POLLOUT}], 2, -1) = 1 ([{fd=3,
> revents=POLLOUT}])
> poll([{fd=-1}, {fd=3, events=POLLOUT}], 2, -1) = 1 ([{fd=3,
> revents=POLLOUT}])
> poll([{fd=-1}, {fd=3, events=POLLOUT}], 2, -1) = 1 ([{fd=3,
> revents=POLLOUT}])
>
> The OP claims he bisected the problem to this commit:
>
> 9b855166e tee: prefer file descriptors over streams
>
> GDB shows that tee(1) does not ever get out of the endless loop in
> close_wait():
>
> #0 0x00007fa801497f9a in __internal_syscall_cancel () from /lib64/libc.so.6
> #1 0x00007fa801497fc1 in __syscall_cancel () from /lib64/libc.so.6
> #2 0x00007fa8015128ea in poll () from /lib64/libc.so.6
> #3 0x00000000004035df in poll (__fds=0x7fffceca2c70, __nfds=2,
> __timeout=<optimized out>) \
> at /usr/include/bits/poll2.h:44
> #4 iopoll_internal (fdin=fdin@entry=-1, fdout=fdout@entry=3,
> block=block@entry=true, \
> broken_output=broken_output@entry=false) \
> at src/iopoll.c:82
> #5 0x0000000000403713 in wait_for_nonblocking_write (fd=<optimized out>) \
> at src/iopoll.c:187
> #6 close_wait (fd=3) \
> at src/iopoll.c:202
> #7 0x0000000000402cdb in tee_files (nfiles=1, files=<optimized out>,
> pipe_check=false) \
> at src/tee.c:332
> #8 0x00000000004028c4 in main (argc=2, argv=0x7fffceca4e68) \
> at src/tee.c:184
>
> Note 1: for me, the issue is not reproducible every time, but I saw it as
> least
> 1 out of 20 times.
>
> Note 2: if the effect happens, then the terminal does not show the end of the
> screenreplay.
> strace shows that tee(1) is getting EAGAIN, and from then on stops outputting
> to stdout,
> and only continues to write to FD 3.
>
> read(0, "[ 26s] AutoGen: Parsing \"SRC:/"..., 8192) = 8192
> write(1, "[ 26s] AutoGen: Parsing \"SRC:/"..., 8192) = 8192
> write(3, "[ 26s] AutoGen: Parsing \"SRC:/"..., 8192) = 8192
> read(0, "de/qt6/QtDBus -I/usr/include/KF6"..., 8192) = 8192
> write(1, "de/qt6/QtDBus -I/usr/include/KF6"..., 8192) = 1661
> fcntl(1, F_GETFL) = 0x8802 (flags O_RDWR|O_NONBLOCK|O_LARGEFILE)
> write(2, "tee: ", 5) = -1 EAGAIN (Resource temporarily unavailable)
> write(2, "'standard output'", 17) = -1 EAGAIN (Resource temporarily
> unavailable)
> write(2, "\n", 1) = -1 EAGAIN (Resource temporarily unavailable)
> write(3, "de/qt6/QtDBus -I/usr/include/KF6"..., 8192) = 8192
> read(0, "p\", because it doesn't exist, fr"..., 8192) = 8192
> write(3, "p\", because it doesn't exist, fr"..., 8192) = 8192
> read(0, "L_CAST_FROM_STRING -DQT_OPENGL_L"..., 8192) = 8192
> write(3, "L_CAST_FROM_STRING -DQT_OPENGL_L"..., 8192) = 8192
On my system this case shows up much more often than 1 out of 20 times.
I think I see what is happening. The close_wait function is buggy and I
should have just used a normal close.
It was based on fclose_wait, which was the following:
extern bool
fclose_wait (FILE *f)
{
for (;;)
{
if (fflush (f) == 0)
break;
if (! fwait_for_nonblocking_write (f))
break;
}
return fclose (f) == 0;
}
The buggy close_wait function is this:
extern bool
close_wait (int fd)
{
while (wait_for_nonblocking_write (fd))
;
return close (fd) == 0;
}
In the previous case we only called wait_for_nonblocking_write if the
output stream had buffered data that needed to be written. Since we are
not using streams, we have no buffered data that needs to be written
before closing.
If the last system call the fails sets errno to EAGAIN, close_wait will
proceed to call wait_for_nonblocking_write which will poll forever.
This patch fixes it for me:
diff --git a/src/iopoll.c b/src/iopoll.c
index de20bc8d9..a48a12e12 100644
--- a/src/iopoll.c
+++ b/src/iopoll.c
@@ -194,17 +194,6 @@ wait_for_nonblocking_write (int fd)
return true;
}
-/* wrapper for close() that also waits for FD if non blocking. */
-
-extern bool
-close_wait (int fd)
-{
- while (wait_for_nonblocking_write (fd))
- ;
- return close (fd) == 0;
-}
-
-
/* wrapper for write() that also waits for FD if non blocking. */
extern bool
diff --git a/src/iopoll.h b/src/iopoll.h
index 1711fdab9..a1561b9ff 100644
--- a/src/iopoll.h
+++ b/src/iopoll.h
@@ -5,5 +5,4 @@ int iopoll (int fdin, int fdout, bool block);
bool iopoll_input_ok (int fdin);
bool iopoll_output_ok (int fdout);
-bool close_wait (int fd);
bool write_wait (int fd, void const *buffer, size_t size);
diff --git a/src/tee.c b/src/tee.c
index 32a18e340..fba6ae089 100644
--- a/src/tee.c
+++ b/src/tee.c
@@ -329,7 +329,7 @@ tee_files (int nfiles, char **files, bool pipe_check)
/* Close the files, but not standard output. */
for (int i = 1; i <= nfiles; i++)
- if (0 <= descriptors[i] && ! close_wait (descriptors[i]))
+ if (0 <= descriptors[i] && close (descriptors[i]) < 0)
{
error (0, errno, "%s", quotef (files[i]));
ok = false;
> Note 3: Regardless whether the tee(1) infloop happens or not, the terminal is
> showing
> some debris after terminating tee(1):
>
>
> :1+r6E616D65=787465726D2D323536636F6C6F72^[\^[[8;61;270t^[[61;10R^[[61;270R^[[8;61;270t^[[61;1R^[[61;270R
This one confuses me, since I also see it with coreutils-9.10. Perhaps
it is a bug that was introduced earlier than the previously discussed
commit?
> Note 4:. Interestingly, I also saw another effect in some of the runs:
> the regular output on the terminal stopped somewhere in the middle of the
> compilation section
> of the screen recording, and tee terminates with
> tee: 'standard output'
> and with $? = 130. Not sure this is the same case.
Interestingly, I haven't seen this one.
Collin