https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=b091b47b9e569389ad68bf28274c88d22be38ca5

commit b091b47b9e569389ad68bf28274c88d22be38ca5
Author:     Jeremy Drake <cyg...@jdrake.com>
AuthorDate: Tue Nov 19 11:06:42 2024 -0800
Commit:     Corinna Vinschen <cori...@vinschen.de>
CommitDate: Wed Nov 20 11:16:24 2024 +0100

    cygthread: suspend thread before terminating.
    
    This addresses an extremely difficult to debug deadlock when running
    under emulation on ARM64.
    
    A relatively easy way to trigger this bug is to call `fork()`, then within 
the
    child process immediately call another `fork()` and then `exit()` the
    intermediate process.
    
    It would seem that there is a "code emulation" lock on the wait thread at
    this stage, and if the thread is terminated too early, that lock still 
exists
    albeit without a thread, and nothing moves forward.
    
    It seems that a `SuspendThread()` combined with a `GetThreadContext()`
    (to force the thread to _actually_ be suspended, for more details see
    https://devblogs.microsoft.com/oldnewthing/20150205-00/?p=44743)
    makes sure the thread is "booted" from emulation before it is suspended.
    
    Hopefully this means it won't be holding any locks or otherwise leave
    emulation in a bad state when the thread is terminated.
    
    Also, attempt to use `CancelSynchonousIo()` (as seen in `flock.cc`) to avoid
    the need for `TerminateThread()` altogether.  This doesn't always work,
    however, so was not a complete fix for the deadlock issue.
    
    Addresses: 
https://cygwin.com/pipermail/cygwin-developers/2024-May/012694.html
    Signed-off-by: Jeremy Drake <cyg...@jdrake.com>

Diff:
---
 winsup/cygwin/cygthread.cc | 14 ++++++++++++++
 winsup/cygwin/pinfo.cc     | 10 +++++++---
 winsup/cygwin/sigproc.cc   | 12 ++++++++++--
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/winsup/cygwin/cygthread.cc b/winsup/cygwin/cygthread.cc
index 54918e767784..4f160975311b 100644
--- a/winsup/cygwin/cygthread.cc
+++ b/winsup/cygwin/cygthread.cc
@@ -302,6 +302,20 @@ cygthread::terminate_thread ()
   if (!inuse)
     goto force_notterminated;
 
+  if (_my_tls._ctinfo != this)
+    {
+      CONTEXT context;
+      context.ContextFlags = CONTEXT_CONTROL;
+      /* SuspendThread makes sure a thread is "booted" from emulation before
+        it is suspended.  As such, the emulator hopefully won't be in a bad
+        state (aka, holding any locks) when the thread is terminated. */
+      SuspendThread (h);
+      /* We need to call GetThreadContext, even though we don't care about the
+        context, because SuspendThread is asynchronous and GetThreadContext
+        will make sure the thread is *really* suspended before returning */
+      GetThreadContext (h, &context);
+    }
+
   TerminateThread (h, 0);
   WaitForSingleObject (h, INFINITE);
   CloseHandle (h);
diff --git a/winsup/cygwin/pinfo.cc b/winsup/cygwin/pinfo.cc
index e31a67d8f412..f1db71595181 100644
--- a/winsup/cygwin/pinfo.cc
+++ b/winsup/cygwin/pinfo.cc
@@ -1252,13 +1252,17 @@ proc_waiter (void *arg)
 
   for (;;)
     {
-      DWORD nb;
+      DWORD nb, err;
       char buf = '\0';
 
       if (!ReadFile (vchild.rd_proc_pipe, &buf, 1, &nb, NULL)
-         && GetLastError () != ERROR_BROKEN_PIPE)
+         && (err = GetLastError ()) != ERROR_BROKEN_PIPE)
        {
-         system_printf ("error on read of child wait pipe %p, %E", 
vchild.rd_proc_pipe);
+         /* ERROR_OPERATION_ABORTED is expected due to the possibility that
+            CancelSynchronousIo interruped the ReadFile call, so don't output
+            that error */
+         if (err != ERROR_OPERATION_ABORTED)
+           system_printf ("error on read of child wait pipe %p, %E", 
vchild.rd_proc_pipe);
          break;
        }
 
diff --git a/winsup/cygwin/sigproc.cc b/winsup/cygwin/sigproc.cc
index 81b6c316956a..9e20ae6f71c0 100644
--- a/winsup/cygwin/sigproc.cc
+++ b/winsup/cygwin/sigproc.cc
@@ -409,7 +409,11 @@ proc_terminate ()
             to 1 iff it is a Cygwin process.  */
          if (!have_execed || !have_execed_cygwin)
            chld_procs[i]->ppid = 1;
-         if (chld_procs[i].wait_thread)
+         /* Attempt to exit the wait_thread cleanly via CancelSynchronousIo
+            before falling back to the (explicitly dangerous) cross-thread
+            termination */
+         if (chld_procs[i].wait_thread
+             && !CancelSynchronousIo (chld_procs[i].wait_thread->thread_handle 
()))
            chld_procs[i].wait_thread->terminate_thread ();
          /* Release memory associated with this process unless it is 'myself'.
             'myself' is only in the chld_procs table when we've execed.  We
@@ -1174,7 +1178,11 @@ remove_proc (int ci)
 {
   if (have_execed)
     {
-      if (_my_tls._ctinfo != chld_procs[ci].wait_thread)
+      /* Attempt to exit the wait_thread cleanly via CancelSynchronousIo
+        before falling back to the (explicitly dangerous) cross-thread
+        termination */
+      if (_my_tls._ctinfo != chld_procs[ci].wait_thread
+         && !CancelSynchronousIo (chld_procs[ci].wait_thread->thread_handle 
()))
        chld_procs[ci].wait_thread->terminate_thread ();
     }
   else if (chld_procs[ci] && chld_procs[ci]->exists ())

Reply via email to