This is an automated email from the ASF dual-hosted git repository.

masaori335 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c000b304d traffic_crashlog: fix false-positive crash logs on clean 
shutdown (#13296)
3c000b304d is described below

commit 3c000b304dea10dac2b7fb66c08e09a61c72e8dc
Author: Masaori Koshiba <[email protected]>
AuthorDate: Fri Jun 19 07:23:52 2026 +0900

    traffic_crashlog: fix false-positive crash logs on clean shutdown (#13296)
    
    * traffic_crashlog: fix false-positive crash logs on clean shutdown
    
    The crash log helper is parked with SIGSTOP and armed with
    PR_SET_PDEATHSIG=SIGCONT, so the kernel resumes it whenever
    traffic_server exits, not only on a crash. It used getppid() to tell a
    crash from a plain exit, but that is racy: PR_SET_PDEATHSIG fires on the
    forking thread's death, which can occur while the parent pid is still
    alive, so a normal SIGTERM shutdown could emit an empty crash log.
    
    Replace the getppid() guess with a socket handshake: a real crash writes
    the signal number to the helper, while any other exit closes the socket
    so the helper reads EOF and exits quietly. FD_CLOEXEC keeps that EOF
    reliable against later fork+exec children.
    
    * Address Copilot's comment
---
 src/traffic_crashlog/traffic_crashlog.cc           | 21 +++++--
 src/traffic_server/Crash.cc                        |  9 +++
 .../shutdown/crashlog_no_false_positive.test.py    | 73 ++++++++++++++++++++++
 3 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/src/traffic_crashlog/traffic_crashlog.cc 
b/src/traffic_crashlog/traffic_crashlog.cc
index e31c0d40bf..dd77893ffa 100644
--- a/src/traffic_crashlog/traffic_crashlog.cc
+++ b/src/traffic_crashlog/traffic_crashlog.cc
@@ -201,7 +201,6 @@ main(int /* argc ATS_UNUSED */, const char **argv)
   FILE           *fp;
   char           *logname;
   crashlog_target target;
-  pid_t           parent = getppid();
 
   DiagsPtr::set(new Diags("traffic_crashlog", "" /* tags */, "" /* actions */, 
new BaseLogFile("stderr")));
 
@@ -213,12 +212,22 @@ main(int /* argc ATS_UNUSED */, const char **argv)
   if (wait_mode) {
     EnableDeathSignal(SIGCONT);
     kill(getpid(), SIGSTOP);
-  }
 
-  // If our parent changed, then we were woken after traffic_server exited. 
There's no point trying to
-  // emit a crashlog because traffic_server is gone.
-  if (getppid() != parent) {
-    return 0;
+    // A real crash sends us a notification on this socket; any other parent 
exit just closes
+    // it, so EOF (or a short read) means there is nothing to log. getppid() 
is unreliable here:
+    // PR_SET_PDEATHSIG can fire while our parent pid is still alive.
+    //
+    //
+    //
+    int     crash_signo = 0; // value unused; an int keeps the socket framing 
aligned with the writer
+    ssize_t nbytes;
+    do {
+      nbytes = read(STDIN_FILENO, &crash_signo, sizeof(crash_signo));
+    } while (nbytes < 0 && errno == EINTR);
+
+    if (nbytes != static_cast<ssize_t>(sizeof(crash_signo))) {
+      return 0;
+    }
   }
 
   runroot_handler(argv);
diff --git a/src/traffic_server/Crash.cc b/src/traffic_server/Crash.cc
index ff5b5e29b9..bb5f703106 100644
--- a/src/traffic_server/Crash.cc
+++ b/src/traffic_server/Crash.cc
@@ -30,6 +30,7 @@
 
 #include <string>
 #include <unistd.h>
+#include <fcntl.h>
 #if defined(__linux__)
 #include <sys/prctl.h>
 #include <sys/syscall.h>
@@ -146,6 +147,10 @@ crash_logger_init(const char *user)
   crash_logger_pid = child;
   crash_logger_fd  = pipe[0];
 
+  // Don't leak this socket into later fork+exec children; the logger relies 
on seeing EOF
+  // here when we (its parent) exit.
+  fcntl(crash_logger_fd, F_SETFD, FD_CLOEXEC);
+
 #if defined(__linux__)
   // Allow the crash logger to ptrace us. Without this, Yama's ptrace_scope=1
   // (the default on many distros) prevents a child process from tracing its 
parent.
@@ -174,6 +179,10 @@ crash_logger_invoke(int signo, siginfo_t *info, void *ctx)
     // Let the crash logger free ...
     kill(crash_logger_pid, SIGCONT);
 
+    // Notify the logger that this is a real crash; a plain exit closes the 
socket (EOF)
+    // instead. Written on every platform, ahead of the Linux-only thread 
payload below.
+    ATS_UNUSED_RETURN(write(crash_logger_fd, &signo, sizeof(signo)));
+
 #if defined(__linux__)
     // Write the crashing thread information to the crash logger. While the 
siginfo_t is blesses by POSIX, the
     // ucontext_t can contain pointers, so it's highly platform dependent. On 
Linux with glibc, however, it is
diff --git a/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py 
b/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py
new file mode 100644
index 0000000000..0feb322f5e
--- /dev/null
+++ b/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py
@@ -0,0 +1,73 @@
+'''
+Verify that a normal (SIGTERM) shutdown does not spuriously fire 
traffic_crashlog.
+'''
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import os
+
+Test.Summary = '''
+Verify that gracefully stopping traffic_server (SIGTERM) does not cause the 
crash
+log helper (traffic_crashlog) to emit a bogus crash log.
+
+Regression test: the crash log helper is parked with SIGSTOP and arms a
+parent-death signal (PR_SET_PDEATHSIG=SIGCONT). When traffic_server exits
+normally -- e.g. a routine SIGTERM shutdown -- the kernel resumes the helper 
via
+that death signal. The helper used to decide "did I wake because of a real 
crash
+or a plain exit?" with a racy getppid() check, and would lose the race and 
write
+an empty crash log against the already-exiting process. The fix replaces that
+guess with a handshake: crash_logger_invoke() writes the signal number to the
+helper, and a plain exit closes the socket so the helper reads EOF and stays
+quiet.
+
+Note: PR_SET_PDEATHSIG is a no-op on Darwin, so this false positive only occurs
+on Linux. On other platforms this test still confirms that the helper is armed
+and does not emit a crash log on a clean shutdown.
+'''
+
+ts = Test.MakeATSProcess('ts')
+
+ts.Disk.records_config.update(
+    {
+        # Enable the "server" debug tag so traffic_server emits the "received 
exit signal,
+        # shutting down" DIAG line (the shutdown anchor checked below) to 
traffic.out.
+        'proxy.config.diags.debug.enabled': 1,
+        'proxy.config.diags.debug.tags': 'server',
+        # Force the crash log helper on regardless of the build's 
TS_USE_REMOTE_UNWINDING
+        # default (it is NULL on platforms without remote unwinding, e.g. 
Darwin).
+        'proxy.config.crash_log_helper': os.path.join(ts.Variables.BINDIR, 
'traffic_crashlog'),
+    })
+
+tr = Test.AddTestRun('Start traffic_server with the crash log helper armed, 
then let AuTest SIGTERM it')
+tr.Processes.Default.Command = 'printf "crash log helper armed"'
+tr.Processes.Default.ReturnCode = 0
+tr.Processes.Default.StartBefore(ts)
+tr.StillRunningAfter = ts
+
+# Anchor the scenario: confirm traffic_server actually ran the graceful 
(SIGTERM) shutdown
+# path -- the exact trigger that woke the crash log helper in production. 
Without this the
+# ExcludesExpression checks below could pass without ever exercising shutdown. 
(The crash
+# log helper itself is forced on via proxy.config.crash_log_helper above, and 
debug/DIAG
+# output is emitted to traffic.out.)
+ts.Disk.traffic_out.Content = Testers.ContainsExpression(
+    'received exit signal, shutting down', 'traffic_server should have run the 
graceful shutdown path')
+
+# The crash log helper shares traffic_server's stderr (bound to traffic.out). 
On a clean
+# shutdown it must neither wake nor write a crash log.
+ts.Disk.traffic_out.Content += Testers.ExcludesExpression(
+    'crashlog started', 'the crash log helper must not wake on a clean SIGTERM 
shutdown')
+ts.Disk.traffic_out.Content += Testers.ExcludesExpression(
+    'wrote crash log', 'a clean SIGTERM shutdown must not produce a crash log')

Reply via email to