This is an automated email from the ASF dual-hosted git repository.
masaori335 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 3c000b304d traffic_crashlog: fix false-positive crash logs on clean
shutdown (#13296)
3c000b304d is described below
commit 3c000b304dea10dac2b7fb66c08e09a61c72e8dc
Author: Masaori Koshiba <[email protected]>
AuthorDate: Fri Jun 19 07:23:52 2026 +0900
traffic_crashlog: fix false-positive crash logs on clean shutdown (#13296)
* traffic_crashlog: fix false-positive crash logs on clean shutdown
The crash log helper is parked with SIGSTOP and armed with
PR_SET_PDEATHSIG=SIGCONT, so the kernel resumes it whenever
traffic_server exits, not only on a crash. It used getppid() to tell a
crash from a plain exit, but that is racy: PR_SET_PDEATHSIG fires on the
forking thread's death, which can occur while the parent pid is still
alive, so a normal SIGTERM shutdown could emit an empty crash log.
Replace the getppid() guess with a socket handshake: a real crash writes
the signal number to the helper, while any other exit closes the socket
so the helper reads EOF and exits quietly. FD_CLOEXEC keeps that EOF
reliable against later fork+exec children.
* Address Copilot's comment
---
src/traffic_crashlog/traffic_crashlog.cc | 21 +++++--
src/traffic_server/Crash.cc | 9 +++
.../shutdown/crashlog_no_false_positive.test.py | 73 ++++++++++++++++++++++
3 files changed, 97 insertions(+), 6 deletions(-)
diff --git a/src/traffic_crashlog/traffic_crashlog.cc
b/src/traffic_crashlog/traffic_crashlog.cc
index e31c0d40bf..dd77893ffa 100644
--- a/src/traffic_crashlog/traffic_crashlog.cc
+++ b/src/traffic_crashlog/traffic_crashlog.cc
@@ -201,7 +201,6 @@ main(int /* argc ATS_UNUSED */, const char **argv)
FILE *fp;
char *logname;
crashlog_target target;
- pid_t parent = getppid();
DiagsPtr::set(new Diags("traffic_crashlog", "" /* tags */, "" /* actions */,
new BaseLogFile("stderr")));
@@ -213,12 +212,22 @@ main(int /* argc ATS_UNUSED */, const char **argv)
if (wait_mode) {
EnableDeathSignal(SIGCONT);
kill(getpid(), SIGSTOP);
- }
- // If our parent changed, then we were woken after traffic_server exited.
There's no point trying to
- // emit a crashlog because traffic_server is gone.
- if (getppid() != parent) {
- return 0;
+ // A real crash sends us a notification on this socket; any other parent
exit just closes
+ // it, so EOF (or a short read) means there is nothing to log. getppid()
is unreliable here:
+ // PR_SET_PDEATHSIG can fire while our parent pid is still alive.
+ //
+ //
+ //
+ int crash_signo = 0; // value unused; an int keeps the socket framing
aligned with the writer
+ ssize_t nbytes;
+ do {
+ nbytes = read(STDIN_FILENO, &crash_signo, sizeof(crash_signo));
+ } while (nbytes < 0 && errno == EINTR);
+
+ if (nbytes != static_cast<ssize_t>(sizeof(crash_signo))) {
+ return 0;
+ }
}
runroot_handler(argv);
diff --git a/src/traffic_server/Crash.cc b/src/traffic_server/Crash.cc
index ff5b5e29b9..bb5f703106 100644
--- a/src/traffic_server/Crash.cc
+++ b/src/traffic_server/Crash.cc
@@ -30,6 +30,7 @@
#include <string>
#include <unistd.h>
+#include <fcntl.h>
#if defined(__linux__)
#include <sys/prctl.h>
#include <sys/syscall.h>
@@ -146,6 +147,10 @@ crash_logger_init(const char *user)
crash_logger_pid = child;
crash_logger_fd = pipe[0];
+ // Don't leak this socket into later fork+exec children; the logger relies
on seeing EOF
+ // here when we (its parent) exit.
+ fcntl(crash_logger_fd, F_SETFD, FD_CLOEXEC);
+
#if defined(__linux__)
// Allow the crash logger to ptrace us. Without this, Yama's ptrace_scope=1
// (the default on many distros) prevents a child process from tracing its
parent.
@@ -174,6 +179,10 @@ crash_logger_invoke(int signo, siginfo_t *info, void *ctx)
// Let the crash logger free ...
kill(crash_logger_pid, SIGCONT);
+ // Notify the logger that this is a real crash; a plain exit closes the
socket (EOF)
+ // instead. Written on every platform, ahead of the Linux-only thread
payload below.
+ ATS_UNUSED_RETURN(write(crash_logger_fd, &signo, sizeof(signo)));
+
#if defined(__linux__)
// Write the crashing thread information to the crash logger. While the
siginfo_t is blesses by POSIX, the
// ucontext_t can contain pointers, so it's highly platform dependent. On
Linux with glibc, however, it is
diff --git a/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py
b/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py
new file mode 100644
index 0000000000..0feb322f5e
--- /dev/null
+++ b/tests/gold_tests/shutdown/crashlog_no_false_positive.test.py
@@ -0,0 +1,73 @@
+'''
+Verify that a normal (SIGTERM) shutdown does not spuriously fire
traffic_crashlog.
+'''
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+Test.Summary = '''
+Verify that gracefully stopping traffic_server (SIGTERM) does not cause the
crash
+log helper (traffic_crashlog) to emit a bogus crash log.
+
+Regression test: the crash log helper is parked with SIGSTOP and arms a
+parent-death signal (PR_SET_PDEATHSIG=SIGCONT). When traffic_server exits
+normally -- e.g. a routine SIGTERM shutdown -- the kernel resumes the helper
via
+that death signal. The helper used to decide "did I wake because of a real
crash
+or a plain exit?" with a racy getppid() check, and would lose the race and
write
+an empty crash log against the already-exiting process. The fix replaces that
+guess with a handshake: crash_logger_invoke() writes the signal number to the
+helper, and a plain exit closes the socket so the helper reads EOF and stays
+quiet.
+
+Note: PR_SET_PDEATHSIG is a no-op on Darwin, so this false positive only occurs
+on Linux. On other platforms this test still confirms that the helper is armed
+and does not emit a crash log on a clean shutdown.
+'''
+
+ts = Test.MakeATSProcess('ts')
+
+ts.Disk.records_config.update(
+ {
+ # Enable the "server" debug tag so traffic_server emits the "received
exit signal,
+ # shutting down" DIAG line (the shutdown anchor checked below) to
traffic.out.
+ 'proxy.config.diags.debug.enabled': 1,
+ 'proxy.config.diags.debug.tags': 'server',
+ # Force the crash log helper on regardless of the build's
TS_USE_REMOTE_UNWINDING
+ # default (it is NULL on platforms without remote unwinding, e.g.
Darwin).
+ 'proxy.config.crash_log_helper': os.path.join(ts.Variables.BINDIR,
'traffic_crashlog'),
+ })
+
+tr = Test.AddTestRun('Start traffic_server with the crash log helper armed,
then let AuTest SIGTERM it')
+tr.Processes.Default.Command = 'printf "crash log helper armed"'
+tr.Processes.Default.ReturnCode = 0
+tr.Processes.Default.StartBefore(ts)
+tr.StillRunningAfter = ts
+
+# Anchor the scenario: confirm traffic_server actually ran the graceful
(SIGTERM) shutdown
+# path -- the exact trigger that woke the crash log helper in production.
Without this the
+# ExcludesExpression checks below could pass without ever exercising shutdown.
(The crash
+# log helper itself is forced on via proxy.config.crash_log_helper above, and
debug/DIAG
+# output is emitted to traffic.out.)
+ts.Disk.traffic_out.Content = Testers.ContainsExpression(
+ 'received exit signal, shutting down', 'traffic_server should have run the
graceful shutdown path')
+
+# The crash log helper shares traffic_server's stderr (bound to traffic.out).
On a clean
+# shutdown it must neither wake nor write a crash log.
+ts.Disk.traffic_out.Content += Testers.ExcludesExpression(
+ 'crashlog started', 'the crash log helper must not wake on a clean SIGTERM
shutdown')
+ts.Disk.traffic_out.Content += Testers.ExcludesExpression(
+ 'wrote crash log', 'a clean SIGTERM shutdown must not produce a crash log')