github-actions[bot] commented on code in PR #64039:
URL: https://github.com/apache/doris/pull/64039#discussion_r3351226527
##########
be/src/udf/python/python_udf_runtime.cpp:
##########
@@ -47,29 +79,51 @@ void PythonUDFProcess::remove_unix_socket() {
void PythonUDFProcess::shutdown() {
if (!_child.valid() || _is_shutdown) return;
- _child.terminate();
- bool graceful = false;
- constexpr std::chrono::milliseconds retry_interval(100); // 100ms
-
- for (int i = 0; i < TERMINATE_RETRY_TIMES; ++i) {
- if (!_child.running()) {
- graceful = true;
- break;
- }
- std::this_thread::sleep_for(retry_interval);
+ constexpr std::chrono::milliseconds terminate_timeout(1000);
+ int exit_status = 0;
+ bool exited = !_child.running();
+ bool status_available = false;
+ bool already_reaped = false;
+ if (!exited) {
+ ::kill(_child_pid, SIGTERM);
+ auto wait_result = wait_child_exit(_child_pid, terminate_timeout,
&exit_status);
+ exited = wait_result == ChildExitWaitResult::EXITED ||
+ wait_result == ChildExitWaitResult::ALREADY_REAPED;
+ status_available = wait_result == ChildExitWaitResult::EXITED;
+ already_reaped = wait_result == ChildExitWaitResult::ALREADY_REAPED;
+ } else {
+ auto wait_result = wait_child_exit(_child_pid,
std::chrono::milliseconds(0), &exit_status);
+ status_available = wait_result == ChildExitWaitResult::EXITED;
+ already_reaped = wait_result == ChildExitWaitResult::ALREADY_REAPED;
}
- if (!graceful) {
- LOG(WARNING) << "Python process did not terminate gracefully, sending
SIGKILL";
+ if (!exited) {
+ LOG(WARNING) << "Python process did not terminate gracefully, sending
SIGKILL, pid="
+ << _child_pid;
::kill(_child_pid, SIGKILL);
- _child.wait();
+ auto wait_result = wait_child_exit(_child_pid, terminate_timeout,
&exit_status);
+ exited = wait_result == ChildExitWaitResult::EXITED ||
+ wait_result == ChildExitWaitResult::ALREADY_REAPED;
+ status_available = wait_result == ChildExitWaitResult::EXITED;
+ already_reaped = wait_result == ChildExitWaitResult::ALREADY_REAPED;
}
-
- if (int exit_code = _child.exit_code(); exit_code > 128 && exit_code <=
255) {
- int signal = exit_code - 128;
- LOG(INFO) << "Python process was killed by signal " << signal;
Review Comment:
Detaching here after both the SIGTERM and SIGKILL waits time out leaves no owner
that will ever reap this pid. A concrete path is a Python process stuck in
uninterruptible I/O for longer than `terminate_timeout`: the SIGKILL stays
pending until the blocking call returns, so `shutdown()` sends SIGKILL,
`wait_child_exit()` returns `TIMEOUT`, `_child.detach()` drops boost's handle,
and `_is_shutdown` is set. If the process exits later, it remains a zombie
child of the Doris process because neither `PythonServerManager` nor a
background reaper tracks the pid. Please keep a reaping owner, for example by
enqueueing the pid to a bounded background reaper, or avoid detaching/marking
shutdown until the child has actually been reaped.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]